diff --git a/cmake/OpenCVDetectOpenCL.cmake b/cmake/OpenCVDetectOpenCL.cmake
index 2c96274a8c..c96df82e2b 100644
--- a/cmake/OpenCVDetectOpenCL.cmake
+++ b/cmake/OpenCVDetectOpenCL.cmake
@@ -20,10 +20,24 @@ else(APPLE)
               DOC "OpenCL include directory"
               NO_DEFAULT_PATH)
 
-    if (X86_64)
-      set(OPENCL_POSSIBLE_LIB_SUFFIXES lib/Win64 lib/x86_64 lib/x64)
-    elseif (X86)
-      set(OPENCL_POSSIBLE_LIB_SUFFIXES lib/Win32 lib/x86)
+    if(WIN32)  # Windows: per-architecture subdirectories under lib/
+      if(X86_64)  # 64-bit x86
+        set(OPENCL_POSSIBLE_LIB_SUFFIXES lib/Win64 lib/x86_64 lib/x64)
+      elseif(X86)  # 32-bit x86
+        set(OPENCL_POSSIBLE_LIB_SUFFIXES lib/Win32 lib/x86)
+      else()  # any other architecture: plain lib/ only
+        set(OPENCL_POSSIBLE_LIB_SUFFIXES lib)
+      endif()
+    elseif(UNIX)  # UNIX: width-specific directory first, then plain lib/
+      if(X86_64)  # 64-bit x86
+        set(OPENCL_POSSIBLE_LIB_SUFFIXES lib64 lib)
+      elseif(X86)  # 32-bit x86
+        set(OPENCL_POSSIBLE_LIB_SUFFIXES lib32 lib)
+      else()  # any other architecture: plain lib/ only
+        set(OPENCL_POSSIBLE_LIB_SUFFIXES lib)
+      endif()
+    else()  # neither WIN32 nor UNIX: plain lib/ only
+      set(OPENCL_POSSIBLE_LIB_SUFFIXES lib)
     endif()
 
     find_library(OPENCL_LIBRARY
diff --git a/doc/tutorials/ml/non_linear_svms/non_linear_svms.rst b/doc/tutorials/ml/non_linear_svms/non_linear_svms.rst
index 8fbcc563a7..57e0b1b6ea 100644
--- a/doc/tutorials/ml/non_linear_svms/non_linear_svms.rst
+++ b/doc/tutorials/ml/non_linear_svms/non_linear_svms.rst
@@ -1 +1,232 @@
-.. _nonLinearSvmS: Support Vector Machines for Non-Linearly Separable Data*******************************************************Goal====In this tutorial you will learn how to:.. container:: enumeratevisibleitemswithsquare  + Define the optimization problem for SVMs when it is not possible to separate linearly the training data.  + How to configure the parameters in :svms:`CvSVMParams <cvsvmparams>` to adapt your SVM for this class of problems.Motivation==========Why is it interesting to extend the SVM optimation problem in order to handle non-linearly separable training data? Most of the applications in which SVMs are used in computer vision require a more powerful tool than a simple linear classifier. This stems from the fact that in these tasks **the training data can be rarely separated using an hyperplane**.Consider one of these tasks, for example, face detection. The training data in this case is composed by a set of images that are faces and another set of images that are non-faces (*every other thing in the world except from faces*). This training data is too complex so as to find a representation of each sample (*feature vector*) that could make the whole set of faces linearly separable from the whole set of non-faces.Extension of the Optimization Problem=====================================Remember that using SVMs we obtain a separating hyperplane. Therefore, since the training data is now non-linearly separable, we must admit that the hyperplane found will misclassify some of the samples. This *misclassification* is a new variable in the optimization that must be taken into account. The new model has to include both the old requirement of finding the hyperplane that gives the biggest margin and the new one of generalizing the training data correctly by not allowing too many classification errors.  
We start here from the formulation of the optimization problem of finding the hyperplane which maximizes the **margin** (this is explained in the :ref:`previous tutorial <introductiontosvms>`):.. math::  \min_{\beta, \beta_{0}} L(\beta) = \frac{1}{2}||\beta||^{2} \text{ subject to } y_{i}(\beta^{T} x_{i} + \beta_{0}) \geq 1 \text{ } \forall iThere are multiple ways in which this model can be modified so it takes into account the misclassification errors. For example, one could think of minimizing the same quantity plus a constant times the number of misclassification errors in the training data, i.e.:.. math::  \min ||\beta||^{2} + C \text{(\# misclassication errors)}However, this one is not a very good solution since, among some other reasons, we do not distinguish between samples that are misclassified with a small distance to their appropriate decision region or samples that are not. Therefore, a better solution will take into account the *distance of the misclassified samples to their correct decision regions*, i.e.:.. math::  \min ||\beta||^{2} + C \text{(distance of misclassified samples to their correct regions)}For each sample of the training data a new parameter :math:`\xi_{i}` is defined. Each one of these parameters contains the distance from its corresponding training sample to their correct decision region. The following picture shows non-linearly separable training data from two classes, a separating hyperplane and the distances to their correct regions of the samples that are misclassified... image:: images/sample-errors-dist.png   :alt: Samples misclassified and their distances to their correct regions   :align: center .. note:: Only the distances of the samples that are misclassified are shown in the picture. The distances of the rest of the samples are zero since they lay already in their correct decision region.The red and blue lines that appear on the picture are the margins to each one of the decision regions. 
It is very **important** to realize that each of the :math:`\xi_{i}` goes from a misclassified training sample to the margin of its appropriate region.Finally, the new formulation for the optimization problem is:.. math::  \min_{\beta, \beta_{0}} L(\beta) = ||\beta||^{2} + C \sum_{i} {\xi_{i}} \text{ subject to } y_{i}(\beta^{T} x_{i} + \beta_{0}) \geq 1 - \xi_{i} \text{ and } \xi_{i} \geq 0 \text{ } \forall i How should the parameter C be chosen? It is obvious that the answer to this question depends on how the training data is distributed. Although there is no general answer, it is useful to take into account these rules:.. container:: enumeratevisibleitemswithsquare   * Large values of C give solutions with *less misclassification errors* but a *smaller margin*. Consider that in this case it is expensive to make misclassification errors. Since the aim of the optimization is to minimize the argument, few misclassifications errors are allowed.   * Small values of C give solutions with *bigger margin* and *more classification errors*. In this case the minimization does not consider that much the term of the sum so it focuses more on finding a hyperplane with big margin.Source Code===========You may also find the source code and these video file in the :file:`samples/cpp/tutorial_code/gpu/non_linear_svms/non_linear_svms` folder of the OpenCV source library or :download:`download it from here <../../../../samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp>`... literalinclude:: ../../../../samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp   :language: cpp   :linenos:   :tab-width: 4   :lines: 1-11, 22-23, 26-Explanation===========1. **Set up the training data**  The training data of this exercise is formed by a set of labeled 2D-points that belong to one of two different classes. To make the exercise more appealing, the training data is generated randomly using a uniform probability density functions (PDFs).
  We have divided the generation of the training data into two main parts.  In the first part we generate data for both classes that is linearly separable.  .. code-block:: cpp     // Generate random points for the class 1     Mat trainClass = trainData.rowRange(0, nLinearSamples);     // The x coordinate of the points is in [0, 0.4)     Mat c = trainClass.colRange(0, 1);     rng.fill(c, RNG::UNIFORM, Scalar(1), Scalar(0.4 * WIDTH));     // The y coordinate of the points is in [0, 1)     c = trainClass.colRange(1,2);     rng.fill(c, RNG::UNIFORM, Scalar(1), Scalar(HEIGHT));     // Generate random points for the class 2     trainClass = trainData.rowRange(2*NTRAINING_SAMPLES-nLinearSamples, 2*NTRAINING_SAMPLES);     // The x coordinate of the points is in [0.6, 1]     c = trainClass.colRange(0 , 1);      rng.fill(c, RNG::UNIFORM, Scalar(0.6*WIDTH), Scalar(WIDTH));     // The y coordinate of the points is in [0, 1)     c = trainClass.colRange(1,2);     rng.fill(c, RNG::UNIFORM, Scalar(1), Scalar(HEIGHT));  In the second part we create data for both classes that is non-linearly separable, data that overlaps.  .. code-block:: cpp     // Generate random points for the classes 1 and 2     trainClass = trainData.rowRange(  nLinearSamples, 2*NTRAINING_SAMPLES-nLinearSamples);     // The x coordinate of the points is in [0.4, 0.6)     c = trainClass.colRange(0,1);     rng.fill(c, RNG::UNIFORM, Scalar(0.4*WIDTH), Scalar(0.6*WIDTH));      // The y coordinate of the points is in [0, 1)     c = trainClass.colRange(1,2);     rng.fill(c, RNG::UNIFORM, Scalar(1), Scalar(HEIGHT)); 2. **Set up SVM's parameters**  .. seealso::      In the previous tutorial :ref:`introductiontosvms` there is an explanation of the atributes of the class :svms:`CvSVMParams <cvsvmparams>` that we configure here before training the SVM.  .. 
code-block:: cpp     CvSVMParams params;     params.svm_type    = SVM::C_SVC;     params.C              = 0.1;     params.kernel_type = SVM::LINEAR;     params.term_crit   = TermCriteria(CV_TERMCRIT_ITER, (int)1e7, 1e-6);  There are just two differences between the configuration we do here and the one that was done in the :ref:`previous tutorial <introductiontosvms>` that we use as reference.  * *CvSVM::C_SVC*. We chose here a small value of this parameter in order not to punish too much the misclassification errors in the optimization. The idea of doing this stems from the will of obtaining a solution close to the one intuitively expected. However, we recommend to get a better insight of the problem by making adjustments to this parameter.      .. note:: Here there are just very few points in the overlapping region between classes, giving a smaller value to **FRAC_LINEAR_SEP** the density of points can be incremented and the impact of the parameter **CvSVM::C_SVC** explored deeply.  * *Termination Criteria of the algorithm*. The maximum number of iterations has to be increased considerably in order to solve correctly a problem with non-linearly separable training data. In particular, we have increased in five orders of magnitude this value.   3. **Train the SVM**  We call the method :svms:`CvSVM::train <cvsvm-train>` to build the SVM model. Watch out that the training process may take a quite long time. Have patiance when your run the program.  .. code-block:: cpp     CvSVM svm;     svm.train(trainData, labels, Mat(), Mat(), params);4. **Show the Decision Regions**  The method :svms:`CvSVM::predict <cvsvm-predict>` is used to classify an input sample using a trained SVM. In this example we have used this method in order to color the space depending on the prediction done by the SVM. In other words, an image is traversed interpreting its pixels as points of the Cartesian plane. 
Each of the points is colored depending on the class predicted by the SVM; in dark green if it is the class with label 1 and in dark blue if it is the class with label 2.  .. code-block:: cpp     Vec3b green(0,100,0), blue (100,0,0);     for (int i = 0; i < I.rows; ++i)          for (int j = 0; j < I.cols; ++j)          {               Mat sampleMat = (Mat_<float>(1,2) << i, j);               float response = svm.predict(sampleMat);               if      (response == 1)    I.at<Vec3b>(j, i)  = green;               else if (response == 2)    I.at<Vec3b>(j, i)  = blue;          }5. **Show the training data**  The method :drawingFunc:`circle <circle>` is used to show the samples that compose the training data. The samples of the class labeled with 1 are shown in light green and in light blue the samples of the class labeled with 2.  .. code-block:: cpp     int thick = -1;     int lineType = 8;     float px, py;     // Class 1     for (int i = 0; i < NTRAINING_SAMPLES; ++i)     {          px = trainData.at<float>(i,0);          py = trainData.at<float>(i,1);          circle(I, Point( (int) px,  (int) py ), 3, Scalar(0, 255, 0), thick, lineType);     }     // Class 2     for (int i = NTRAINING_SAMPLES; i <2*NTRAINING_SAMPLES; ++i)     {          px = trainData.at<float>(i,0);          py = trainData.at<float>(i,1);          circle(I, Point( (int) px, (int) py ), 3, Scalar(255, 0, 0), thick, lineType);     }6. **Support vectors**  We use here a couple of methods to obtain information about the support vectors. The method :svms:`CvSVM::get_support_vector_count <cvsvm-get-support-vector>` outputs the total number of support vectors used in the problem and with the method :svms:`CvSVM::get_support_vector <cvsvm-get-support-vector>` we obtain each of the support vectors using an index. We have used this methods here to find the training examples that are support vectors and highlight them.  .. 
code-block:: cpp     thick = 2;     lineType  = 8;     int x     = svm.get_support_vector_count();     for (int i = 0; i < x; ++i)     {          const float* v = svm.get_support_vector(i);          circle(     I,  Point( (int) v[0], (int) v[1]), 6, Scalar(128, 128, 128), thick, lineType);     }Results=======.. container:: enumeratevisibleitemswithsquare   * The code opens an image and shows the training examples of both classes. The points of one class are represented with light green and light blue ones are used for the other class.   * The SVM is trained and used to classify all the pixels of the image. This results in a division of the image in a blue region and a green region. The boundary between both regions is the separating hyperplane. Since the training data is non-linearly separable, it can be seen that some of the examples of both classes are misclassified; some green points lay on the blue region and some blue points lay on the green one.   * Finally the support vectors are shown using gray rings around the training examples... image:: images/result.png  :alt: Training data and decision regions given by the SVM  :width: 300pt  :align: center You may observe a runtime instance of this on the `YouTube here <https://www.youtube.com/watch?v=vFv2yPcSo-Q>`_. .. raw:: html  <div align="center">  <iframe title="Support Vector Machines for Non-Linearly Separable Data" width="560" height="349" src="http://www.youtube.com/embed/vFv2yPcSo-Q?rel=0&loop=1" frameborder="0" allowfullscreen align="middle"></iframe>  </div>
\ No newline at end of file
+.. _nonLinearSvmS:
+
+Support Vector Machines for Non-Linearly Separable Data
+*******************************************************
+
+Goal
+====
+
+In this tutorial you will learn how to:
+
+.. container:: enumeratevisibleitemswithsquare
+
+  + Define the optimization problem for SVMs when it is not possible to separate linearly the training data.
+
+  + How to configure the parameters in :svms:`CvSVMParams <cvsvmparams>` to adapt your SVM for this class of problems.
+
+Motivation
+==========
+
+Why is it interesting to extend the SVM optimization problem in order to handle non-linearly separable training data? Most of the applications in which SVMs are used in computer vision require a more powerful tool than a simple linear classifier. This stems from the fact that in these tasks **the training data can rarely be separated using a hyperplane**.
+
+Consider one of these tasks, for example, face detection. The training data in this case is composed of a set of images that are faces and another set of images that are non-faces (*every other thing in the world except faces*). This training data is too complex to find a representation of each sample (*feature vector*) that could make the whole set of faces linearly separable from the whole set of non-faces.
+
+Extension of the Optimization Problem
+=====================================
+
+Remember that using SVMs we obtain a separating hyperplane. Therefore, since the training data is now non-linearly separable, we must admit that the hyperplane found will misclassify some of the samples. This *misclassification* is a new variable in the optimization that must be taken into account. The new model has to include both the old requirement of finding the hyperplane that gives the biggest margin and the new one of generalizing the training data correctly by not allowing too many classification errors.
+
+We start here from the formulation of the optimization problem of finding the hyperplane which maximizes the **margin** (this is explained in the :ref:`previous tutorial <introductiontosvms>`):
+
+.. math::
+  \min_{\beta, \beta_{0}} L(\beta) = \frac{1}{2}||\beta||^{2} \text{ subject to } y_{i}(\beta^{T} x_{i} + \beta_{0}) \geq 1 \text{ } \forall i
+
+There are multiple ways in which this model can be modified so it takes into account the misclassification errors. For example, one could think of minimizing the same quantity plus a constant times the number of misclassification errors in the training data, i.e.:
+
+.. math::
+  \min ||\beta||^{2} + C \text{(\# misclassification errors)}
+
+However, this one is not a very good solution since, among some other reasons, we do not distinguish between samples that are misclassified with a small distance to their appropriate decision region or samples that are not. Therefore, a better solution will take into account the *distance of the misclassified samples to their correct decision regions*, i.e.:
+
+.. math::
+  \min ||\beta||^{2} + C \text{(distance of misclassified samples to their correct regions)}
+
+For each sample of the training data a new parameter :math:`\xi_{i}` is defined. Each one of these parameters contains the distance from its corresponding training sample to their correct decision region. The following picture shows non-linearly separable training data from two classes, a separating hyperplane and the distances to their correct regions of the samples that are misclassified.
+
+.. image:: images/sample-errors-dist.png
+   :alt: Samples misclassified and their distances to their correct regions
+   :align: center
+
+.. note:: Only the distances of the samples that are misclassified are shown in the picture. The distances of the rest of the samples are zero since they already lie in their correct decision region.
+
+The red and blue lines that appear on the picture are the margins to each one of the decision regions. It is very **important** to realize that each of the :math:`\xi_{i}` goes from a misclassified training sample to the margin of its appropriate region.
+
+Finally, the new formulation for the optimization problem is:
+
+.. math::
+  \min_{\beta, \beta_{0}} L(\beta) = ||\beta||^{2} + C \sum_{i} {\xi_{i}} \text{ subject to } y_{i}(\beta^{T} x_{i} + \beta_{0}) \geq 1 - \xi_{i} \text{ and } \xi_{i} \geq 0 \text{ } \forall i
+
+How should the parameter C be chosen? It is obvious that the answer to this question depends on how the training data is distributed. Although there is no general answer, it is useful to take into account these rules:
+
+.. container:: enumeratevisibleitemswithsquare
+
+   * Large values of C give solutions with *fewer misclassification errors* but a *smaller margin*. Consider that in this case it is expensive to make misclassification errors. Since the aim of the optimization is to minimize the argument, few misclassification errors are allowed.
+
+   * Small values of C give solutions with *bigger margin* and *more classification errors*. In this case the minimization does not consider that much the term of the sum so it focuses more on finding a hyperplane with big margin.
+
+Source Code
+===========
+
+You may also find the source code in the :file:`samples/cpp/tutorial_code/ml/non_linear_svms` folder of the OpenCV source library or :download:`download it from here <../../../../samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp>`.
+
+.. literalinclude:: ../../../../samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp
+   :language: cpp
+   :linenos:
+   :tab-width: 4
+   :lines: 1-11, 22-23, 26-
+
+Explanation
+===========
+
+1. **Set up the training data**
+
+  The training data of this exercise is formed by a set of labeled 2D-points that belong to one of two different classes. To make the exercise more appealing, the training data is generated randomly using uniform probability density functions (PDFs).
+
+  We have divided the generation of the training data into two main parts.
+
+  In the first part we generate data for both classes that is linearly separable.
+
+  .. code-block:: cpp
+
+     // Generate random points for the class 1
+     Mat trainClass = trainData.rowRange(0, nLinearSamples);
+     // The x coordinate of the points is in [0, 0.4)
+     Mat c = trainClass.colRange(0, 1);
+     rng.fill(c, RNG::UNIFORM, Scalar(1), Scalar(0.4 * WIDTH));
+     // The y coordinate of the points is in [0, 1)
+     c = trainClass.colRange(1,2);
+     rng.fill(c, RNG::UNIFORM, Scalar(1), Scalar(HEIGHT));
+
+     // Generate random points for the class 2
+     trainClass = trainData.rowRange(2*NTRAINING_SAMPLES-nLinearSamples, 2*NTRAINING_SAMPLES);
+     // The x coordinate of the points is in [0.6, 1]
+     c = trainClass.colRange(0 , 1);
+     rng.fill(c, RNG::UNIFORM, Scalar(0.6*WIDTH), Scalar(WIDTH));
+     // The y coordinate of the points is in [0, 1)
+     c = trainClass.colRange(1,2);
+     rng.fill(c, RNG::UNIFORM, Scalar(1), Scalar(HEIGHT));
+
+  In the second part we create data for both classes that is non-linearly separable, data that overlaps.
+
+  .. code-block:: cpp
+
+     // Generate random points for the classes 1 and 2
+     trainClass = trainData.rowRange(  nLinearSamples, 2*NTRAINING_SAMPLES-nLinearSamples);
+     // The x coordinate of the points is in [0.4, 0.6)
+     c = trainClass.colRange(0,1);
+     rng.fill(c, RNG::UNIFORM, Scalar(0.4*WIDTH), Scalar(0.6*WIDTH));
+     // The y coordinate of the points is in [0, 1)
+     c = trainClass.colRange(1,2);
+     rng.fill(c, RNG::UNIFORM, Scalar(1), Scalar(HEIGHT));
+
+2. **Set up SVM's parameters**
+
+  .. seealso::
+
+      In the previous tutorial :ref:`introductiontosvms` there is an explanation of the attributes of the class :svms:`CvSVMParams <cvsvmparams>` that we configure here before training the SVM.
+
+  .. code-block:: cpp
+
+     CvSVMParams params;
+     params.svm_type    = SVM::C_SVC;
+     params.C              = 0.1;
+     params.kernel_type = SVM::LINEAR;
+     params.term_crit   = TermCriteria(CV_TERMCRIT_ITER, (int)1e7, 1e-6);
+
+  There are just two differences between the configuration we do here and the one that was done in the :ref:`previous tutorial <introductiontosvms>` that we use as reference.
+
+  * *CvSVM::C_SVC*. We chose here a small value of this parameter in order not to punish too much the misclassification errors in the optimization. The idea of doing this stems from the will of obtaining a solution close to the one intuitively expected. However, we recommend to get a better insight of the problem by making adjustments to this parameter.
+
+      .. note:: Here there are just very few points in the overlapping region between classes. By giving a smaller value to **FRAC_LINEAR_SEP**, the density of points can be increased and the impact of the parameter **CvSVM::C_SVC** explored in depth.
+
+  * *Termination Criteria of the algorithm*. The maximum number of iterations has to be increased considerably in order to solve correctly a problem with non-linearly separable training data. In particular, we have increased this value by five orders of magnitude.
+
+3. **Train the SVM**
+
+  We call the method :svms:`CvSVM::train <cvsvm-train>` to build the SVM model. Watch out that the training process may take quite a long time. Have patience when you run the program.
+
+  .. code-block:: cpp
+
+     CvSVM svm;
+     svm.train(trainData, labels, Mat(), Mat(), params);
+
+4. **Show the Decision Regions**
+
+  The method :svms:`CvSVM::predict <cvsvm-predict>` is used to classify an input sample using a trained SVM. In this example we have used this method in order to color the space depending on the prediction done by the SVM. In other words, an image is traversed interpreting its pixels as points of the Cartesian plane. Each of the points is colored depending on the class predicted by the SVM; in dark green if it is the class with label 1 and in dark blue if it is the class with label 2.
+
+  .. code-block:: cpp
+
+     Vec3b green(0,100,0), blue (100,0,0);
+     for (int i = 0; i < I.rows; ++i)
+          for (int j = 0; j < I.cols; ++j)
+          {
+               Mat sampleMat = (Mat_<float>(1,2) << i, j);
+               float response = svm.predict(sampleMat);
+
+               if      (response == 1)    I.at<Vec3b>(j, i)  = green;
+               else if (response == 2)    I.at<Vec3b>(j, i)  = blue;
+          }
+
+5. **Show the training data**
+
+  The method :drawingFunc:`circle <circle>` is used to show the samples that compose the training data. The samples of the class labeled with 1 are shown in light green and in light blue the samples of the class labeled with 2.
+
+  .. code-block:: cpp
+
+     int thick = -1;
+     int lineType = 8;
+     float px, py;
+     // Class 1
+     for (int i = 0; i < NTRAINING_SAMPLES; ++i)
+     {
+          px = trainData.at<float>(i,0);
+          py = trainData.at<float>(i,1);
+          circle(I, Point( (int) px,  (int) py ), 3, Scalar(0, 255, 0), thick, lineType);
+     }
+     // Class 2
+     for (int i = NTRAINING_SAMPLES; i <2*NTRAINING_SAMPLES; ++i)
+     {
+          px = trainData.at<float>(i,0);
+          py = trainData.at<float>(i,1);
+          circle(I, Point( (int) px, (int) py ), 3, Scalar(255, 0, 0), thick, lineType);
+     }
+
+6. **Support vectors**
+
+  We use here a couple of methods to obtain information about the support vectors. The method :svms:`CvSVM::get_support_vector_count <cvsvm-get-support-vector>` outputs the total number of support vectors used in the problem and with the method :svms:`CvSVM::get_support_vector <cvsvm-get-support-vector>` we obtain each of the support vectors using an index. We have used these methods here to find the training examples that are support vectors and highlight them.
+
+  .. code-block:: cpp
+
+     thick = 2;
+     lineType  = 8;
+     int x     = svm.get_support_vector_count();
+
+     for (int i = 0; i < x; ++i)
+     {
+          const float* v = svm.get_support_vector(i);
+          circle(     I,  Point( (int) v[0], (int) v[1]), 6, Scalar(128, 128, 128), thick, lineType);
+     }
+
+Results
+========
+
+.. container:: enumeratevisibleitemswithsquare
+
+   * The code opens an image and shows the training examples of both classes. The points of one class are represented with light green and light blue ones are used for the other class.
+
+   * The SVM is trained and used to classify all the pixels of the image. This results in a division of the image into a blue region and a green region. The boundary between both regions is the separating hyperplane. Since the training data is non-linearly separable, it can be seen that some of the examples of both classes are misclassified; some green points lie in the blue region and some blue points lie in the green one.
+
+   * Finally the support vectors are shown using gray rings around the training examples.
+
+.. image:: images/result.png
+  :alt: Training data and decision regions given by the SVM
+  :width: 300pt
+  :align: center
+
+You may observe a runtime instance of this on the `YouTube here <https://www.youtube.com/watch?v=vFv2yPcSo-Q>`_.
+
+.. raw:: html
+
+  <div align="center">
+  <iframe title="Support Vector Machines for Non-Linearly Separable Data" width="560" height="349" src="http://www.youtube.com/embed/vFv2yPcSo-Q?rel=0&loop=1" frameborder="0" allowfullscreen align="middle"></iframe>
+  </div>
diff --git a/modules/calib3d/doc/camera_calibration_and_3d_reconstruction.rst b/modules/calib3d/doc/camera_calibration_and_3d_reconstruction.rst
index b4d6245f89..30f3102a7f 100644
--- a/modules/calib3d/doc/camera_calibration_and_3d_reconstruction.rst
+++ b/modules/calib3d/doc/camera_calibration_and_3d_reconstruction.rst
@@ -286,6 +286,8 @@ For points in an image of a stereo pair, computes the corresponding epilines in
 
 .. ocv:cfunction:: void cvComputeCorrespondEpilines( const CvMat* points, int which_image, const CvMat* fundamental_matrix, CvMat* correspondent_lines )
 
+.. ocv:pyfunction:: cv2.computeCorrespondEpilines(points, whichImage, F[, lines]) -> lines
+
     :param points: Input points.  :math:`N \times 1`  or  :math:`1 \times N`  matrix of type  ``CV_32FC2``  or  ``vector<Point2f>`` .
 
     :param whichImage: Index of the image (1 or 2) that contains the  ``points`` .
diff --git a/modules/core/CMakeLists.txt b/modules/core/CMakeLists.txt
index 6c133e8ebb..baa287bca7 100644
--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(the_description "The Core Functionality")
-ocv_add_module(core ${ZLIB_LIBRARIES})
+ocv_add_module(core ${ZLIB_LIBRARIES} OPTIONAL opencv_cudev)
 ocv_module_include_directories(${ZLIB_INCLUDE_DIR})
 
 if (HAVE_WINRT)
@@ -7,7 +7,7 @@ if (HAVE_WINRT)
 endif()
 
 if(HAVE_CUDA)
-  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef)
+  ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wenum-compare -Wunused-function)
 endif()
 
 file(GLOB lib_cuda_hdrs        "include/opencv2/${name}/cuda/*.hpp"        "include/opencv2/${name}/cuda/*.h")
diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp
index 637ecdf513..013f573a64 100644
--- a/modules/core/include/opencv2/core/base.hpp
+++ b/modules/core/include/opencv2/core/base.hpp
@@ -449,15 +449,15 @@ _AccTp normInf(const _Tp* a, const _Tp* b, int n)
 
 ////////////////// forward declarations for important OpenCV types //////////////////
 
-template<typename _Tp, int cn> class CV_EXPORTS Vec;
-template<typename _Tp, int m, int n> class CV_EXPORTS Matx;
+template<typename _Tp, int cn> class Vec;
+template<typename _Tp, int m, int n> class Matx;
 
-template<typename _Tp> class CV_EXPORTS Complex;
-template<typename _Tp> class CV_EXPORTS Point_;
-template<typename _Tp> class CV_EXPORTS Point3_;
-template<typename _Tp> class CV_EXPORTS Size_;
-template<typename _Tp> class CV_EXPORTS Rect_;
-template<typename _Tp> class CV_EXPORTS Scalar_;
+template<typename _Tp> class Complex;
+template<typename _Tp> class Point_;
+template<typename _Tp> class Point3_;
+template<typename _Tp> class Size_;
+template<typename _Tp> class Rect_;
+template<typename _Tp> class Scalar_;
 
 class CV_EXPORTS RotatedRect;
 class CV_EXPORTS Range;
@@ -472,16 +472,16 @@ class CV_EXPORTS MatExpr;
 class CV_EXPORTS SparseMat;
 typedef Mat MatND;
 
-template<typename _Tp> class CV_EXPORTS Mat_;
-template<typename _Tp> class CV_EXPORTS SparseMat_;
+template<typename _Tp> class Mat_;
+template<typename _Tp> class SparseMat_;
 
 class CV_EXPORTS MatConstIterator;
 class CV_EXPORTS SparseMatIterator;
 class CV_EXPORTS SparseMatConstIterator;
-template<typename _Tp> class CV_EXPORTS MatIterator_;
-template<typename _Tp> class CV_EXPORTS MatConstIterator_;
-template<typename _Tp> class CV_EXPORTS SparseMatIterator_;
-template<typename _Tp> class CV_EXPORTS SparseMatConstIterator_;
+template<typename _Tp> class MatIterator_;
+template<typename _Tp> class MatConstIterator_;
+template<typename _Tp> class SparseMatIterator_;
+template<typename _Tp> class SparseMatConstIterator_;
 
 namespace ogl
 {
@@ -498,6 +498,11 @@ namespace gpu
     class CV_EXPORTS Event;
 }
 
+namespace cudev
+{
+    template <typename _Tp> class GpuMat_;
+}
+
 } // cv
 
 #endif //__OPENCV_CORE_BASE_HPP__
diff --git a/modules/core/include/opencv2/core/core_c.h b/modules/core/include/opencv2/core/core_c.h
index 0ed3edec6d..74a1e403a2 100644
--- a/modules/core/include/opencv2/core/core_c.h
+++ b/modules/core/include/opencv2/core/core_c.h
@@ -1906,7 +1906,7 @@ typedef Ptr<CvMemStorage> MemStorage;
     i.e. no constructors or destructors
     are called for the sequence elements.
 */
-template<typename _Tp> class CV_EXPORTS Seq
+template<typename _Tp> class Seq
 {
 public:
     typedef SeqIterator<_Tp> iterator;
@@ -1989,7 +1989,7 @@ public:
 /*!
  STL-style Sequence Iterator inherited from the CvSeqReader structure
 */
-template<typename _Tp> class CV_EXPORTS SeqIterator : public CvSeqReader
+template<typename _Tp> class SeqIterator : public CvSeqReader
 {
 public:
     //! the default constructor
diff --git a/modules/core/include/opencv2/core/cvdef.h b/modules/core/include/opencv2/core/cvdef.h
index 5a70d38a71..46f294962d 100644
--- a/modules/core/include/opencv2/core/cvdef.h
+++ b/modules/core/include/opencv2/core/cvdef.h
@@ -201,8 +201,10 @@
 #if !defined _MSC_VER && !defined __BORLANDC__
 #  if defined __cplusplus && __cplusplus >= 201103L
 #    include <cstdint>
+     typedef std::uint32_t uint;
 #  else
 #    include <stdint.h>
+     typedef uint32_t uint;
 #  endif
 #else
    typedef unsigned uint;
diff --git a/modules/core/include/opencv2/core/cvstd.hpp b/modules/core/include/opencv2/core/cvstd.hpp
index 0f96941df1..5014dba0f7 100644
--- a/modules/core/include/opencv2/core/cvstd.hpp
+++ b/modules/core/include/opencv2/core/cvstd.hpp
@@ -127,7 +127,7 @@ CV_EXPORTS void fastFree(void* ptr);
 /*!
   The STL-compilant memory Allocator based on cv::fastMalloc() and cv::fastFree()
 */
-template<typename _Tp> class CV_EXPORTS Allocator
+template<typename _Tp> class Allocator
 {
 public:
     typedef _Tp value_type;
@@ -183,7 +183,7 @@ public:
   \note{Another good property of the class is that the operations on the reference counter are atomic,
   i.e. it is safe to use the class in multi-threaded applications}
 */
-template<typename _Tp> class CV_EXPORTS Ptr
+template<typename _Tp> class Ptr
 {
 public:
     //! empty constructor
diff --git a/modules/core/include/opencv2/core/mat.hpp b/modules/core/include/opencv2/core/mat.hpp
index d5826a9b61..c132be9456 100644
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -96,6 +96,7 @@ public:
     _InputArray(const gpu::GpuMat& d_mat);
     _InputArray(const ogl::Buffer& buf);
     _InputArray(const gpu::CudaMem& cuda_mem);
+    template<typename _Tp> _InputArray(const cudev::GpuMat_<_Tp>& m);
 
     virtual Mat getMat(int i=-1) const;
     virtual void getMatVector(std::vector<Mat>& mv) const;
@@ -144,6 +145,7 @@ public:
     _OutputArray(gpu::GpuMat& d_mat);
     _OutputArray(ogl::Buffer& buf);
     _OutputArray(gpu::CudaMem& cuda_mem);
+    template<typename _Tp> _OutputArray(cudev::GpuMat_<_Tp>& m);
     template<typename _Tp> _OutputArray(std::vector<_Tp>& vec);
     template<typename _Tp> _OutputArray(std::vector<std::vector<_Tp> >& vec);
     template<typename _Tp> _OutputArray(std::vector<Mat_<_Tp> >& vec);
@@ -156,6 +158,7 @@ public:
     _OutputArray(const gpu::GpuMat& d_mat);
     _OutputArray(const ogl::Buffer& buf);
     _OutputArray(const gpu::CudaMem& cuda_mem);
+    template<typename _Tp> _OutputArray(const cudev::GpuMat_<_Tp>& m);
     template<typename _Tp> _OutputArray(const std::vector<_Tp>& vec);
     template<typename _Tp> _OutputArray(const std::vector<std::vector<_Tp> >& vec);
     template<typename _Tp> _OutputArray(const std::vector<Mat_<_Tp> >& vec);
@@ -828,7 +831,7 @@ protected:
        img(i,j)[2] ^= (uchar)(i ^ j); // img(y,x)[c] accesses c-th channel of the pixel (x,y)
  \endcode
 */
-template<typename _Tp> class CV_EXPORTS Mat_ : public Mat
+template<typename _Tp> class Mat_ : public Mat
 {
 public:
     typedef _Tp value_type;
@@ -1355,7 +1358,7 @@ public:
  m_.ref(2) += m_(3); // equivalent to m.ref<int>(2) += m.value<int>(3);
  \endcode
 */
-template<typename _Tp> class CV_EXPORTS SparseMat_ : public SparseMat
+template<typename _Tp> class SparseMat_ : public SparseMat
 {
 public:
     typedef SparseMatIterator_<_Tp> iterator;
@@ -1727,7 +1730,7 @@ public:
  This is the derived from cv::SparseMatConstIterator_ class that
  introduces more convenient operator *() for accessing the current element.
 */
-template<typename _Tp> class CV_EXPORTS SparseMatIterator_ : public SparseMatConstIterator_<_Tp>
+template<typename _Tp> class SparseMatIterator_ : public SparseMatConstIterator_<_Tp>
 {
 public:
 
diff --git a/modules/core/include/opencv2/core/matx.hpp b/modules/core/include/opencv2/core/matx.hpp
index 19a4d28e49..86a35cd756 100644
--- a/modules/core/include/opencv2/core/matx.hpp
+++ b/modules/core/include/opencv2/core/matx.hpp
@@ -81,7 +81,7 @@ struct CV_EXPORTS Matx_DivOp {};
 struct CV_EXPORTS Matx_MatMulOp {};
 struct CV_EXPORTS Matx_TOp {};
 
-template<typename _Tp, int m, int n> class CV_EXPORTS Matx
+template<typename _Tp, int m, int n> class Matx
 {
 public:
     enum { depth    = DataType<_Tp>::depth,
@@ -286,7 +286,7 @@ template<typename _Tp, int m, int n> static double norm(const Matx<_Tp, m, n>& M
   In addition to the universal notation like Vec<float, 3>, you can use shorter aliases
   for the most popular specialized variants of Vec, e.g. Vec3f ~ Vec<float, 3>.
 */
-template<typename _Tp, int cn> class CV_EXPORTS Vec : public Matx<_Tp, cn, 1>
+template<typename _Tp, int cn> class Vec : public Matx<_Tp, cn, 1>
 {
 public:
     typedef _Tp value_type;
diff --git a/modules/core/include/opencv2/core/types.hpp b/modules/core/include/opencv2/core/types.hpp
index 05cf5052fa..67e551e628 100644
--- a/modules/core/include/opencv2/core/types.hpp
+++ b/modules/core/include/opencv2/core/types.hpp
@@ -68,7 +68,7 @@ namespace cv
   more convenient access to the real and imaginary parts using through the simple field access, as opposite
   to std::complex::real() and std::complex::imag().
 */
-template<typename _Tp> class CV_EXPORTS Complex
+template<typename _Tp> class Complex
 {
 public:
 
@@ -120,7 +120,7 @@ public:
   as a template parameter. There are a few shorter aliases available for user convenience.
   See cv::Point, cv::Point2i, cv::Point2f and cv::Point2d.
 */
-template<typename _Tp> class CV_EXPORTS Point_
+template<typename _Tp> class Point_
 {
 public:
     typedef _Tp value_type;
@@ -191,7 +191,7 @@ public:
 
   \see cv::Point3i, cv::Point3f and cv::Point3d
 */
-template<typename _Tp> class CV_EXPORTS Point3_
+template<typename _Tp> class Point3_
 {
 public:
     typedef _Tp value_type;
@@ -256,7 +256,7 @@ public:
   The class represents the size of a 2D rectangle, image size, matrix size etc.
   Normally, cv::Size ~ cv::Size_<int> is used.
 */
-template<typename _Tp> class CV_EXPORTS Size_
+template<typename _Tp> class Size_
 {
 public:
     typedef _Tp value_type;
@@ -314,7 +314,7 @@ public:
   The class represents a 2D rectangle with coordinates of the specified data type.
   Normally, cv::Rect ~ cv::Rect_<int> is used.
 */
-template<typename _Tp> class CV_EXPORTS Rect_
+template<typename _Tp> class Rect_
 {
 public:
     typedef _Tp value_type;
@@ -470,7 +470,7 @@ public:
    This is partially specialized cv::Vec class with the number of elements = 4, i.e. a short vector of four elements.
    Normally, cv::Scalar ~ cv::Scalar_<double> is used.
 */
-template<typename _Tp> class CV_EXPORTS Scalar_ : public Vec<_Tp, 4>
+template<typename _Tp> class Scalar_ : public Vec<_Tp, 4>
 {
 public:
     //! various constructors
diff --git a/modules/core/include/opencv2/core/utility.hpp b/modules/core/include/opencv2/core/utility.hpp
index d2942f8c5f..5cc538623b 100644
--- a/modules/core/include/opencv2/core/utility.hpp
+++ b/modules/core/include/opencv2/core/utility.hpp
@@ -80,7 +80,7 @@ namespace cv
  }
  \endcode
 */
-template<typename _Tp, size_t fixed_size = 1024/sizeof(_Tp)+8> class CV_EXPORTS AutoBuffer
+template<typename _Tp, size_t fixed_size = 1024/sizeof(_Tp)+8> class AutoBuffer
 {
 public:
     typedef _Tp value_type;
diff --git a/modules/core/src/cuda/gpu_mat.cu b/modules/core/src/cuda/gpu_mat.cu
new file mode 100644
index 0000000000..0db1584212
--- /dev/null
+++ b/modules/core/src/cuda/gpu_mat.cu
@@ -0,0 +1,486 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "opencv2/opencv_modules.hpp"
+
+#ifndef HAVE_OPENCV_CUDEV
+
+#error "opencv_cudev is required"
+
+#else
+
+#include "opencv2/core/gpu.hpp"
+#include "opencv2/cudev.hpp"
+
+using namespace cv;
+using namespace cv::gpu;
+using namespace cv::cudev;
+
+/////////////////////////////////////////////////////
+/// create: (re)allocates a _rows x _cols device buffer of type _type unless the current one already matches
+
+void cv::gpu::GpuMat::create(int _rows, int _cols, int _type)
+{
+    CV_DbgAssert( _rows >= 0 && _cols >= 0 );
+
+    _type &= Mat::TYPE_MASK; // keep only the bits covered by Mat::TYPE_MASK
+
+    if (rows == _rows && cols == _cols && type() == _type && data) // same shape and type with data present: nothing to do
+        return;
+
+    if (data)
+        release(); // give up the old buffer before allocating a new one
+
+    if (_rows > 0 && _cols > 0) // a request with a zero dimension allocates nothing
+    {
+        flags = Mat::MAGIC_VAL + _type;
+        rows = _rows;
+        cols = _cols;
+
+        size_t esz = elemSize(); // bytes per element, valid now that flags carries the new type
+
+        void* devPtr;
+
+        if (rows > 1 && cols > 1)
+        {
+            CV_CUDEV_SAFE_CALL( cudaMallocPitch(&devPtr, &step, esz * cols, rows) ); // the row pitch is chosen by cudaMallocPitch and written to step
+        }
+        else
+        {
+            // Single row or single column must be continuous
+            CV_CUDEV_SAFE_CALL( cudaMalloc(&devPtr, esz * cols * rows) );
+            step = esz * cols;
+        }
+
+        if (esz * cols == step) // rows are tightly packed
+            flags |= Mat::CONTINUOUS_FLAG;
+
+        int64 _nettosize = static_cast<int64>(step) * rows; // byte span of the allocation, computed in 64 bits
+        size_t nettosize = static_cast<size_t>(_nettosize);
+
+        datastart = data = static_cast<uchar*>(devPtr);
+        dataend = data + nettosize;
+
+        refcount = static_cast<int*>(fastMalloc(sizeof(*refcount))); // counter comes from fastMalloc; release() returns it with fastFree
+        *refcount = 1;
+    }
+}
+
+/////////////////////////////////////////////////////
+/// release: decrements the reference counter, frees the buffer when CV_XADD reports 1, and resets the header
+
+void cv::gpu::GpuMat::release()
+{
+    if (refcount && CV_XADD(refcount, -1) == 1) // NOTE(review): assumes CV_XADD returns the value before the decrement, i.e. 1 means last owner -- confirm
+    {
+        cudaFree(datastart); // return code is not checked here
+        fastFree(refcount);
+    }
+
+    data = datastart = dataend = 0; // header is reset whether or not the buffer was freed
+    step = rows = cols = 0;
+    refcount = 0;
+}
+
+/////////////////////////////////////////////////////
+/// upload: copies a host array into this GpuMat after sizing it like the source (blocking cudaMemcpy2D)
+
+void cv::gpu::GpuMat::upload(InputArray arr)
+{
+    Mat mat = arr.getMat(); // host-side view of the input
+
+    CV_DbgAssert( !mat.empty() );
+
+    create(mat.size(), mat.type()); // destination takes the size and type of the source
+
+    CV_CUDEV_SAFE_CALL( cudaMemcpy2D(data, step, mat.data, mat.step, cols * elemSize(), rows, cudaMemcpyHostToDevice) ); // cols * elemSize() bytes per row; each side uses its own step
+}
+
+void cv::gpu::GpuMat::upload(InputArray arr, Stream& _stream)
+{
+    Mat mat = arr.getMat(); // stream overload of upload(): same steps, but the copy is issued with cudaMemcpy2DAsync
+
+    CV_DbgAssert( !mat.empty() );
+
+    create(mat.size(), mat.type());
+
+    cudaStream_t stream = StreamAccessor::getStream(_stream); // CUDA stream handle obtained from the Stream argument
+    CV_CUDEV_SAFE_CALL( cudaMemcpy2DAsync(data, step, mat.data, mat.step, cols * elemSize(), rows, cudaMemcpyHostToDevice, stream) ); // no synchronization is performed in this function
+}
+
+/////////////////////////////////////////////////////
+/// download: copies this GpuMat into a host array created with the same size and type (blocking cudaMemcpy2D)
+
+void cv::gpu::GpuMat::download(OutputArray _dst) const
+{
+    CV_DbgAssert( !empty() );
+
+    _dst.create(size(), type());
+    Mat dst = _dst.getMat(); // host-side header for the destination just created
+
+    CV_CUDEV_SAFE_CALL( cudaMemcpy2D(dst.data, dst.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToHost) ); // cols * elemSize() bytes per row; each side uses its own step
+}
+
+void cv::gpu::GpuMat::download(OutputArray _dst, Stream& _stream) const
+{
+    CV_DbgAssert( !empty() ); // stream overload of download(): the copy is issued with cudaMemcpy2DAsync
+
+    _dst.create(size(), type());
+    Mat dst = _dst.getMat();
+
+    cudaStream_t stream = StreamAccessor::getStream(_stream);
+    CV_CUDEV_SAFE_CALL( cudaMemcpy2DAsync(dst.data, dst.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToHost, stream) ); // NOTE(review): nothing here waits for the stream; presumably the caller must synchronize before reading dst
+}
+
+/////////////////////////////////////////////////////
+/// copyTo: device-to-device copy into a destination created with the same size and type
+
+void cv::gpu::GpuMat::copyTo(OutputArray _dst) const
+{
+    CV_DbgAssert( !empty() );
+
+    _dst.create(size(), type());
+    GpuMat dst = _dst.getGpuMat(); // local header for the destination
+
+    CV_CUDEV_SAFE_CALL( cudaMemcpy2D(dst.data, dst.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToDevice) ); // source and destination each use their own step
+}
+
+void cv::gpu::GpuMat::copyTo(OutputArray _dst, Stream& _stream) const
+{
+    CV_DbgAssert( !empty() ); // stream overload of copyTo(): the copy is issued with cudaMemcpy2DAsync
+
+    _dst.create(size(), type());
+    GpuMat dst = _dst.getGpuMat();
+
+    cudaStream_t stream = StreamAccessor::getStream(_stream); // CUDA stream handle obtained from the Stream argument
+    CV_CUDEV_SAFE_CALL( cudaMemcpy2DAsync(dst.data, dst.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToDevice, stream) ); // no synchronization is performed in this function
+}
+
+namespace
+{
+    template <size_t size> struct CopyToPolicy : DefaultTransformPolicy // transform policy selected by bytes per channel element; sizes not specialized below use the defaults
+    {
+    };
+    template <> struct CopyToPolicy<4> : DefaultTransformPolicy
+    {
+        enum {
+            shift = 2 // value used for 4-byte elements -- NOTE(review): the meaning of shift is defined by cudev's transform policy, confirm there
+        };
+    };
+    template <> struct CopyToPolicy<8> : DefaultTransformPolicy
+    {
+        enum {
+            shift = 1 // value used for 8-byte elements
+        };
+    };
+
+    template <typename T>
+    void copyWithMask(const GpuMat& src, const GpuMat& dst, const GpuMat& mask, Stream& stream) // masked copy src -> dst, both viewed as elements of type T
+    {
+        gridTransform_< CopyToPolicy<sizeof(typename VecTraits<T>::elem_type)> >(globPtr<T>(src), globPtr<T>(dst), identity<T>(), globPtr<uchar>(mask), stream); // identity functor passes values through; mask is read as uchar (presumably non-zero means "write" -- confirm in cudev)
+    }
+}
+
+void cv::gpu::GpuMat::copyTo(OutputArray _dst, InputArray _mask, Stream& stream) const
+{
+    CV_DbgAssert( !empty() ); // masked copyTo: dispatches to a copyWithMask instantiation chosen by element size and channel count
+    CV_DbgAssert( depth() <= CV_64F && channels() <= 4 );
+
+    GpuMat mask = _mask.getGpuMat();
+    CV_DbgAssert( size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == channels()) );
+
+    _dst.create(size(), type());
+    GpuMat dst = _dst.getGpuMat();
+
+    typedef void (*func_t)(const GpuMat& src, const GpuMat& dst, const GpuMat& mask, Stream& stream);
+    static const func_t funcs[9][4] = // indexed [elemSize1()][channels - 1]; only rows 1, 2, 4 and 8 are populated
+    {
+        {0,0,0,0},
+        {copyWithMask<uchar>, copyWithMask<uchar2>, copyWithMask<uchar3>, copyWithMask<uchar4>},
+        {copyWithMask<ushort>, copyWithMask<ushort2>, copyWithMask<ushort3>, copyWithMask<ushort4>},
+        {0,0,0,0},
+        {copyWithMask<int>, copyWithMask<int2>, copyWithMask<int3>, copyWithMask<int4>},
+        {0,0,0,0},
+        {0,0,0,0},
+        {0,0,0,0},
+        {copyWithMask<double>, copyWithMask<double2>, copyWithMask<double3>, copyWithMask<double4>}
+    };
+
+    if (mask.channels() == channels()) // mask has one value per channel: view everything as single-channel
+    {
+        const func_t func = funcs[elemSize1()][0];
+        CV_DbgAssert( func != 0 );
+        func(reshape(1), dst.reshape(1), mask.reshape(1), stream);
+    }
+    else // otherwise the assertion above leaves a single-channel mask: one mask value per pixel
+    {
+        const func_t func = funcs[elemSize1()][channels() - 1];
+        CV_DbgAssert( func != 0 );
+        func(*this, dst, mask, stream);
+    }
+}
+
+/////////////////////////////////////////////////////
+/// setTo: kernel-launching helpers used by the GpuMat::setTo dispatch tables
+
+namespace
+{
+    template <typename T>
+    void setToWithOutMask(const GpuMat& mat, Scalar _scalar, Stream& stream) // fills mat (viewed as elements of type T) with _scalar
+    {
+        Scalar_<typename VecTraits<T>::elem_type> scalar = _scalar; // convert the scalar to the channel element type of T
+        gridTransform(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), stream); // source is a constant of mat's dimensions, passed through identity
+    }
+
+    template <typename T>
+    void setToWithMask(const GpuMat& mat, const GpuMat& mask, Scalar _scalar, Stream& stream) // same as above, with a uchar mask handed to gridTransform
+    {
+        Scalar_<typename VecTraits<T>::elem_type> scalar = _scalar;
+        gridTransform(constantPtr(VecTraits<T>::make(scalar.val), mat.rows, mat.cols), globPtr<T>(mat), identity<T>(), globPtr<uchar>(mask), stream);
+    }
+}
+
+GpuMat& cv::gpu::GpuMat::setTo(Scalar value, Stream& stream)
+{
+    CV_DbgAssert( !empty() ); // fills the whole matrix with value; two memset fast paths are tried before the generic kernel
+    CV_DbgAssert( depth() <= CV_64F && channels() <= 4 );
+
+    if (value[0] == 0.0 && value[1] == 0.0 && value[2] == 0.0 && value[3] == 0.0)
+    {
+        // Zero fill: all four scalar components are zero, so zero bytes are written whatever the depth
+
+        if (stream) // stream is tested as a boolean to choose the async variant
+            CV_CUDEV_SAFE_CALL( cudaMemset2DAsync(data, step, 0, cols * elemSize(), rows, StreamAccessor::getStream(stream)) );
+        else
+            CV_CUDEV_SAFE_CALL( cudaMemset2D(data, step, 0, cols * elemSize(), rows) );
+
+        return *this;
+    }
+
+    if (depth() == CV_8U) // byte data: a memset works when every used channel gets the same value
+    {
+        const int cn = channels();
+
+        if (cn == 1
+                || (cn == 2 && value[0] == value[1])
+                || (cn == 3 && value[0] == value[1] && value[0] == value[2])
+                || (cn == 4 && value[0] == value[1] && value[0] == value[2] && value[0] == value[3]))
+        {
+            const int val = cv::saturate_cast<uchar>(value[0]); // byte value written to every channel
+
+            if (stream)
+                CV_CUDEV_SAFE_CALL( cudaMemset2DAsync(data, step, val, cols * elemSize(), rows, StreamAccessor::getStream(stream)) );
+            else
+                CV_CUDEV_SAFE_CALL( cudaMemset2D(data, step, val, cols * elemSize(), rows) );
+
+            return *this;
+        }
+    }
+
+    typedef void (*func_t)(const GpuMat& mat, Scalar scalar, Stream& stream);
+    static const func_t funcs[7][4] = // generic path, indexed [depth()][channels() - 1]
+    {
+        {setToWithOutMask<uchar>,setToWithOutMask<uchar2>,setToWithOutMask<uchar3>,setToWithOutMask<uchar4>},
+        {setToWithOutMask<schar>,setToWithOutMask<char2>,setToWithOutMask<char3>,setToWithOutMask<char4>},
+        {setToWithOutMask<ushort>,setToWithOutMask<ushort2>,setToWithOutMask<ushort3>,setToWithOutMask<ushort4>},
+        {setToWithOutMask<short>,setToWithOutMask<short2>,setToWithOutMask<short3>,setToWithOutMask<short4>},
+        {setToWithOutMask<int>,setToWithOutMask<int2>,setToWithOutMask<int3>,setToWithOutMask<int4>},
+        {setToWithOutMask<float>,setToWithOutMask<float2>,setToWithOutMask<float3>,setToWithOutMask<float4>},
+        {setToWithOutMask<double>,setToWithOutMask<double2>,setToWithOutMask<double3>,setToWithOutMask<double4>}
+    };
+
+    funcs[depth()][channels() - 1](*this, value, stream);
+
+    return *this;
+}
+
+GpuMat& cv::gpu::GpuMat::setTo(Scalar value, InputArray _mask, Stream& stream)
+{
+    CV_DbgAssert( !empty() ); // masked setTo: always takes the kernel path (no memset shortcut in this overload)
+    CV_DbgAssert( depth() <= CV_64F && channels() <= 4 );
+
+    GpuMat mask = _mask.getGpuMat();
+
+    CV_DbgAssert( size() == mask.size() && mask.type() == CV_8UC1 ); // mask must match the matrix size and be single-channel 8-bit
+
+    typedef void (*func_t)(const GpuMat& mat, const GpuMat& mask, Scalar scalar, Stream& stream);
+    static const func_t funcs[7][4] = // indexed [depth()][channels() - 1]
+    {
+        {setToWithMask<uchar>,setToWithMask<uchar2>,setToWithMask<uchar3>,setToWithMask<uchar4>},
+        {setToWithMask<schar>,setToWithMask<char2>,setToWithMask<char3>,setToWithMask<char4>},
+        {setToWithMask<ushort>,setToWithMask<ushort2>,setToWithMask<ushort3>,setToWithMask<ushort4>},
+        {setToWithMask<short>,setToWithMask<short2>,setToWithMask<short3>,setToWithMask<short4>},
+        {setToWithMask<int>,setToWithMask<int2>,setToWithMask<int3>,setToWithMask<int4>},
+        {setToWithMask<float>,setToWithMask<float2>,setToWithMask<float3>,setToWithMask<float4>},
+        {setToWithMask<double>,setToWithMask<double2>,setToWithMask<double3>,setToWithMask<double4>}
+    };
+
+    funcs[depth()][channels() - 1](*this, mask, value, stream);
+
+    return *this;
+}
+
+/////////////////////////////////////////////////////
+/// convertTo: policies, functor and kernel-launching helpers used by the GpuMat::convertTo dispatch tables
+
+namespace
+{
+    template <typename T> struct ConvertToPolicy : DefaultTransformPolicy // transform policy selected by the working scalar type; defaults unless specialized
+    {
+    };
+    template <> struct ConvertToPolicy<double> : DefaultTransformPolicy
+    {
+        enum {
+            shift = 1 // value used when the working type is double -- NOTE(review): shift semantics are defined in cudev, confirm there
+        };
+    };
+
+    template <typename T, typename D>
+    void convertToNoScale(const GpuMat& src, const GpuMat& dst, Stream& stream) // element-wise saturating conversion T -> D
+    {
+        typedef typename VecTraits<T>::elem_type src_elem_type;
+        typedef typename VecTraits<D>::elem_type dst_elem_type;
+        typedef typename LargerType<src_elem_type, float>::type larger_elem_type; // NOTE(review): not used below
+        typedef typename LargerType<float, dst_elem_type>::type scalar_type; // derived from float and the destination element type only
+
+        gridTransform_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), saturate_cast_func<T, D>(), stream);
+    }
+
+    template <typename T, typename D, typename S> struct Convertor : unary_function<T, D> // device functor computing saturate_cast<D>(alpha * src + beta)
+    {
+        S alpha;
+        S beta;
+
+        __device__ __forceinline__ D operator ()(typename TypeTraits<T>::parameter_type src) const
+        {
+            return cudev::saturate_cast<D>(alpha * src + beta);
+        }
+    };
+
+    template <typename T, typename D>
+    void convertToScale(const GpuMat& src, const GpuMat& dst, double alpha, double beta, Stream& stream) // element-wise T -> D conversion with scale alpha and offset beta
+    {
+        typedef typename VecTraits<T>::elem_type src_elem_type;
+        typedef typename VecTraits<D>::elem_type dst_elem_type;
+        typedef typename LargerType<src_elem_type, float>::type larger_elem_type; // NOTE(review): not used below
+        typedef typename LargerType<float, dst_elem_type>::type scalar_type; // type in which alpha and beta are stored
+
+        Convertor<T, D, scalar_type> op;
+        op.alpha = cv::saturate_cast<scalar_type>(alpha); // the double coefficients are narrowed to scalar_type on the host
+        op.beta = cv::saturate_cast<scalar_type>(beta);
+
+        gridTransform_< ConvertToPolicy<scalar_type> >(globPtr<T>(src), globPtr<D>(dst), op, stream);
+    }
+}
+
+void cv::gpu::GpuMat::convertTo(OutputArray _dst, int rtype, Stream& stream) const
+{
+    if (rtype < 0) // depth conversion without scaling; a negative rtype keeps the source type
+        rtype = type();
+    else
+        rtype = CV_MAKE_TYPE(CV_MAT_DEPTH(rtype), channels()); // only the depth of rtype is used; the channel count comes from the source
+
+    const int sdepth = depth();
+    const int ddepth = CV_MAT_DEPTH(rtype);
+    if (sdepth == ddepth) // same depth: a plain copy is enough
+    {
+        if (stream)
+            copyTo(_dst, stream);
+        else
+            copyTo(_dst);
+
+        return;
+    }
+
+    CV_DbgAssert( sdepth <= CV_64F && ddepth <= CV_64F );
+
+    GpuMat src = *this; // second header taken before _dst.create(): if _dst is *this, create() replaces the buffer of *this
+
+    _dst.create(size(), rtype);
+    GpuMat dst = _dst.getGpuMat();
+
+    typedef void (*func_t)(const GpuMat& src, const GpuMat& dst, Stream& stream);
+    static const func_t funcs[7][7] = // indexed [source depth][destination depth]; the diagonal is handled by the copy above
+    {
+        {0, convertToNoScale<uchar, schar>, convertToNoScale<uchar, ushort>, convertToNoScale<uchar, short>, convertToNoScale<uchar, int>, convertToNoScale<uchar, float>, convertToNoScale<uchar, double>},
+        {convertToNoScale<schar, uchar>, 0, convertToNoScale<schar, ushort>, convertToNoScale<schar, short>, convertToNoScale<schar, int>, convertToNoScale<schar, float>, convertToNoScale<schar, double>},
+        {convertToNoScale<ushort, uchar>, convertToNoScale<ushort, schar>, 0, convertToNoScale<ushort, short>, convertToNoScale<ushort, int>, convertToNoScale<ushort, float>, convertToNoScale<ushort, double>},
+        {convertToNoScale<short, uchar>, convertToNoScale<short, schar>, convertToNoScale<short, ushort>, 0, convertToNoScale<short, int>, convertToNoScale<short, float>, convertToNoScale<short, double>},
+        {convertToNoScale<int, uchar>, convertToNoScale<int, schar>, convertToNoScale<int, ushort>, convertToNoScale<int, short>, 0, convertToNoScale<int, float>, convertToNoScale<int, double>},
+        {convertToNoScale<float, uchar>, convertToNoScale<float, schar>, convertToNoScale<float, ushort>, convertToNoScale<float, short>, convertToNoScale<float, int>, 0, convertToNoScale<float, double>},
+        {convertToNoScale<double, uchar>, convertToNoScale<double, schar>, convertToNoScale<double, ushort>, convertToNoScale<double, short>, convertToNoScale<double, int>, convertToNoScale<double, float>, 0}
+    };
+
+    funcs[sdepth][ddepth](src.reshape(1), dst.reshape(1), stream); // read through src, not *this, which may now be the destination; both sides viewed as single-channel
+}
+
+void cv::gpu::GpuMat::convertTo(OutputArray _dst, int rtype, double alpha, double beta, Stream& stream) const
+{
+    if (rtype < 0) // depth conversion with scale alpha and offset beta; a negative rtype keeps the source type
+        rtype = type();
+    else
+        rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels()); // only the depth of rtype is used; the channel count comes from the source
+
+    const int sdepth = depth();
+    const int ddepth = CV_MAT_DEPTH(rtype);
+
+    GpuMat src = *this; // second header taken before _dst.create(): if _dst is *this, create() replaces the buffer of *this
+
+    _dst.create(size(), rtype);
+    GpuMat dst = _dst.getGpuMat();
+
+    typedef void (*func_t)(const GpuMat& src, const GpuMat& dst, double alpha, double beta, Stream& stream);
+    static const func_t funcs[7][7] = // indexed [source depth][destination depth]; the diagonal is populated because scaling applies even at equal depth
+    {
+        {convertToScale<uchar, uchar>, convertToScale<uchar, schar>, convertToScale<uchar, ushort>, convertToScale<uchar, short>, convertToScale<uchar, int>, convertToScale<uchar, float>, convertToScale<uchar, double>},
+        {convertToScale<schar, uchar>, convertToScale<schar, schar>, convertToScale<schar, ushort>, convertToScale<schar, short>, convertToScale<schar, int>, convertToScale<schar, float>, convertToScale<schar, double>},
+        {convertToScale<ushort, uchar>, convertToScale<ushort, schar>, convertToScale<ushort, ushort>, convertToScale<ushort, short>, convertToScale<ushort, int>, convertToScale<ushort, float>, convertToScale<ushort, double>},
+        {convertToScale<short, uchar>, convertToScale<short, schar>, convertToScale<short, ushort>, convertToScale<short, short>, convertToScale<short, int>, convertToScale<short, float>, convertToScale<short, double>},
+        {convertToScale<int, uchar>, convertToScale<int, schar>, convertToScale<int, ushort>, convertToScale<int, short>, convertToScale<int, int>, convertToScale<int, float>, convertToScale<int, double>},
+        {convertToScale<float, uchar>, convertToScale<float, schar>, convertToScale<float, ushort>, convertToScale<float, short>, convertToScale<float, int>, convertToScale<float, float>, convertToScale<float, double>},
+        {convertToScale<double, uchar>, convertToScale<double, schar>, convertToScale<double, ushort>, convertToScale<double, short>, convertToScale<double, int>, convertToScale<double, float>, convertToScale<double, double>}
+    };
+
+    funcs[sdepth][ddepth](src.reshape(1), dst.reshape(1), alpha, beta, stream); // read through src, not *this, which may now be the destination; both sides viewed as single-channel
+}
+
+#endif
diff --git a/modules/core/src/cuda/matrix_operations.cu b/modules/core/src/cuda/matrix_operations.cu
deleted file mode 100644
index 7de5205ec6..0000000000
--- a/modules/core/src/cuda/matrix_operations.cu
+++ /dev/null
@@ -1,296 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "opencv2/core/cuda/saturate_cast.hpp"
-#include "opencv2/core/cuda/transform.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/type_traits.hpp"
-#include "opencv2/core/cuda/vec_traits.hpp"
-
-#include "matrix_operations.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    ///////////////////////////////////////////////////////////////////////////
-    // copyWithMask
-
-    template <typename T>
-    void copyWithMask(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool multiChannelMask, cudaStream_t stream)
-    {
-        if (multiChannelMask)
-            cv::gpu::cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, identity<T>(), SingleMask(mask), stream);
-        else
-            cv::gpu::cudev::transform((PtrStepSz<T>) src, (PtrStepSz<T>) dst, identity<T>(), SingleMaskChannels(mask, cn), stream);
-    }
-
-    void copyWithMask(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool multiChannelMask, cudaStream_t stream)
-    {
-        typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool multiChannelMask, cudaStream_t stream);
-
-        static const func_t tab[] =
-        {
-            0,
-            copyWithMask<uchar>,
-            copyWithMask<ushort>,
-            0,
-            copyWithMask<int>,
-            0,
-            0,
-            0,
-            copyWithMask<double>
-        };
-
-        const func_t func = tab[elemSize1];
-        CV_DbgAssert( func != 0 );
-
-        func(src, dst, cn, mask, multiChannelMask, stream);
-    }
-
-    ///////////////////////////////////////////////////////////////////////////
-    // set
-
-    template<typename T, class Mask>
-    __global__ void set(PtrStepSz<T> mat, const Mask mask, const int channels, const typename TypeVec<T, 4>::vec_type value)
-    {
-        const int x = blockIdx.x * blockDim.x + threadIdx.x;
-        const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-        if (x >= mat.cols * channels || y >= mat.rows)
-            return;
-
-        const T scalar[4] = {value.x, value.y, value.z, value.w};
-
-        if (mask(y, x / channels))
-            mat(y, x) = scalar[x % channels];
-    }
-
-    template <typename T>
-    void set(PtrStepSz<T> mat, const T* scalar, int channels, cudaStream_t stream)
-    {
-        typedef typename TypeVec<T, 4>::vec_type scalar_t;
-
-        dim3 block(32, 8);
-        dim3 grid(divUp(mat.cols * channels, block.x), divUp(mat.rows, block.y));
-
-        set<T><<<grid, block, 0, stream>>>(mat, WithOutMask(), channels, VecTraits<scalar_t>::make(scalar));
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall ( cudaDeviceSynchronize() );
-    }
-
-    template void set<uchar >(PtrStepSz<uchar > mat, const uchar*  scalar, int channels, cudaStream_t stream);
-    template void set<schar >(PtrStepSz<schar > mat, const schar*  scalar, int channels, cudaStream_t stream);
-    template void set<ushort>(PtrStepSz<ushort> mat, const ushort* scalar, int channels, cudaStream_t stream);
-    template void set<short >(PtrStepSz<short > mat, const short*  scalar, int channels, cudaStream_t stream);
-    template void set<int   >(PtrStepSz<int   > mat, const int*    scalar, int channels, cudaStream_t stream);
-    template void set<float >(PtrStepSz<float > mat, const float*  scalar, int channels, cudaStream_t stream);
-    template void set<double>(PtrStepSz<double> mat, const double* scalar, int channels, cudaStream_t stream);
-
-    template <typename T>
-    void set(PtrStepSz<T> mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream)
-    {
-        typedef typename TypeVec<T, 4>::vec_type scalar_t;
-
-        dim3 block(32, 8);
-        dim3 grid(divUp(mat.cols * channels, block.x), divUp(mat.rows, block.y));
-
-        set<T><<<grid, block, 0, stream>>>(mat, SingleMask(mask), channels, VecTraits<scalar_t>::make(scalar));
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall ( cudaDeviceSynchronize() );
-    }
-
-    template void set<uchar >(PtrStepSz<uchar > mat, const uchar*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set<schar >(PtrStepSz<schar > mat, const schar*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set<ushort>(PtrStepSz<ushort> mat, const ushort* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set<short >(PtrStepSz<short > mat, const short*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set<int   >(PtrStepSz<int   > mat, const int*    scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set<float >(PtrStepSz<float > mat, const float*  scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-    template void set<double>(PtrStepSz<double> mat, const double* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-
-    ///////////////////////////////////////////////////////////////////////////
-    // convert
-
-    template <typename T, typename D, typename S> struct Convertor : unary_function<T, D>
-    {
-        Convertor(S alpha_, S beta_) : alpha(alpha_), beta(beta_) {}
-
-        __device__ __forceinline__ D operator()(typename TypeTraits<T>::ParameterType src) const
-        {
-            return saturate_cast<D>(alpha * src + beta);
-        }
-
-        S alpha, beta;
-    };
-
-    namespace detail
-    {
-        template <size_t src_size, size_t dst_size, typename F> struct ConvertTraitsDispatcher : DefaultTransformFunctorTraits<F>
-        {
-        };
-        template <typename F> struct ConvertTraitsDispatcher<1, 1, F> : DefaultTransformFunctorTraits<F>
-        {
-            enum { smart_shift = 8 };
-        };
-        template <typename F> struct ConvertTraitsDispatcher<1, 2, F> : DefaultTransformFunctorTraits<F>
-        {
-            enum { smart_shift = 4 };
-        };
-        template <typename F> struct ConvertTraitsDispatcher<1, 4, F> : DefaultTransformFunctorTraits<F>
-        {
-            enum { smart_block_dim_y = 8 };
-            enum { smart_shift = 4 };
-        };
-
-        template <typename F> struct ConvertTraitsDispatcher<2, 2, F> : DefaultTransformFunctorTraits<F>
-        {
-            enum { smart_shift = 4 };
-        };
-        template <typename F> struct ConvertTraitsDispatcher<2, 4, F> : DefaultTransformFunctorTraits<F>
-        {
-            enum { smart_shift = 2 };
-        };
-
-        template <typename F> struct ConvertTraitsDispatcher<4, 2, F> : DefaultTransformFunctorTraits<F>
-        {
-            enum { smart_block_dim_y = 8 };
-            enum { smart_shift = 4 };
-        };
-        template <typename F> struct ConvertTraitsDispatcher<4, 4, F> : DefaultTransformFunctorTraits<F>
-        {
-            enum { smart_block_dim_y = 8 };
-            enum { smart_shift = 2 };
-        };
-
-        template <typename F> struct ConvertTraits : ConvertTraitsDispatcher<sizeof(typename F::argument_type), sizeof(typename F::result_type), F>
-        {
-        };
-    }
-
-    template <typename T, typename D, typename S> struct TransformFunctorTraits< Convertor<T, D, S> > : detail::ConvertTraits< Convertor<T, D, S> >
-    {
-    };
-
-    template<typename T, typename D, typename S>
-    void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream)
-    {
-        Convertor<T, D, S> op(static_cast<S>(alpha), static_cast<S>(beta));
-        cv::gpu::cudev::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
-    }
-
-    void convert(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream)
-    {
-        typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream);
-
-        static const caller_t tab[7][7] =
-        {
-            {
-                cvt_<uchar, uchar, float>,
-                cvt_<uchar, schar, float>,
-                cvt_<uchar, ushort, float>,
-                cvt_<uchar, short, float>,
-                cvt_<uchar, int, float>,
-                cvt_<uchar, float, float>,
-                cvt_<uchar, double, double>
-            },
-            {
-                cvt_<schar, uchar, float>,
-                cvt_<schar, schar, float>,
-                cvt_<schar, ushort, float>,
-                cvt_<schar, short, float>,
-                cvt_<schar, int, float>,
-                cvt_<schar, float, float>,
-                cvt_<schar, double, double>
-            },
-            {
-                cvt_<ushort, uchar, float>,
-                cvt_<ushort, schar, float>,
-                cvt_<ushort, ushort, float>,
-                cvt_<ushort, short, float>,
-                cvt_<ushort, int, float>,
-                cvt_<ushort, float, float>,
-                cvt_<ushort, double, double>
-            },
-            {
-                cvt_<short, uchar, float>,
-                cvt_<short, schar, float>,
-                cvt_<short, ushort, float>,
-                cvt_<short, short, float>,
-                cvt_<short, int, float>,
-                cvt_<short, float, float>,
-                cvt_<short, double, double>
-            },
-            {
-                cvt_<int, uchar, float>,
-                cvt_<int, schar, float>,
-                cvt_<int, ushort, float>,
-                cvt_<int, short, float>,
-                cvt_<int, int, double>,
-                cvt_<int, float, double>,
-                cvt_<int, double, double>
-            },
-            {
-                cvt_<float, uchar, float>,
-                cvt_<float, schar, float>,
-                cvt_<float, ushort, float>,
-                cvt_<float, short, float>,
-                cvt_<float, int, float>,
-                cvt_<float, float, float>,
-                cvt_<float, double, double>
-            },
-            {
-                cvt_<double, uchar, double>,
-                cvt_<double, schar, double>,
-                cvt_<double, ushort, double>,
-                cvt_<double, short, double>,
-                cvt_<double, int, double>,
-                cvt_<double, float, double>,
-                cvt_<double, double, double>
-            }
-        };
-
-        const caller_t func = tab[sdepth][ddepth];
-        func(src, dst, alpha, beta, stream);
-    }
-}}} // namespace cv { namespace gpu { namespace cudev
diff --git a/modules/core/src/cuda/matrix_operations.hpp b/modules/core/src/cuda/matrix_operations.hpp
deleted file mode 100644
index 4e451061b8..0000000000
--- a/modules/core/src/cuda/matrix_operations.hpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                          License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
-// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#include "opencv2/core/cuda/common.hpp"
-
-namespace cv { namespace gpu { namespace cudev
-{
-    void copyWithMask(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool multiChannelMask, cudaStream_t stream);
-
-    template <typename T>
-    void set(PtrStepSz<T> mat, const T* scalar, int channels, cudaStream_t stream);
-
-    template <typename T>
-    void set(PtrStepSz<T> mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
-
-    void convert(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream);
-}}}
diff --git a/modules/core/src/gpu_mat.cpp b/modules/core/src/gpu_mat.cpp
index a2e8da65a8..33a6046fed 100644
--- a/modules/core/src/gpu_mat.cpp
+++ b/modules/core/src/gpu_mat.cpp
@@ -46,504 +46,6 @@
 using namespace cv;
 using namespace cv::gpu;
 
-/////////////////////////// matrix operations /////////////////////////
-
-#ifdef HAVE_CUDA
-
-// CUDA implementation
-
-#include "cuda/matrix_operations.hpp"
-
-namespace
-{
-    template <typename T> void cudaSet_(GpuMat& src, Scalar s, cudaStream_t stream)
-    {
-        Scalar_<T> sf = s;
-        cudev::set<T>(PtrStepSz<T>(src), sf.val, src.channels(), stream);
-    }
-
-    void cudaSet(GpuMat& src, Scalar s, cudaStream_t stream)
-    {
-        typedef void (*func_t)(GpuMat& src, Scalar s, cudaStream_t stream);
-        static const func_t funcs[] =
-        {
-            cudaSet_<uchar>,
-            cudaSet_<schar>,
-            cudaSet_<ushort>,
-            cudaSet_<short>,
-            cudaSet_<int>,
-            cudaSet_<float>,
-            cudaSet_<double>
-        };
-
-        funcs[src.depth()](src, s, stream);
-    }
-
-    template <typename T> void cudaSet_(GpuMat& src, Scalar s, PtrStepSzb mask, cudaStream_t stream)
-    {
-        Scalar_<T> sf = s;
-        cudev::set<T>(PtrStepSz<T>(src), sf.val, mask, src.channels(), stream);
-    }
-
-    void cudaSet(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
-    {
-        typedef void (*func_t)(GpuMat& src, Scalar s, PtrStepSzb mask, cudaStream_t stream);
-        static const func_t funcs[] =
-        {
-            cudaSet_<uchar>,
-            cudaSet_<schar>,
-            cudaSet_<ushort>,
-            cudaSet_<short>,
-            cudaSet_<int>,
-            cudaSet_<float>,
-            cudaSet_<double>
-        };
-
-        funcs[src.depth()](src, s, mask, stream);
-    }
-
-    void cudaCopyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
-    {
-        cudev::copyWithMask(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream);
-    }
-
-    void cudaConvert(const GpuMat& src, GpuMat& dst, cudaStream_t stream)
-    {
-        cudev::convert(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, stream);
-    }
-
-    void cudaConvert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream)
-    {
-        cudev::convert(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream);
-    }
-}
-
-// NPP implementation
-
-namespace
-{
-    //////////////////////////////////////////////////////////////////////////
-    // Convert
-
-    template<int SDEPTH, int DDEPTH> struct NppConvertFunc
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
-
-        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI);
-    };
-    template<int DDEPTH> struct NppConvertFunc<CV_32F, DDEPTH>
-    {
-        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
-
-        typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode);
-    };
-
-    template<int SDEPTH, int DDEPTH, typename NppConvertFunc<SDEPTH, DDEPTH>::func_ptr func> struct NppCvt
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
-
-        static void call(const GpuMat& src, GpuMat& dst, cudaStream_t stream)
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-    template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
-    {
-        typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
-
-        static void call(const GpuMat& src, GpuMat& dst, cudaStream_t stream)
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz, NPP_RND_NEAR) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-
-    //////////////////////////////////////////////////////////////////////////
-    // Set
-
-    template<int SDEPTH, int SCN> struct NppSetFunc
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-
-        typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
-    };
-    template<int SDEPTH> struct NppSetFunc<SDEPTH, 1>
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-
-        typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
-    };
-    template<int SCN> struct NppSetFunc<CV_8S, SCN>
-    {
-        typedef NppStatus (*func_ptr)(Npp8s values[], Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
-    };
-    template<> struct NppSetFunc<CV_8S, 1>
-    {
-        typedef NppStatus (*func_ptr)(Npp8s val, Npp8s* pSrc, int nSrcStep, NppiSize oSizeROI);
-    };
-
-    template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-
-        static void call(GpuMat& src, Scalar s, cudaStream_t stream)
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-
-            Scalar_<src_t> nppS = s;
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-    template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-
-        static void call(GpuMat& src, Scalar s, cudaStream_t stream)
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-
-            Scalar_<src_t> nppS = s;
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-
-    template<int SDEPTH, int SCN> struct NppSetMaskFunc
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-
-        typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
-    };
-    template<int SDEPTH> struct NppSetMaskFunc<SDEPTH, 1>
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-
-        typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
-    };
-
-    template<int SDEPTH, int SCN, typename NppSetMaskFunc<SDEPTH, SCN>::func_ptr func> struct NppSetMask
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-
-        static void call(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-
-            Scalar_<src_t> nppS = s;
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-    template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-
-        static void call(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-
-            Scalar_<src_t> nppS = s;
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-
-    //////////////////////////////////////////////////////////////////////////
-    // CopyMasked
-
-    template<int SDEPTH> struct NppCopyWithMaskFunc
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-
-        typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, src_t* pDst, int nDstStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
-    };
-
-    template<int SDEPTH, typename NppCopyWithMaskFunc<SDEPTH>::func_ptr func> struct NppCopyWithMask
-    {
-        typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
-
-        static void call(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream)
-        {
-            NppiSize sz;
-            sz.width = src.cols;
-            sz.height = src.rows;
-
-            NppStreamHandler h(stream);
-
-            nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<src_t>(), static_cast<int>(dst.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
-
-            if (stream == 0)
-                cudaSafeCall( cudaDeviceSynchronize() );
-        }
-    };
-}
-
-// Dispatcher
-
-namespace
-{
-    void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0)
-    {
-        CV_DbgAssert( src.size() == dst.size() && src.type() == dst.type() );
-
-        CV_Assert( src.depth() <= CV_64F && src.channels() <= 4 );
-        CV_Assert( src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()) );
-
-        if (src.depth() == CV_64F)
-        {
-            CV_Assert( deviceSupports(NATIVE_DOUBLE) );
-        }
-
-        typedef void (*func_t)(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
-        static const func_t funcs[7][4] =
-        {
-            /*  8U */ {NppCopyWithMask<CV_8U , nppiCopy_8u_C1MR >::call, cudaCopyWithMask, NppCopyWithMask<CV_8U , nppiCopy_8u_C3MR >::call, NppCopyWithMask<CV_8U , nppiCopy_8u_C4MR >::call},
-            /*  8S */ {cudaCopyWithMask                                , cudaCopyWithMask, cudaCopyWithMask                                , cudaCopyWithMask                                },
-            /* 16U */ {NppCopyWithMask<CV_16U, nppiCopy_16u_C1MR>::call, cudaCopyWithMask, NppCopyWithMask<CV_16U, nppiCopy_16u_C3MR>::call, NppCopyWithMask<CV_16U, nppiCopy_16u_C4MR>::call},
-            /* 16S */ {NppCopyWithMask<CV_16S, nppiCopy_16s_C1MR>::call, cudaCopyWithMask, NppCopyWithMask<CV_16S, nppiCopy_16s_C3MR>::call, NppCopyWithMask<CV_16S, nppiCopy_16s_C4MR>::call},
-            /* 32S */ {NppCopyWithMask<CV_32S, nppiCopy_32s_C1MR>::call, cudaCopyWithMask, NppCopyWithMask<CV_32S, nppiCopy_32s_C3MR>::call, NppCopyWithMask<CV_32S, nppiCopy_32s_C4MR>::call},
-            /* 32F */ {NppCopyWithMask<CV_32F, nppiCopy_32f_C1MR>::call, cudaCopyWithMask, NppCopyWithMask<CV_32F, nppiCopy_32f_C3MR>::call, NppCopyWithMask<CV_32F, nppiCopy_32f_C4MR>::call},
-            /* 64F */ {cudaCopyWithMask                                , cudaCopyWithMask, cudaCopyWithMask                                , cudaCopyWithMask                                }
-        };
-
-        const func_t func = mask.channels() == src.channels() ? funcs[src.depth()][src.channels() - 1] : cudaCopyWithMask;
-
-        func(src, dst, mask, stream);
-    }
-
-    void convert(const GpuMat& src, GpuMat& dst, cudaStream_t stream = 0)
-    {
-        CV_DbgAssert( src.size() == dst.size() && src.channels() == dst.channels() );
-
-        CV_Assert( src.depth() <= CV_64F && src.channels() <= 4 );
-        CV_Assert( dst.depth() <= CV_64F );
-
-        if (src.depth() == CV_64F || dst.depth() == CV_64F)
-        {
-            CV_Assert( deviceSupports(NATIVE_DOUBLE) );
-        }
-
-        typedef void (*func_t)(const GpuMat& src, GpuMat& dst, cudaStream_t stream);
-        static const func_t funcs[7][7][4] =
-        {
-            {
-                /*  8U ->  8U */ {0, 0, 0, 0},
-                /*  8U ->  8S */ {cudaConvert                                       , cudaConvert, cudaConvert, cudaConvert                                       },
-                /*  8U -> 16U */ {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::call, cudaConvert, cudaConvert, NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::call},
-                /*  8U -> 16S */ {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::call, cudaConvert, cudaConvert, NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::call},
-                /*  8U -> 32S */ {cudaConvert                                       , cudaConvert, cudaConvert, cudaConvert                                       },
-                /*  8U -> 32F */ {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::call, cudaConvert, cudaConvert, cudaConvert                                       },
-                /*  8U -> 64F */ {cudaConvert                                       , cudaConvert, cudaConvert, cudaConvert                                       }
-            },
-            {
-                /*  8S ->  8U */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert},
-                /*  8S ->  8S */ {0,0,0,0},
-                /*  8S -> 16U */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert},
-                /*  8S -> 16S */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert},
-                /*  8S -> 32S */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert},
-                /*  8S -> 32F */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert},
-                /*  8S -> 64F */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}
-            },
-            {
-                /* 16U ->  8U */ {NppCvt<CV_16U, CV_8U , nppiConvert_16u8u_C1R >::call, cudaConvert, cudaConvert, NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C4R>::call},
-                /* 16U ->  8S */ {cudaConvert                                         , cudaConvert, cudaConvert, cudaConvert                                       },
-                /* 16U -> 16U */ {0,0,0,0},
-                /* 16U -> 16S */ {cudaConvert                                         , cudaConvert, cudaConvert, cudaConvert                                       },
-                /* 16U -> 32S */ {NppCvt<CV_16U, CV_32S, nppiConvert_16u32s_C1R>::call, cudaConvert, cudaConvert, cudaConvert                                       },
-                /* 16U -> 32F */ {NppCvt<CV_16U, CV_32F, nppiConvert_16u32f_C1R>::call, cudaConvert, cudaConvert, cudaConvert                                       },
-                /* 16U -> 64F */ {cudaConvert                                         , cudaConvert, cudaConvert, cudaConvert                                       }
-            },
-            {
-                /* 16S ->  8U */ {NppCvt<CV_16S, CV_8U , nppiConvert_16s8u_C1R >::call, cudaConvert, cudaConvert, NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C4R>::call},
-                /* 16S ->  8S */ {cudaConvert                                         , cudaConvert, cudaConvert, cudaConvert                                       },
-                /* 16S -> 16U */ {cudaConvert                                         , cudaConvert, cudaConvert, cudaConvert                                       },
-                /* 16S -> 16S */ {0,0,0,0},
-                /* 16S -> 32S */ {NppCvt<CV_16S, CV_32S, nppiConvert_16s32s_C1R>::call, cudaConvert, cudaConvert, cudaConvert                                       },
-                /* 16S -> 32F */ {NppCvt<CV_16S, CV_32F, nppiConvert_16s32f_C1R>::call, cudaConvert, cudaConvert, cudaConvert                                       },
-                /* 16S -> 64F */ {cudaConvert                                         , cudaConvert, cudaConvert, cudaConvert                                       }
-            },
-            {
-                /* 32S ->  8U */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert},
-                /* 32S ->  8S */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert},
-                /* 32S -> 16U */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert},
-                /* 32S -> 16S */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert},
-                /* 32S -> 32S */ {0,0,0,0},
-                /* 32S -> 32F */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert},
-                /* 32S -> 64F */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert}
-            },
-            {
-                /* 32F ->  8U */ {NppCvt<CV_32F, CV_8U , nppiConvert_32f8u_C1R >::call, cudaConvert, cudaConvert, cudaConvert},
-                /* 32F ->  8S */ {cudaConvert                                         , cudaConvert, cudaConvert, cudaConvert},
-                /* 32F -> 16U */ {NppCvt<CV_32F, CV_16U, nppiConvert_32f16u_C1R>::call, cudaConvert, cudaConvert, cudaConvert},
-                /* 32F -> 16S */ {NppCvt<CV_32F, CV_16S, nppiConvert_32f16s_C1R>::call, cudaConvert, cudaConvert, cudaConvert},
-                /* 32F -> 32S */ {cudaConvert                                         , cudaConvert, cudaConvert, cudaConvert},
-                /* 32F -> 32F */ {0,0,0,0},
-                /* 32F -> 64F */ {cudaConvert                                         , cudaConvert, cudaConvert, cudaConvert}
-            },
-            {
-                /* 64F ->  8U */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert},
-                /* 64F ->  8S */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert},
-                /* 64F -> 16U */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert},
-                /* 64F -> 16S */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert},
-                /* 64F -> 32S */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert},
-                /* 64F -> 32F */ {cudaConvert, cudaConvert, cudaConvert, cudaConvert},
-                /* 64F -> 64F */ {0,0,0,0}
-            }
-        };
-
-        const bool aligned = isAligned(src.data, 16) && isAligned(dst.data, 16);
-        if (!aligned)
-        {
-            cudaConvert(src, dst, stream);
-            return;
-        }
-
-        const func_t func = funcs[src.depth()][dst.depth()][src.channels() - 1];
-        CV_DbgAssert( func != 0 );
-
-        func(src, dst, stream);
-    }
-
-    void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0)
-    {
-        CV_DbgAssert( src.size() == dst.size() && src.channels() == dst.channels() );
-
-        CV_Assert( src.depth() <= CV_64F && src.channels() <= 4 );
-        CV_Assert( dst.depth() <= CV_64F );
-
-        if (src.depth() == CV_64F || dst.depth() == CV_64F)
-        {
-            CV_Assert( deviceSupports(NATIVE_DOUBLE) );
-        }
-
-        cudaConvert(src, dst, alpha, beta, stream);
-    }
-
-    void set(GpuMat& m, Scalar s, cudaStream_t stream = 0)
-    {
-        if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
-        {
-            if (stream)
-                cudaSafeCall( cudaMemset2DAsync(m.data, m.step, 0, m.cols * m.elemSize(), m.rows, stream) );
-            else
-                cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) );
-            return;
-        }
-
-        if (m.depth() == CV_8U)
-        {
-            int cn = m.channels();
-
-            if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
-            {
-                int val = saturate_cast<uchar>(s[0]);
-                if (stream)
-                    cudaSafeCall( cudaMemset2DAsync(m.data, m.step, val, m.cols * m.elemSize(), m.rows, stream) );
-                else
-                    cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) );
-                return;
-            }
-        }
-
-        typedef void (*func_t)(GpuMat& src, Scalar s, cudaStream_t stream);
-        static const func_t funcs[7][4] =
-        {
-            {NppSet<CV_8U , 1, nppiSet_8u_C1R >::call, cudaSet                                 , cudaSet                               , NppSet<CV_8U , 4, nppiSet_8u_C4R >::call},
-            {NppSet<CV_8S , 1, nppiSet_8s_C1R >::call, NppSet<CV_8S , 2, nppiSet_8s_C2R >::call, NppSet<CV_8S, 3, nppiSet_8s_C3R>::call, NppSet<CV_8S , 4, nppiSet_8s_C4R >::call},
-            {NppSet<CV_16U, 1, nppiSet_16u_C1R>::call, NppSet<CV_16U, 2, nppiSet_16u_C2R>::call, cudaSet                               , NppSet<CV_16U, 4, nppiSet_16u_C4R>::call},
-            {NppSet<CV_16S, 1, nppiSet_16s_C1R>::call, NppSet<CV_16S, 2, nppiSet_16s_C2R>::call, cudaSet                               , NppSet<CV_16S, 4, nppiSet_16s_C4R>::call},
-            {NppSet<CV_32S, 1, nppiSet_32s_C1R>::call, cudaSet                                 , cudaSet                               , NppSet<CV_32S, 4, nppiSet_32s_C4R>::call},
-            {NppSet<CV_32F, 1, nppiSet_32f_C1R>::call, cudaSet                                 , cudaSet                               , NppSet<CV_32F, 4, nppiSet_32f_C4R>::call},
-            {cudaSet                                 , cudaSet                                 , cudaSet                               , cudaSet                                 }
-        };
-
-        CV_Assert( m.depth() <= CV_64F && m.channels() <= 4 );
-
-        if (m.depth() == CV_64F)
-        {
-            CV_Assert( deviceSupports(NATIVE_DOUBLE) );
-        }
-
-        funcs[m.depth()][m.channels() - 1](m, s, stream);
-    }
-
-    void set(GpuMat& m, Scalar s, const GpuMat& mask, cudaStream_t stream = 0)
-    {
-        CV_DbgAssert( !mask.empty() );
-
-        CV_Assert( m.depth() <= CV_64F && m.channels() <= 4 );
-
-        if (m.depth() == CV_64F)
-        {
-            CV_Assert( deviceSupports(NATIVE_DOUBLE) );
-        }
-
-        typedef void (*func_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);
-        static const func_t funcs[7][4] =
-        {
-            {NppSetMask<CV_8U , 1, nppiSet_8u_C1MR >::call, cudaSet, cudaSet, NppSetMask<CV_8U , 4, nppiSet_8u_C4MR >::call},
-            {cudaSet                                      , cudaSet, cudaSet, cudaSet                                      },
-            {NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::call, cudaSet, cudaSet, NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::call},
-            {NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::call, cudaSet, cudaSet, NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::call},
-            {NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::call, cudaSet, cudaSet, NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::call},
-            {NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::call, cudaSet, cudaSet, NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::call},
-            {cudaSet                                      , cudaSet, cudaSet, cudaSet                                      }
-        };
-
-        funcs[m.depth()][m.channels() - 1](m, s, mask, stream);
-    }
-}
-
-#endif // HAVE_CUDA
-
 cv::gpu::GpuMat::GpuMat(int rows_, int cols_, int type_, void* data_, size_t step_) :
     flags(Mat::MAGIC_VAL + (type_ & Mat::TYPE_MASK)), rows(rows_), cols(cols_),
     step(step_), data((uchar*)data_), refcount(0),
@@ -651,288 +153,6 @@ cv::gpu::GpuMat::GpuMat(const GpuMat& m, Rect roi) :
         rows = cols = 0;
 }
 
-void cv::gpu::GpuMat::create(int _rows, int _cols, int _type)
-{
-#ifndef HAVE_CUDA
-    (void) _rows;
-    (void) _cols;
-    (void) _type;
-    throw_no_cuda();
-#else
-    _type &= Mat::TYPE_MASK;
-
-    if (rows == _rows && cols == _cols && type() == _type && data)
-        return;
-
-    if (data)
-        release();
-
-    CV_DbgAssert( _rows >= 0 && _cols >= 0 );
-
-    if (_rows > 0 && _cols > 0)
-    {
-        flags = Mat::MAGIC_VAL + _type;
-        rows = _rows;
-        cols = _cols;
-
-        size_t esz = elemSize();
-
-        void* devPtr;
-
-        if (rows > 1 && cols > 1)
-        {
-            cudaSafeCall( cudaMallocPitch(&devPtr, &step, esz * cols, rows) );
-        }
-        else
-        {
-            // Single row or single column must be continuous
-            cudaSafeCall( cudaMalloc(&devPtr, esz * cols * rows) );
-            step = esz * cols;
-        }
-
-        if (esz * cols == step)
-            flags |= Mat::CONTINUOUS_FLAG;
-
-        int64 _nettosize = static_cast<int64>(step) * rows;
-        size_t nettosize = static_cast<size_t>(_nettosize);
-
-        datastart = data = static_cast<uchar*>(devPtr);
-        dataend = data + nettosize;
-
-        refcount = static_cast<int*>(fastMalloc(sizeof(*refcount)));
-        *refcount = 1;
-    }
-#endif
-}
-
-void cv::gpu::GpuMat::release()
-{
-#ifdef HAVE_CUDA
-    if (refcount && CV_XADD(refcount, -1) == 1)
-    {
-        cudaFree(datastart);
-        fastFree(refcount);
-    }
-
-    data = datastart = dataend = 0;
-    step = rows = cols = 0;
-    refcount = 0;
-#endif
-}
-
-void cv::gpu::GpuMat::upload(InputArray arr)
-{
-#ifndef HAVE_CUDA
-    (void) arr;
-    throw_no_cuda();
-#else
-    Mat mat = arr.getMat();
-
-    CV_DbgAssert( !mat.empty() );
-
-    create(mat.size(), mat.type());
-
-    cudaSafeCall( cudaMemcpy2D(data, step, mat.data, mat.step, cols * elemSize(), rows, cudaMemcpyHostToDevice) );
-#endif
-}
-
-void cv::gpu::GpuMat::upload(InputArray arr, Stream& _stream)
-{
-#ifndef HAVE_CUDA
-    (void) arr;
-    (void) _stream;
-    throw_no_cuda();
-#else
-    Mat mat = arr.getMat();
-
-    CV_DbgAssert( !mat.empty() );
-
-    create(mat.size(), mat.type());
-
-    cudaStream_t stream = StreamAccessor::getStream(_stream);
-    cudaSafeCall( cudaMemcpy2DAsync(data, step, mat.data, mat.step, cols * elemSize(), rows, cudaMemcpyHostToDevice, stream) );
-#endif
-}
-
-void cv::gpu::GpuMat::download(OutputArray _dst) const
-{
-#ifndef HAVE_CUDA
-    (void) _dst;
-    throw_no_cuda();
-#else
-    CV_DbgAssert( !empty() );
-
-    _dst.create(size(), type());
-    Mat dst = _dst.getMat();
-
-    cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToHost) );
-#endif
-}
-
-void cv::gpu::GpuMat::download(OutputArray _dst, Stream& _stream) const
-{
-#ifndef HAVE_CUDA
-    (void) _dst;
-    (void) _stream;
-    throw_no_cuda();
-#else
-    CV_DbgAssert( !empty() );
-
-    _dst.create(size(), type());
-    Mat dst = _dst.getMat();
-
-    cudaStream_t stream = StreamAccessor::getStream(_stream);
-    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToHost, stream) );
-#endif
-}
-
-void cv::gpu::GpuMat::copyTo(OutputArray _dst) const
-{
-#ifndef HAVE_CUDA
-    (void) _dst;
-    throw_no_cuda();
-#else
-    CV_DbgAssert( !empty() );
-
-    _dst.create(size(), type());
-    GpuMat dst = _dst.getGpuMat();
-
-    cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToDevice) );
-#endif
-}
-
-void cv::gpu::GpuMat::copyTo(OutputArray _dst, Stream& _stream) const
-{
-#ifndef HAVE_CUDA
-    (void) _dst;
-    (void) _stream;
-    throw_no_cuda();
-#else
-    CV_DbgAssert( !empty() );
-
-    _dst.create(size(), type());
-    GpuMat dst = _dst.getGpuMat();
-
-    cudaStream_t stream = StreamAccessor::getStream(_stream);
-    cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, data, step, cols * elemSize(), rows, cudaMemcpyDeviceToDevice, stream) );
-#endif
-}
-
-void cv::gpu::GpuMat::copyTo(OutputArray _dst, InputArray _mask, Stream& _stream) const
-{
-#ifndef HAVE_CUDA
-    (void) _dst;
-    (void) _mask;
-    (void) _stream;
-    throw_no_cuda();
-#else
-    CV_DbgAssert( !empty() );
-
-    _dst.create(size(), type());
-    GpuMat dst = _dst.getGpuMat();
-
-    GpuMat mask = _mask.getGpuMat();
-
-    cudaStream_t stream = StreamAccessor::getStream(_stream);
-    ::copyWithMask(*this, dst, mask, stream);
-#endif
-}
-
-GpuMat& cv::gpu::GpuMat::setTo(Scalar s, Stream& _stream)
-{
-#ifndef HAVE_CUDA
-    (void) s;
-    (void) _stream;
-    throw_no_cuda();
-#else
-    CV_DbgAssert( !empty() );
-
-    cudaStream_t stream = StreamAccessor::getStream(_stream);
-    ::set(*this, s, stream);
-#endif
-
-    return *this;
-}
-
-GpuMat& cv::gpu::GpuMat::setTo(Scalar s, InputArray _mask, Stream& _stream)
-{
-#ifndef HAVE_CUDA
-    (void) s;
-    (void) _mask;
-    (void) _stream;
-    throw_no_cuda();
-#else
-    CV_DbgAssert( !empty() );
-
-    GpuMat mask = _mask.getGpuMat();
-
-    cudaStream_t stream = StreamAccessor::getStream(_stream);
-    ::set(*this, s, mask, stream);
-#endif
-
-    return *this;
-}
-
-void cv::gpu::GpuMat::convertTo(OutputArray _dst, int rtype, Stream& _stream) const
-{
-#ifndef HAVE_CUDA
-    (void) _dst;
-    (void) rtype;
-    (void) _stream;
-    throw_no_cuda();
-#else
-    if (rtype < 0)
-        rtype = type();
-    else
-        rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());
-
-    const int sdepth = depth();
-    const int ddepth = CV_MAT_DEPTH(rtype);
-    if (sdepth == ddepth)
-    {
-        if (_stream)
-            copyTo(_dst, _stream);
-        else
-            copyTo(_dst);
-
-        return;
-    }
-
-    GpuMat src = *this;
-
-    _dst.create(size(), rtype);
-    GpuMat dst = _dst.getGpuMat();
-
-    cudaStream_t stream = StreamAccessor::getStream(_stream);
-    ::convert(src, dst, stream);
-#endif
-}
-
-void cv::gpu::GpuMat::convertTo(OutputArray _dst, int rtype, double alpha, double beta, Stream& _stream) const
-{
-#ifndef HAVE_CUDA
-    (void) _dst;
-    (void) rtype;
-    (void) alpha;
-    (void) beta;
-    (void) _stream;
-    throw_no_cuda();
-#else
-    if (rtype < 0)
-        rtype = type();
-    else
-        rtype = CV_MAKETYPE(CV_MAT_DEPTH(rtype), channels());
-
-    GpuMat src = *this;
-
-    _dst.create(size(), rtype);
-    GpuMat dst = _dst.getGpuMat();
-
-    cudaStream_t stream = StreamAccessor::getStream(_stream);
-    ::convert(src, dst, alpha, beta, stream);
-#endif
-}
-
 GpuMat cv::gpu::GpuMat::reshape(int new_cn, int new_rows) const
 {
     GpuMat hdr = *this;
@@ -1124,3 +344,101 @@ GpuMat cv::gpu::allocMatFromBuf(int rows, int cols, int type, GpuMat& mat)
 
     return mat = GpuMat(rows, cols, type);
 }
+
+#ifndef HAVE_CUDA
+
+void cv::gpu::GpuMat::create(int _rows, int _cols, int _type)
+{   // Variant compiled when HAVE_CUDA is not defined: no allocation is attempted.
+    (void) _rows;   // arguments are cast to void so each one is referenced
+    (void) _cols;
+    (void) _type;
+    throw_no_cuda();   // defined outside this hunk; presumably reports that CUDA support is missing
+}
+
+void cv::gpu::GpuMat::release()
+{   // Variant compiled when HAVE_CUDA is not defined: empty body, and unlike the neighbouring stubs it does not call throw_no_cuda().
+}
+
+void cv::gpu::GpuMat::upload(InputArray arr)
+{   // Variant compiled when HAVE_CUDA is not defined: nothing is copied.
+    (void) arr;   // cast to void so the argument is referenced
+    throw_no_cuda();   // defined outside this hunk; presumably reports that CUDA support is missing
+}
+
+void cv::gpu::GpuMat::upload(InputArray arr, Stream& _stream)
+{   // Stream-taking variant compiled when HAVE_CUDA is not defined: nothing is copied.
+    (void) arr;       // arguments are cast to void so each one is referenced
+    (void) _stream;
+    throw_no_cuda();  // defined outside this hunk; presumably reports that CUDA support is missing
+}
+
+void cv::gpu::GpuMat::download(OutputArray _dst) const
+{   // Variant compiled when HAVE_CUDA is not defined: _dst is left untouched.
+    (void) _dst;   // cast to void so the argument is referenced
+    throw_no_cuda();   // defined outside this hunk; presumably reports that CUDA support is missing
+}
+
+void cv::gpu::GpuMat::download(OutputArray _dst, Stream& _stream) const
+{   // Stream-taking variant compiled when HAVE_CUDA is not defined: _dst is left untouched.
+    (void) _dst;      // arguments are cast to void so each one is referenced
+    (void) _stream;
+    throw_no_cuda();  // defined outside this hunk; presumably reports that CUDA support is missing
+}
+
+void cv::gpu::GpuMat::copyTo(OutputArray _dst) const
+{   // Variant compiled when HAVE_CUDA is not defined: _dst is left untouched.
+    (void) _dst;   // cast to void so the argument is referenced
+    throw_no_cuda();   // defined outside this hunk; presumably reports that CUDA support is missing
+}
+
+void cv::gpu::GpuMat::copyTo(OutputArray _dst, Stream& _stream) const
+{   // Stream-taking variant compiled when HAVE_CUDA is not defined: _dst is left untouched.
+    (void) _dst;      // arguments are cast to void so each one is referenced
+    (void) _stream;
+    throw_no_cuda();  // defined outside this hunk; presumably reports that CUDA support is missing
+}
+
+void cv::gpu::GpuMat::copyTo(OutputArray _dst, InputArray _mask, Stream& _stream) const
+{   // Masked variant compiled when HAVE_CUDA is not defined: _dst is left untouched.
+    (void) _dst;      // arguments are cast to void so each one is referenced
+    (void) _mask;
+    (void) _stream;
+    throw_no_cuda();  // defined outside this hunk; presumably reports that CUDA support is missing
+}
+
+GpuMat& cv::gpu::GpuMat::setTo(Scalar s, Stream& _stream)
+{   // Variant compiled when HAVE_CUDA is not defined: nothing is written to the matrix.
+    (void) s;         // arguments are cast to void so each one is referenced
+    (void) _stream;
+    throw_no_cuda();  // defined outside this hunk; presumably reports that CUDA support is missing
+    return *this;     // reached only if throw_no_cuda() returns; gives the non-void function a return value
+}
+
+GpuMat& cv::gpu::GpuMat::setTo(Scalar s, InputArray _mask, Stream& _stream)
+{   // Masked variant compiled when HAVE_CUDA is not defined: nothing is written to the matrix.
+    (void) s;         // arguments are cast to void so each one is referenced
+    (void) _mask;
+    (void) _stream;
+    throw_no_cuda();  // defined outside this hunk; presumably reports that CUDA support is missing
+    return *this;     // reached only if throw_no_cuda() returns; gives the non-void function a return value
+}
+
+void cv::gpu::GpuMat::convertTo(OutputArray _dst, int rtype, Stream& _stream) const
+{   // Variant compiled when HAVE_CUDA is not defined: no conversion is performed.
+    (void) _dst;      // arguments are cast to void so each one is referenced
+    (void) rtype;
+    (void) _stream;
+    throw_no_cuda();  // defined outside this hunk; presumably reports that CUDA support is missing
+}
+
+void cv::gpu::GpuMat::convertTo(OutputArray _dst, int rtype, double alpha, double beta, Stream& _stream) const
+{   // alpha/beta variant compiled when HAVE_CUDA is not defined: no conversion is performed.
+    (void) _dst;      // arguments are cast to void so each one is referenced
+    (void) rtype;
+    (void) alpha;
+    (void) beta;
+    (void) _stream;
+    throw_no_cuda();  // defined outside this hunk; presumably reports that CUDA support is missing
+}
+
+#endif
diff --git a/modules/cudev/include/opencv2/cudev/grid/copy.hpp b/modules/cudev/include/opencv2/cudev/grid/copy.hpp
index cfbe456333..d7d3ea8343 100644
--- a/modules/cudev/include/opencv2/cudev/grid/copy.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/copy.hpp
@@ -50,6 +50,7 @@
 #include "../util/tuple.hpp"
 #include "../ptr2d/traits.hpp"
 #include "../ptr2d/gpumat.hpp"
+#include "../ptr2d/glob.hpp"
 #include "../ptr2d/mask.hpp"
 #include "../ptr2d/zip.hpp"
 #include "detail/copy.hpp"
@@ -69,6 +70,18 @@ __host__ void gridCopy_(const SrcPtr& src, GpuMat_<DstType>& dst, const MaskPtr&
     grid_copy_detail::copy<Policy>(shrinkPtr(src), shrinkPtr(dst), shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
 }
 
+template <class Policy, class SrcPtr, typename DstType, class MaskPtr>
+__host__ void gridCopy_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
+{   // Masked copy of src into a GlobPtrSz destination (taken by const reference) using the caller-chosen Policy.
+    const int rows = getRows(src);   // the source supplies the extent used for the checks and the copy
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(dst) == rows && getCols(dst) == cols );   // destination size must equal the source size
+    CV_Assert( getRows(mask) == rows && getCols(mask) == cols ); // mask size must equal the source size
+
+    grid_copy_detail::copy<Policy>(shrinkPtr(src), shrinkPtr(dst), shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
+}
+
 template <class Policy, class SrcPtr, typename DstType>
 __host__ void gridCopy_(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
 {
@@ -80,6 +93,17 @@ __host__ void gridCopy_(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream
     grid_copy_detail::copy<Policy>(shrinkPtr(src), shrinkPtr(dst), WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
 }
 
+template <class Policy, class SrcPtr, typename DstType>
+__host__ void gridCopy_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, Stream& stream = Stream::Null())
+{   // Unmasked copy of src into a GlobPtrSz destination; WithOutMask() is passed where the masked overload passes its mask.
+    const int rows = getRows(src);   // the source supplies the extent used for the check and the copy
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(dst) == rows && getCols(dst) == cols );   // destination size must equal the source size
+
+    grid_copy_detail::copy<Policy>(shrinkPtr(src), shrinkPtr(dst), WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
+}
+
 template <class Policy, class SrcPtrTuple, typename D0, typename D1, class MaskPtr>
 __host__ void gridCopy_(const SrcPtrTuple& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
@@ -100,6 +124,25 @@ __host__ void gridCopy_(const SrcPtrTuple& src, const tuple< GpuMat_<D0>&, GpuMa
                                          StreamAccessor::getStream(stream));
 }
 
+template <class Policy, class SrcPtrTuple, typename D0, typename D1, class MaskPtr>
+__host__ void gridCopy_(const SrcPtrTuple& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
+{   // Masked copy from a two-element source tuple into two GlobPtrSz destinations.
+    CV_StaticAssert( tuple_size<SrcPtrTuple>::value == 2, "" );   // source tuple must have exactly two elements
+
+    const int rows = getRows(src);   // the source supplies the extent used for the checks and the copy
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(get<0>(dst)) == rows && getCols(get<0>(dst)) == cols );   // every destination must equal the source size
+    CV_Assert( getRows(get<1>(dst)) == rows && getCols(get<1>(dst)) == cols );
+    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );                 // mask size must equal the source size
+
+    grid_copy_detail::copy_tuple<Policy>(shrinkPtr(src),
+                                         shrinkPtr(zipPtr(get<0>(dst), get<1>(dst))),   // destinations are combined via zipPtr into one argument
+                                         shrinkPtr(mask),
+                                         rows, cols,
+                                         StreamAccessor::getStream(stream));
+}
+
 template <class Policy, class SrcPtrTuple, typename D0, typename D1>
 __host__ void gridCopy_(const SrcPtrTuple& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, Stream& stream = Stream::Null())
 {
@@ -118,6 +161,24 @@ __host__ void gridCopy_(const SrcPtrTuple& src, const tuple< GpuMat_<D0>&, GpuMa
                                          StreamAccessor::getStream(stream));
 }
 
+template <class Policy, class SrcPtrTuple, typename D0, typename D1>
+__host__ void gridCopy_(const SrcPtrTuple& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, Stream& stream = Stream::Null())
+{   // Unmasked copy from a two-element source tuple into two GlobPtrSz destinations.
+    CV_StaticAssert( tuple_size<SrcPtrTuple>::value == 2, "" );   // source tuple must have exactly two elements
+
+    const int rows = getRows(src);   // the source supplies the extent used for the checks and the copy
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(get<0>(dst)) == rows && getCols(get<0>(dst)) == cols );   // every destination must equal the source size
+    CV_Assert( getRows(get<1>(dst)) == rows && getCols(get<1>(dst)) == cols );
+
+    grid_copy_detail::copy_tuple<Policy>(shrinkPtr(src),
+                                         shrinkPtr(zipPtr(get<0>(dst), get<1>(dst))),   // destinations are combined via zipPtr into one argument
+                                         WithOutMask(),                                 // passed in the mask position
+                                         rows, cols,
+                                         StreamAccessor::getStream(stream));
+}
+
 template <class Policy, class SrcPtrTuple, typename D0, typename D1, typename D2, class MaskPtr>
 __host__ void gridCopy_(const SrcPtrTuple& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
@@ -139,6 +200,26 @@ __host__ void gridCopy_(const SrcPtrTuple& src, const tuple< GpuMat_<D0>&, GpuMa
                                          StreamAccessor::getStream(stream));
 }
 
+template <class Policy, class SrcPtrTuple, typename D0, typename D1, typename D2, class MaskPtr>
+__host__ void gridCopy_(const SrcPtrTuple& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
+{   // Masked copy from a three-element source tuple into three GlobPtrSz destinations.
+    CV_StaticAssert( tuple_size<SrcPtrTuple>::value == 3, "" );   // source tuple must have exactly three elements
+
+    const int rows = getRows(src);   // the source supplies the extent used for the checks and the copy
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(get<0>(dst)) == rows && getCols(get<0>(dst)) == cols );   // every destination must equal the source size
+    CV_Assert( getRows(get<1>(dst)) == rows && getCols(get<1>(dst)) == cols );
+    CV_Assert( getRows(get<2>(dst)) == rows && getCols(get<2>(dst)) == cols );
+    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );                 // mask size must equal the source size
+
+    grid_copy_detail::copy_tuple<Policy>(shrinkPtr(src),
+                                         shrinkPtr(zipPtr(get<0>(dst), get<1>(dst), get<2>(dst))),   // destinations are combined via zipPtr into one argument
+                                         shrinkPtr(mask),
+                                         rows, cols,
+                                         StreamAccessor::getStream(stream));
+}
+
 template <class Policy, class SrcPtrTuple, typename D0, typename D1, typename D2>
 __host__ void gridCopy_(const SrcPtrTuple& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, Stream& stream = Stream::Null())
 {
@@ -158,6 +239,25 @@ __host__ void gridCopy_(const SrcPtrTuple& src, const tuple< GpuMat_<D0>&, GpuMa
                                          StreamAccessor::getStream(stream));
 }
 
+template <class Policy, class SrcPtrTuple, typename D0, typename D1, typename D2>
+__host__ void gridCopy_(const SrcPtrTuple& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, Stream& stream = Stream::Null())
+{   // Unmasked copy from a three-element source tuple into three GlobPtrSz destinations.
+    CV_StaticAssert( tuple_size<SrcPtrTuple>::value == 3, "" );   // source tuple must have exactly three elements
+
+    const int rows = getRows(src);   // the source supplies the extent used for the checks and the copy
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(get<0>(dst)) == rows && getCols(get<0>(dst)) == cols );   // every destination must equal the source size
+    CV_Assert( getRows(get<1>(dst)) == rows && getCols(get<1>(dst)) == cols );
+    CV_Assert( getRows(get<2>(dst)) == rows && getCols(get<2>(dst)) == cols );
+
+    grid_copy_detail::copy_tuple<Policy>(shrinkPtr(src),
+                                         shrinkPtr(zipPtr(get<0>(dst), get<1>(dst), get<2>(dst))),   // destinations are combined via zipPtr into one argument
+                                         WithOutMask(),                                              // passed in the mask position
+                                         rows, cols,
+                                         StreamAccessor::getStream(stream));
+}
+
 template <class Policy, class SrcPtrTuple, typename D0, typename D1, typename D2, typename D3, class MaskPtr>
 __host__ void gridCopy_(const SrcPtrTuple& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
@@ -180,6 +280,27 @@ __host__ void gridCopy_(const SrcPtrTuple& src, const tuple< GpuMat_<D0>&, GpuMa
                                          StreamAccessor::getStream(stream));
 }
 
+template <class Policy, class SrcPtrTuple, typename D0, typename D1, typename D2, typename D3, class MaskPtr>
+__host__ void gridCopy_(const SrcPtrTuple& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
+{   // Masked copy from a four-element source tuple into four GlobPtrSz destinations.
+    CV_StaticAssert( tuple_size<SrcPtrTuple>::value == 4, "" );   // source tuple must have exactly four elements
+
+    const int rows = getRows(src);   // the source supplies the extent used for the checks and the copy
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(get<0>(dst)) == rows && getCols(get<0>(dst)) == cols );   // every destination must equal the source size
+    CV_Assert( getRows(get<1>(dst)) == rows && getCols(get<1>(dst)) == cols );
+    CV_Assert( getRows(get<2>(dst)) == rows && getCols(get<2>(dst)) == cols );
+    CV_Assert( getRows(get<3>(dst)) == rows && getCols(get<3>(dst)) == cols );
+    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );                 // mask size must equal the source size
+
+    grid_copy_detail::copy_tuple<Policy>(shrinkPtr(src),
+                                         shrinkPtr(zipPtr(get<0>(dst), get<1>(dst), get<2>(dst), get<3>(dst))),   // destinations are combined via zipPtr into one argument
+                                         shrinkPtr(mask),
+                                         rows, cols,
+                                         StreamAccessor::getStream(stream));
+}
+
 template <class Policy, class SrcPtrTuple, typename D0, typename D1, typename D2, typename D3>
 __host__ void gridCopy_(const SrcPtrTuple& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, Stream& stream = Stream::Null())
 {
@@ -200,6 +321,26 @@ __host__ void gridCopy_(const SrcPtrTuple& src, const tuple< GpuMat_<D0>&, GpuMa
                                          StreamAccessor::getStream(stream));
 }
 
+template <class Policy, class SrcPtrTuple, typename D0, typename D1, typename D2, typename D3>
+__host__ void gridCopy_(const SrcPtrTuple& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, Stream& stream = Stream::Null())
+{   // Unmasked copy from a four-element source tuple into four GlobPtrSz destinations.
+    CV_StaticAssert( tuple_size<SrcPtrTuple>::value == 4, "" );   // source tuple must have exactly four elements
+
+    const int rows = getRows(src);   // the source supplies the extent used for the checks and the copy
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(get<0>(dst)) == rows && getCols(get<0>(dst)) == cols );   // every destination must equal the source size
+    CV_Assert( getRows(get<1>(dst)) == rows && getCols(get<1>(dst)) == cols );
+    CV_Assert( getRows(get<2>(dst)) == rows && getCols(get<2>(dst)) == cols );
+    CV_Assert( getRows(get<3>(dst)) == rows && getCols(get<3>(dst)) == cols );
+
+    grid_copy_detail::copy_tuple<Policy>(shrinkPtr(src),
+                                         shrinkPtr(zipPtr(get<0>(dst), get<1>(dst), get<2>(dst), get<3>(dst))),   // destinations are combined via zipPtr into one argument
+                                         WithOutMask(),                                                           // passed in the mask position
+                                         rows, cols,
+                                         StreamAccessor::getStream(stream));
+}
+
 // Default Policy
 
 struct DefaultCopyPolicy
@@ -216,48 +357,96 @@ __host__ void gridCopy(const SrcPtr& src, GpuMat_<DstType>& dst, const MaskPtr&
     gridCopy_<DefaultCopyPolicy>(src, dst, mask, stream);
 }
 
+template <class SrcPtr, typename DstType, class MaskPtr>
+__host__ void gridCopy(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
+{   // Masked single-destination (GlobPtrSz) form that takes no Policy template argument.
+    gridCopy_<DefaultCopyPolicy>(src, dst, mask, stream);   // forwards all arguments unchanged, selecting DefaultCopyPolicy
+}
+
 template <class SrcPtr, typename DstType>
 __host__ void gridCopy(const SrcPtr& src, GpuMat_<DstType>& dst, Stream& stream = Stream::Null())
 {
     gridCopy_<DefaultCopyPolicy>(src, dst, stream);
 }
 
+template <class SrcPtr, typename DstType>
+__host__ void gridCopy(const SrcPtr& src, const GlobPtrSz<DstType>& dst, Stream& stream = Stream::Null())
+{   // Unmasked single-destination (GlobPtrSz) form that takes no Policy template argument.
+    gridCopy_<DefaultCopyPolicy>(src, dst, stream);   // forwards all arguments unchanged, selecting DefaultCopyPolicy
+}
+
 template <class SrcPtrTuple, typename D0, typename D1, class MaskPtr>
 __host__ void gridCopy(const SrcPtrTuple& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
     gridCopy_<DefaultCopyPolicy>(src, dst, mask, stream);
 }
 
+template <class SrcPtrTuple, typename D0, typename D1, class MaskPtr>
+__host__ void gridCopy(const SrcPtrTuple& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
+{   // Masked two-destination (GlobPtrSz tuple) form that takes no Policy template argument.
+    gridCopy_<DefaultCopyPolicy>(src, dst, mask, stream);   // forwards all arguments unchanged, selecting DefaultCopyPolicy
+}
+
 template <class SrcPtrTuple, typename D0, typename D1>
 __host__ void gridCopy(const SrcPtrTuple& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, Stream& stream = Stream::Null())
 {
     gridCopy_<DefaultCopyPolicy>(src, dst, stream);
 }
 
+template <class SrcPtrTuple, typename D0, typename D1>
+__host__ void gridCopy(const SrcPtrTuple& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, Stream& stream = Stream::Null())
+{   // Unmasked two-destination (GlobPtrSz tuple) form that takes no Policy template argument.
+    gridCopy_<DefaultCopyPolicy>(src, dst, stream);   // forwards all arguments unchanged, selecting DefaultCopyPolicy
+}
+
 template <class SrcPtrTuple, typename D0, typename D1, typename D2, class MaskPtr>
 __host__ void gridCopy(const SrcPtrTuple& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
     gridCopy_<DefaultCopyPolicy>(src, dst, mask, stream);
 }
 
+template <class SrcPtrTuple, typename D0, typename D1, typename D2, class MaskPtr>
+__host__ void gridCopy(const SrcPtrTuple& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
+{   // Masked three-destination (GlobPtrSz tuple) form that takes no Policy template argument.
+    gridCopy_<DefaultCopyPolicy>(src, dst, mask, stream);   // forwards all arguments unchanged, selecting DefaultCopyPolicy
+}
+
 template <class SrcPtrTuple, typename D0, typename D1, typename D2>
 __host__ void gridCopy(const SrcPtrTuple& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, Stream& stream = Stream::Null())
 {
     gridCopy_<DefaultCopyPolicy>(src, dst, stream);
 }
 
+template <class SrcPtrTuple, typename D0, typename D1, typename D2>
+__host__ void gridCopy(const SrcPtrTuple& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, Stream& stream = Stream::Null())
+{   // Unmasked three-destination (GlobPtrSz tuple) form that takes no Policy template argument.
+    gridCopy_<DefaultCopyPolicy>(src, dst, stream);   // forwards all arguments unchanged, selecting DefaultCopyPolicy
+}
+
 template <class SrcPtrTuple, typename D0, typename D1, typename D2, typename D3, class MaskPtr>
 __host__ void gridCopy(const SrcPtrTuple& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
     gridCopy_<DefaultCopyPolicy>(src, dst, mask, stream);
 }
 
+template <class SrcPtrTuple, typename D0, typename D1, typename D2, typename D3, class MaskPtr>
+__host__ void gridCopy(const SrcPtrTuple& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const MaskPtr& mask, Stream& stream = Stream::Null())
+{   // Masked four-destination (GlobPtrSz tuple) form that takes no Policy template argument.
+    gridCopy_<DefaultCopyPolicy>(src, dst, mask, stream);   // forwards all arguments unchanged, selecting DefaultCopyPolicy
+}
+
 template <class SrcPtrTuple, typename D0, typename D1, typename D2, typename D3>
 __host__ void gridCopy_(const SrcPtrTuple& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, Stream& stream = Stream::Null())
 {
     gridCopy_<DefaultCopyPolicy>(src, dst, stream);
 }
 
+template <class SrcPtrTuple, typename D0, typename D1, typename D2, typename D3>
+__host__ void gridCopy(const SrcPtrTuple& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, Stream& stream = Stream::Null())
+{   // Unmasked four-destination (GlobPtrSz tuple) form that takes no Policy template argument; named gridCopy (no trailing underscore) like the other DefaultCopyPolicy wrappers.
+    gridCopy_<DefaultCopyPolicy>(src, dst, stream);   // forwards all arguments unchanged, selecting DefaultCopyPolicy
+}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/grid/transform.hpp b/modules/cudev/include/opencv2/cudev/grid/transform.hpp
index 6c57758a6d..0da5e2d8a4 100644
--- a/modules/cudev/include/opencv2/cudev/grid/transform.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/transform.hpp
@@ -50,6 +50,7 @@
 #include "../util/tuple.hpp"
 #include "../ptr2d/traits.hpp"
 #include "../ptr2d/gpumat.hpp"
+#include "../ptr2d/glob.hpp"
 #include "../ptr2d/mask.hpp"
 #include "../ptr2d/zip.hpp"
 #include "detail/transform.hpp"
@@ -69,6 +70,18 @@ __host__ void gridTransform_(const SrcPtr& src, GpuMat_<DstType>& dst, const UnO
     grid_transform_detail::transform<Policy>(shrinkPtr(src), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
 }
 
+template <class Policy, class SrcPtr, typename DstType, class UnOp, class MaskPtr> // masked unary transform writing into a GlobPtrSz destination
+__host__ void gridTransform_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const UnOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+{
+    const int rows = getRows(src);
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(dst) == rows && getCols(dst) == cols ); // dst must already have src's dimensions
+    CV_Assert( getRows(mask) == rows && getCols(mask) == cols ); // mask must have src's dimensions as well
+
+    grid_transform_detail::transform<Policy>(shrinkPtr(src), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
+}
+
 template <class Policy, class SrcPtr, typename DstType, class UnOp>
 __host__ void gridTransform_(const SrcPtr& src, GpuMat_<DstType>& dst, const UnOp& op, Stream& stream = Stream::Null())
 {
@@ -80,6 +93,17 @@ __host__ void gridTransform_(const SrcPtr& src, GpuMat_<DstType>& dst, const UnO
     grid_transform_detail::transform<Policy>(shrinkPtr(src), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
 }
 
+template <class Policy, class SrcPtr, typename DstType, class UnOp> // unary transform writing into a GlobPtrSz destination, no mask
+__host__ void gridTransform_(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const UnOp& op, Stream& stream = Stream::Null())
+{
+    const int rows = getRows(src);
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(dst) == rows && getCols(dst) == cols ); // dst must already have src's dimensions
+
+    grid_transform_detail::transform<Policy>(shrinkPtr(src), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream)); // WithOutMask() is passed in the mask position
+}
+
 template <class Policy, class SrcPtr1, class SrcPtr2, typename DstType, class BinOp, class MaskPtr>
 __host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<DstType>& dst, const BinOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
@@ -94,6 +118,19 @@ __host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<D
     grid_transform_detail::transform<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
 }
 
+template <class Policy, class SrcPtr1, class SrcPtr2, typename DstType, class BinOp, class MaskPtr> // masked binary transform writing into a GlobPtrSz destination
+__host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz<DstType>& dst, const BinOp& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+{
+    const int rows = getRows(src1); // src1 defines the reference size for every other operand
+    const int cols = getCols(src1);
+
+    CV_Assert( getRows(dst) == rows && getCols(dst) == cols );
+    CV_Assert( getRows(src2) == rows && getCols(src2) == cols );
+    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );
+
+    grid_transform_detail::transform<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, shrinkPtr(mask), rows, cols, StreamAccessor::getStream(stream));
+}
+
 template <class Policy, class SrcPtr1, class SrcPtr2, typename DstType, class BinOp>
 __host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<DstType>& dst, const BinOp& op, Stream& stream = Stream::Null())
 {
@@ -107,6 +144,18 @@ __host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, GpuMat_<D
     grid_transform_detail::transform<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
 }
 
+template <class Policy, class SrcPtr1, class SrcPtr2, typename DstType, class BinOp> // binary transform writing into a GlobPtrSz destination, no mask
+__host__ void gridTransform_(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz<DstType>& dst, const BinOp& op, Stream& stream = Stream::Null())
+{
+    const int rows = getRows(src1); // src1 defines the reference size for every other operand
+    const int cols = getCols(src1);
+
+    CV_Assert( getRows(dst) == rows && getCols(dst) == cols ); // dst is taken by const reference like the sibling GlobPtrSz overloads, so const views and temporaries bind
+    CV_Assert( getRows(src2) == rows && getCols(src2) == cols );
+
+    grid_transform_detail::transform<Policy>(shrinkPtr(src1), shrinkPtr(src2), shrinkPtr(dst), op, WithOutMask(), rows, cols, StreamAccessor::getStream(stream));
+}
+
 template <class Policy, class SrcPtr, typename D0, typename D1, class OpTuple, class MaskPtr>
 __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
@@ -128,6 +177,26 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
                                                    StreamAccessor::getStream(stream));
 }
 
+template <class Policy, class SrcPtr, typename D0, typename D1, class OpTuple, class MaskPtr> // masked transform into two GlobPtrSz destinations
+__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+{
+    CV_StaticAssert( tuple_size<OpTuple>::value == 2, "" ); // one operation per destination
+
+    const int rows = getRows(src);
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(get<0>(dst)) == rows && getCols(get<0>(dst)) == cols );
+    CV_Assert( getRows(get<1>(dst)) == rows && getCols(get<1>(dst)) == cols );
+    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );
+
+    grid_transform_detail::transform_tuple<Policy>(shrinkPtr(src),
+                                                   shrinkPtr(zipPtr(get<0>(dst), get<1>(dst))), // destinations are combined with zipPtr into a single argument
+                                                   op,
+                                                   shrinkPtr(mask),
+                                                   rows, cols,
+                                                   StreamAccessor::getStream(stream));
+}
+
 template <class Policy, class SrcPtr, typename D0, typename D1, class OpTuple>
 __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
 {
@@ -147,6 +216,25 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
                                                    StreamAccessor::getStream(stream));
 }
 
+template <class Policy, class SrcPtr, typename D0, typename D1, class OpTuple> // transform into two GlobPtrSz destinations, no mask
+__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
+{
+    CV_StaticAssert( tuple_size<OpTuple>::value == 2, "" ); // one operation per destination
+
+    const int rows = getRows(src);
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(get<0>(dst)) == rows && getCols(get<0>(dst)) == cols );
+    CV_Assert( getRows(get<1>(dst)) == rows && getCols(get<1>(dst)) == cols );
+
+    grid_transform_detail::transform_tuple<Policy>(shrinkPtr(src),
+                                                   shrinkPtr(zipPtr(get<0>(dst), get<1>(dst))), // destinations are combined with zipPtr into a single argument
+                                                   op,
+                                                   WithOutMask(),
+                                                   rows, cols,
+                                                   StreamAccessor::getStream(stream));
+}
+
 template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, class OpTuple, class MaskPtr>
 __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
@@ -169,6 +257,27 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
                                                    StreamAccessor::getStream(stream));
 }
 
+template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, class OpTuple, class MaskPtr> // masked transform into three GlobPtrSz destinations
+__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+{
+    CV_StaticAssert( tuple_size<OpTuple>::value == 3, "" ); // one operation per destination
+
+    const int rows = getRows(src);
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(get<0>(dst)) == rows && getCols(get<0>(dst)) == cols );
+    CV_Assert( getRows(get<1>(dst)) == rows && getCols(get<1>(dst)) == cols );
+    CV_Assert( getRows(get<2>(dst)) == rows && getCols(get<2>(dst)) == cols );
+    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );
+
+    grid_transform_detail::transform_tuple<Policy>(shrinkPtr(src),
+                                                   shrinkPtr(zipPtr(get<0>(dst), get<1>(dst), get<2>(dst))), // destinations are combined with zipPtr into a single argument
+                                                   op,
+                                                   shrinkPtr(mask),
+                                                   rows, cols,
+                                                   StreamAccessor::getStream(stream));
+}
+
 template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, class OpTuple>
 __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
 {
@@ -189,6 +298,26 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
                                                    StreamAccessor::getStream(stream));
 }
 
+template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, class OpTuple> // transform into three GlobPtrSz destinations, no mask
+__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
+{
+    CV_StaticAssert( tuple_size<OpTuple>::value == 3, "" ); // one operation per destination
+
+    const int rows = getRows(src);
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(get<0>(dst)) == rows && getCols(get<0>(dst)) == cols );
+    CV_Assert( getRows(get<1>(dst)) == rows && getCols(get<1>(dst)) == cols );
+    CV_Assert( getRows(get<2>(dst)) == rows && getCols(get<2>(dst)) == cols );
+
+    grid_transform_detail::transform_tuple<Policy>(shrinkPtr(src),
+                                                   shrinkPtr(zipPtr(get<0>(dst), get<1>(dst), get<2>(dst))), // destinations are combined with zipPtr into a single argument
+                                                   op,
+                                                   WithOutMask(),
+                                                   rows, cols,
+                                                   StreamAccessor::getStream(stream));
+}
+
 template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple, class MaskPtr>
 __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
@@ -212,6 +341,28 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
                                                    StreamAccessor::getStream(stream));
 }
 
+template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple, class MaskPtr> // masked transform into four GlobPtrSz destinations
+__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+{
+    CV_StaticAssert( tuple_size<OpTuple>::value == 4, "" ); // one operation per destination
+
+    const int rows = getRows(src);
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(get<0>(dst)) == rows && getCols(get<0>(dst)) == cols );
+    CV_Assert( getRows(get<1>(dst)) == rows && getCols(get<1>(dst)) == cols );
+    CV_Assert( getRows(get<2>(dst)) == rows && getCols(get<2>(dst)) == cols );
+    CV_Assert( getRows(get<3>(dst)) == rows && getCols(get<3>(dst)) == cols );
+    CV_Assert( getRows(mask) == rows && getCols(mask) == cols );
+
+    grid_transform_detail::transform_tuple<Policy>(shrinkPtr(src),
+                                                   shrinkPtr(zipPtr(get<0>(dst), get<1>(dst), get<2>(dst), get<3>(dst))), // destinations are combined with zipPtr into a single argument
+                                                   op,
+                                                   shrinkPtr(mask),
+                                                   rows, cols,
+                                                   StreamAccessor::getStream(stream));
+}
+
 template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple>
 __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
 {
@@ -233,6 +384,27 @@ __host__ void gridTransform_(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMa
                                                    StreamAccessor::getStream(stream));
 }
 
+template <class Policy, class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple> // transform into four GlobPtrSz destinations, no mask
+__host__ void gridTransform_(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
+{
+    CV_StaticAssert( tuple_size<OpTuple>::value == 4, "" ); // one operation per destination
+
+    const int rows = getRows(src);
+    const int cols = getCols(src);
+
+    CV_Assert( getRows(get<0>(dst)) == rows && getCols(get<0>(dst)) == cols );
+    CV_Assert( getRows(get<1>(dst)) == rows && getCols(get<1>(dst)) == cols );
+    CV_Assert( getRows(get<2>(dst)) == rows && getCols(get<2>(dst)) == cols );
+    CV_Assert( getRows(get<3>(dst)) == rows && getCols(get<3>(dst)) == cols );
+
+    grid_transform_detail::transform_tuple<Policy>(shrinkPtr(src),
+                                                   shrinkPtr(zipPtr(get<0>(dst), get<1>(dst), get<2>(dst), get<3>(dst))), // destinations are combined with zipPtr into a single argument
+                                                   op,
+                                                   WithOutMask(),
+                                                   rows, cols,
+                                                   StreamAccessor::getStream(stream));
+}
+
 // Default Policy
 
 struct DefaultTransformPolicy
@@ -250,60 +422,120 @@ __host__ void gridTransform(const SrcPtr& src, GpuMat_<DstType>& dst, const Op&
     gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
 }
 
+template <class SrcPtr, typename DstType, class Op, class MaskPtr> // masked unary transform into a GlobPtrSz destination
+__host__ void gridTransform(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+{
+    gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream); // forwards with DefaultTransformPolicy as the policy
+}
+
 template <class SrcPtr, typename DstType, class Op>
 __host__ void gridTransform(const SrcPtr& src, GpuMat_<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
 {
     gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
 }
 
+template <class SrcPtr, typename DstType, class Op> // unary transform into a GlobPtrSz destination, no mask
+__host__ void gridTransform(const SrcPtr& src, const GlobPtrSz<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
+{
+    gridTransform_<DefaultTransformPolicy>(src, dst, op, stream); // forwards with DefaultTransformPolicy as the policy
+}
+
 template <class SrcPtr1, class SrcPtr2, typename DstType, class Op, class MaskPtr>
 __host__ void gridTransform(const SrcPtr1& src1, const SrcPtr1& src2, GpuMat_<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
     gridTransform_<DefaultTransformPolicy>(src1, src2, dst, op, mask, stream);
 }
 
+template <class SrcPtr1, class SrcPtr2, typename DstType, class Op, class MaskPtr> // masked binary transform into a GlobPtrSz destination
+__host__ void gridTransform(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz<DstType>& dst, const Op& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+{
+    gridTransform_<DefaultTransformPolicy>(src1, src2, dst, op, mask, stream); // src2 uses SrcPtr2 so that parameter is deducible and the sources may differ in type
+}
+
 template <class SrcPtr1, class SrcPtr2, typename DstType, class Op>
 __host__ void gridTransform(const SrcPtr1& src1, const SrcPtr1& src2, GpuMat_<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
 {
     gridTransform_<DefaultTransformPolicy>(src1, src2, dst, op, stream);
 }
 
+template <class SrcPtr1, class SrcPtr2, typename DstType, class Op> // binary transform into a GlobPtrSz destination, no mask
+__host__ void gridTransform(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtrSz<DstType>& dst, const Op& op, Stream& stream = Stream::Null())
+{
+    gridTransform_<DefaultTransformPolicy>(src1, src2, dst, op, stream); // src2 uses SrcPtr2 so that parameter is deducible and the sources may differ in type
+}
+
 template <class SrcPtr, typename D0, typename D1, class OpTuple, class MaskPtr>
 __host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
     gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
 }
 
+template <class SrcPtr, typename D0, typename D1, class OpTuple, class MaskPtr> // masked transform into two GlobPtrSz destinations
+__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+{
+    gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream); // forwards with DefaultTransformPolicy as the policy
+}
+
 template <class SrcPtr, typename D0, typename D1, class OpTuple>
 __host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
 {
     gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
 }
 
+template <class SrcPtr, typename D0, typename D1, class OpTuple> // transform into two GlobPtrSz destinations, no mask
+__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
+{
+    gridTransform_<DefaultTransformPolicy>(src, dst, op, stream); // forwards with DefaultTransformPolicy as the policy
+}
+
 template <class SrcPtr, typename D0, typename D1, typename D2, class OpTuple, class MaskPtr>
 __host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
     gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
 }
 
+template <class SrcPtr, typename D0, typename D1, typename D2, class OpTuple, class MaskPtr> // masked transform into three GlobPtrSz destinations
+__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+{
+    gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream); // forwards with DefaultTransformPolicy as the policy
+}
+
 template <class SrcPtr, typename D0, typename D1, typename D2, class OpTuple>
 __host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
 {
     gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
 }
 
+template <class SrcPtr, typename D0, typename D1, typename D2, class OpTuple> // transform into three GlobPtrSz destinations, no mask
+__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
+{
+    gridTransform_<DefaultTransformPolicy>(src, dst, op, stream); // forwards with DefaultTransformPolicy as the policy
+}
+
 template <class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple, class MaskPtr>
 __host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
 {
     gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream);
 }
 
+template <class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple, class MaskPtr> // masked transform into four GlobPtrSz destinations
+__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, const MaskPtr& mask, Stream& stream = Stream::Null())
+{
+    gridTransform_<DefaultTransformPolicy>(src, dst, op, mask, stream); // forwards with DefaultTransformPolicy as the policy
+}
+
 template <class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple>
 __host__ void gridTransform(const SrcPtr& src, const tuple< GpuMat_<D0>&, GpuMat_<D1>&, GpuMat_<D2>&, GpuMat_<D3>& >& dst, const OpTuple& op, Stream& stream = Stream::Null())
 {
     gridTransform_<DefaultTransformPolicy>(src, dst, op, stream);
 }
 
+template <class SrcPtr, typename D0, typename D1, typename D2, typename D3, class OpTuple> // transform into four GlobPtrSz destinations, no mask
+__host__ void gridTransform(const SrcPtr& src, const tuple< GlobPtrSz<D0>, GlobPtrSz<D1>, GlobPtrSz<D2>, GlobPtrSz<D3> >& dst, const OpTuple& op, Stream& stream = Stream::Null())
+{
+    gridTransform_<DefaultTransformPolicy>(src, dst, op, stream); // forwards with DefaultTransformPolicy as the policy
+}
+
 }}
 
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/detail/gpumat.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/detail/gpumat.hpp
index 2c7cf7e146..e378c52372 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/detail/gpumat.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/detail/gpumat.hpp
@@ -335,4 +335,27 @@ __host__ GpuMat_<T>& GpuMat_<T>::assign(const Expr<Body>& expr, Stream& stream)
 
 }}
 
+// Input / Output Arrays
+
+namespace cv { // _InputArray / _OutputArray constructors taking cudev::GpuMat_<_Tp>
+
+template<typename _Tp>
+__host__ _InputArray::_InputArray(const cudev::GpuMat_<_Tp>& m)
+    : flags(FIXED_TYPE + GPU_MAT + DataType<_Tp>::type), obj((void*)&m) // flags combine FIXED_TYPE, GPU_MAT and the type code of _Tp; obj stores the address of m
+{}
+
+template<typename _Tp>
+__host__ _OutputArray::_OutputArray(cudev::GpuMat_<_Tp>& m)
+    : _InputArray(m) // reuses the _InputArray constructor above
+{}
+
+template<typename _Tp>
+__host__ _OutputArray::_OutputArray(const cudev::GpuMat_<_Tp>& m)
+    : _InputArray(m)
+{
+    flags |= FIXED_SIZE; // the const overload additionally sets FIXED_SIZE
+}
+
+}
+
 #endif
diff --git a/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp b/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp
index c64cafbb12..7304a8c7f5 100644
--- a/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp
+++ b/modules/cudev/include/opencv2/cudev/ptr2d/glob.hpp
@@ -91,6 +91,17 @@ __host__ GlobPtrSz<T> globPtr(T* data, size_t step, int rows, int cols)
     return p;
 }
 
+template <typename T> // T does not appear in the parameter list, so callers must name it explicitly
+__host__ GlobPtrSz<T> globPtr(const GpuMat& mat)
+{
+    GlobPtrSz<T> p;
+    p.data = (T*) mat.data; // reinterprets the matrix data as T*; no check against mat's element type is made here
+    p.step = mat.step;
+    p.rows = mat.rows;
+    p.cols = mat.cols;
+    return p; // a GlobPtrSz<T> carrying mat's data pointer, step, rows and cols
+}
+
 template <typename T> struct PtrTraits< GlobPtrSz<T> > : PtrTraitsBase<GlobPtrSz<T>, GlobPtr<T> >
 {
 };
diff --git a/modules/features2d/include/opencv2/features2d.hpp b/modules/features2d/include/opencv2/features2d.hpp
index 1f0a92c188..4e3f3b80a6 100644
--- a/modules/features2d/include/opencv2/features2d.hpp
+++ b/modules/features2d/include/opencv2/features2d.hpp
@@ -961,7 +961,7 @@ struct CV_EXPORTS Hamming
 
 typedef Hamming HammingLUT;
 
-template<int cellsize> struct CV_EXPORTS HammingMultilevel
+template<int cellsize> struct HammingMultilevel
 {
     enum { normType = NORM_HAMMING + (cellsize>1) };
     typedef unsigned char ValueType;
diff --git a/modules/gpufilters/src/filtering.cpp b/modules/gpufilters/src/filtering.cpp
index 5a852c9234..14917acc33 100644
--- a/modules/gpufilters/src/filtering.cpp
+++ b/modules/gpufilters/src/filtering.cpp
@@ -230,22 +230,22 @@ namespace
         switch (srcType)
         {
         case CV_8UC1:
-            func_ = cudev::filter2D<uchar, uchar>;
+            func_ = cv::gpu::cudev::filter2D<uchar, uchar>;
             break;
         case CV_8UC4:
-            func_ = cudev::filter2D<uchar4, uchar4>;
+            func_ = cv::gpu::cudev::filter2D<uchar4, uchar4>;
             break;
         case CV_16UC1:
-            func_ = cudev::filter2D<ushort, ushort>;
+            func_ = cv::gpu::cudev::filter2D<ushort, ushort>;
             break;
         case CV_16UC4:
-            func_ = cudev::filter2D<ushort4, ushort4>;
+            func_ = cv::gpu::cudev::filter2D<ushort4, ushort4>;
             break;
         case CV_32FC1:
-            func_ = cudev::filter2D<float, float>;
+            func_ = cv::gpu::cudev::filter2D<float, float>;
             break;
         case CV_32FC4:
-            func_ = cudev::filter2D<float4, float4>;
+            func_ = cv::gpu::cudev::filter2D<float4, float4>;
             break;
         }
     }
diff --git a/modules/gpuimgproc/doc/hough.rst b/modules/gpuimgproc/doc/hough.rst
index c640575f01..ec7117198f 100644
--- a/modules/gpuimgproc/doc/hough.rst
+++ b/modules/gpuimgproc/doc/hough.rst
@@ -216,98 +216,19 @@ Creates implementation for :ocv:class:`gpu::HoughCirclesDetector` .
 
 
 
-gpu::GeneralizedHough
----------------------
-.. ocv:class:: gpu::GeneralizedHough : public Algorithm
-
-Base class for generalized hough transform. ::
-
-    class CV_EXPORTS GeneralizedHough : public Algorithm
-    {
-    public:
-        static Ptr<GeneralizedHough> create(int method);
-
-        virtual void setTemplate(InputArray templ, int cannyThreshold = 100, Point templCenter = Point(-1, -1)) = 0;
-        virtual void setTemplate(InputArray edges, InputArray dx, InputArray dy, Point templCenter = Point(-1, -1)) = 0;
-
-        virtual void detect(InputArray image, OutputArray positions, int cannyThreshold = 100) = 0;
-        virtual void detect(InputArray edges, InputArray dx, InputArray dy, OutputArray positions) = 0;
-
-        virtual void downloadResults(InputArray d_positions, OutputArray h_positions, OutputArray h_votes = noArray()) = 0;
-    };
-
-
-Finds arbitrary template in the grayscale image using Generalized Hough Transform.
-
-
-
-gpu::GeneralizedHough::create
------------------------------
-Creates implementation for :ocv:class:`gpu::GeneralizedHough` .
-
-.. ocv:function:: Ptr<GeneralizedHough> gpu::GeneralizedHough::create(int method)
-
-    :param method: Combination of flags ( ``cv::GeneralizedHough::GHT_POSITION`` , ``cv::GeneralizedHough::GHT_SCALE`` , ``cv::GeneralizedHough::GHT_ROTATION`` ) specifying transformation to find.
-
-For full affine transformations (move + scale + rotation) [Guil1999]_ algorithm is used, otherwise [Ballard1981]_ algorithm is used.
-
-
-
-gpu::GeneralizedHough::setTemplate
+gpu::createGeneralizedHoughBallard
 ----------------------------------
-Set template to search.
+Creates implementation for generalized Hough transform from [Ballard1981]_ .
 
-.. ocv:function:: void gpu::GeneralizedHough::setTemplate(InputArray templ, int cannyThreshold = 100, Point templCenter = Point(-1, -1))
-
-.. ocv:function:: void gpu::GeneralizedHough::setTemplate(InputArray edges, InputArray dx, InputArray dy, Point templCenter = Point(-1, -1))
-
-    :param templ: Template image. Canny edge detector will be applied to extract template edges.
-
-    :param cannyThreshold: Threshold value for Canny edge detector.
-
-    :param templCenter: Center for rotation. By default image center will be used.
-
-    :param edges: Edge map for template image.
-
-    :param dx: First derivative of template image in the vertical direction. Support only ``CV_32S`` type.
-
-    :param dy: First derivative of template image in the horizontal direction. Support only ``CV_32S`` type.
+.. ocv:function:: Ptr<GeneralizedHoughBallard> gpu::createGeneralizedHoughBallard()
 
 
 
-gpu::GeneralizedHough::detect
------------------------------
-Finds template (set by :ocv:func:`gpu::GeneralizedHough::setTemplate` ) in the grayscale image.
+gpu::createGeneralizedHoughGuil
+-------------------------------
+Creates implementation for generalized Hough transform from [Guil1999]_ .
 
-.. ocv:function:: void gpu::GeneralizedHough::detect(InputArray image, OutputArray positions, int cannyThreshold = 100)
-
-.. ocv:function:: void gpu::GeneralizedHough::detect(InputArray edges, InputArray dx, InputArray dy, OutputArray positions)
-
-    :param templ: Input image. Canny edge detector will be applied to extract template edges.
-
-    :param positions: Output vector of found objects. Each vector is encoded as a 4-element floating-point vector  :math:`(x, y, scale, angle)` .
-
-    :param cannyThreshold: Threshold value for Canny edge detector.
-
-    :param edges: Edge map for input image.
-
-    :param dx: First derivative of input image in the vertical direction. Support only ``CV_32S`` type.
-
-    :param dy: First derivative of input image in the horizontal direction. Support only ``CV_32S`` type.
-
-
-
-gpu::GeneralizedHough::downloadResults
---------------------------------------
-Downloads results from :ocv:func:`gpu::GeneralizedHough::detect` to host memory.
-
-.. ocv:function:: void gpu::GeneralizedHough::downloadResult(InputArray d_positions, OutputArray h_positions, OutputArray h_votes = noArray())
-
-    :param d_lines: Result of :ocv:func:`gpu::GeneralizedHough::detect` .
-
-    :param h_lines: Output host array.
-
-    :param h_votes: Optional output array for votes. Each vector is encoded as a 3-element integer-point vector  :math:`(position_votes, scale_votes, angle_votes)` .
+.. ocv:function:: Ptr<GeneralizedHoughGuil> gpu::createGeneralizedHoughGuil()
 
 
 
diff --git a/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp b/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
index 330476560d..f0a0f1260a 100644
--- a/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
+++ b/modules/gpuimgproc/include/opencv2/gpuimgproc.hpp
@@ -283,24 +283,13 @@ CV_EXPORTS Ptr<HoughCirclesDetector> createHoughCirclesDetector(float dp, float
 //////////////////////////////////////
 // GeneralizedHough
 
-//! finds arbitrary template in the grayscale image using Generalized Hough Transform
 //! Ballard, D.H. (1981). Generalizing the Hough transform to detect arbitrary shapes. Pattern Recognition 13 (2): 111-122.
+//! Detects position only, without scale and rotation
+CV_EXPORTS Ptr<GeneralizedHoughBallard> createGeneralizedHoughBallard();
+
 //! Guil, N., González-Linares, J.M. and Zapata, E.L. (1999). Bidimensional shape detection using an invariant approach. Pattern Recognition 32 (6): 1025-1038.
-class CV_EXPORTS GeneralizedHough : public Algorithm
-{
-public:
-    static Ptr<GeneralizedHough> create(int method);
-
-    //! set template to search
-    virtual void setTemplate(InputArray templ, int cannyThreshold = 100, Point templCenter = Point(-1, -1)) = 0;
-    virtual void setTemplate(InputArray edges, InputArray dx, InputArray dy, Point templCenter = Point(-1, -1)) = 0;
-
-    //! find template on image
-    virtual void detect(InputArray image, OutputArray positions, int cannyThreshold = 100) = 0;
-    virtual void detect(InputArray edges, InputArray dx, InputArray dy, OutputArray positions) = 0;
-
-    virtual void downloadResults(InputArray d_positions, OutputArray h_positions, OutputArray h_votes = noArray()) = 0;
-};
+//! Detects position, scale and rotation
+CV_EXPORTS Ptr<GeneralizedHoughGuil> createGeneralizedHoughGuil();
 
 ////////////////////////// Corners Detection ///////////////////////////
 
diff --git a/modules/gpuimgproc/perf/perf_hough.cpp b/modules/gpuimgproc/perf/perf_hough.cpp
index f72a820f5b..cce8e7432e 100644
--- a/modules/gpuimgproc/perf/perf_hough.cpp
+++ b/modules/gpuimgproc/perf/perf_hough.cpp
@@ -227,23 +227,59 @@ PERF_TEST_P(Sz_Dp_MinDist, HoughCircles,
 //////////////////////////////////////////////////////////////////////
 // GeneralizedHough
 
-enum { GHT_POSITION = cv::GeneralizedHough::GHT_POSITION,
-       GHT_SCALE    = cv::GeneralizedHough::GHT_SCALE,
-       GHT_ROTATION = cv::GeneralizedHough::GHT_ROTATION
-     };
-
-CV_FLAGS(GHMethod, GHT_POSITION, GHT_SCALE, GHT_ROTATION);
-
-DEF_PARAM_TEST(Method_Sz, GHMethod, cv::Size);
-
-PERF_TEST_P(Method_Sz, GeneralizedHough,
-            Combine(Values(GHMethod(GHT_POSITION), GHMethod(GHT_POSITION | GHT_SCALE), GHMethod(GHT_POSITION | GHT_ROTATION), GHMethod(GHT_POSITION | GHT_SCALE | GHT_ROTATION)),
-                    GPU_TYPICAL_MAT_SIZES))
+PERF_TEST_P(Sz, GeneralizedHoughBallard, GPU_TYPICAL_MAT_SIZES)
 {
     declare.time(10);
 
-    const int method = GET_PARAM(0);
-    const cv::Size imageSize = GET_PARAM(1);
+    const cv::Size imageSize = GetParam();
+
+    const cv::Mat templ = readImage("cv/shared/templ.png", cv::IMREAD_GRAYSCALE);
+    ASSERT_FALSE(templ.empty());
+
+    cv::Mat image(imageSize, CV_8UC1, cv::Scalar::all(0));
+    templ.copyTo(image(cv::Rect(50, 50, templ.cols, templ.rows)));
+
+    cv::Mat edges;
+    cv::Canny(image, edges, 50, 100);
+
+    cv::Mat dx, dy;
+    cv::Sobel(image, dx, CV_32F, 1, 0);
+    cv::Sobel(image, dy, CV_32F, 0, 1);
+
+    if (PERF_RUN_GPU())
+    {
+        cv::Ptr<cv::GeneralizedHoughBallard> alg = cv::gpu::createGeneralizedHoughBallard();
+
+        const cv::gpu::GpuMat d_edges(edges);
+        const cv::gpu::GpuMat d_dx(dx);
+        const cv::gpu::GpuMat d_dy(dy);
+        cv::gpu::GpuMat positions;
+
+        alg->setTemplate(cv::gpu::GpuMat(templ));
+
+        TEST_CYCLE() alg->detect(d_edges, d_dx, d_dy, positions);
+
+        GPU_SANITY_CHECK(positions);
+    }
+    else
+    {
+        cv::Ptr<cv::GeneralizedHoughBallard> alg = cv::createGeneralizedHoughBallard();
+
+        cv::Mat positions;
+
+        alg->setTemplate(templ);
+
+        TEST_CYCLE() alg->detect(edges, dx, dy, positions);
+
+        CPU_SANITY_CHECK(positions);
+    }
+}
+
+PERF_TEST_P(Sz, GeneralizedHoughGuil, GPU_TYPICAL_MAT_SIZES)
+{
+    declare.time(10);
+
+    const cv::Size imageSize = GetParam();
 
     const cv::Mat templ = readImage("cv/shared/templ.png", cv::IMREAD_GRAYSCALE);
     ASSERT_FALSE(templ.empty());
@@ -281,39 +317,32 @@ PERF_TEST_P(Method_Sz, GeneralizedHough,
 
     if (PERF_RUN_GPU())
     {
+        cv::Ptr<cv::GeneralizedHoughGuil> alg = cv::gpu::createGeneralizedHoughGuil();
+        alg->setMaxAngle(90.0);
+        alg->setAngleStep(2.0);
+
         const cv::gpu::GpuMat d_edges(edges);
         const cv::gpu::GpuMat d_dx(dx);
         const cv::gpu::GpuMat d_dy(dy);
-        cv::gpu::GpuMat posAndVotes;
+        cv::gpu::GpuMat positions;
 
-        cv::Ptr<cv::gpu::GeneralizedHough> d_hough = cv::gpu::GeneralizedHough::create(method);
-        if (method & GHT_ROTATION)
-        {
-            d_hough->set("maxAngle", 90.0);
-            d_hough->set("angleStep", 2.0);
-        }
+        alg->setTemplate(cv::gpu::GpuMat(templ));
 
-        d_hough->setTemplate(cv::gpu::GpuMat(templ));
+        TEST_CYCLE() alg->detect(d_edges, d_dx, d_dy, positions);
 
-        TEST_CYCLE() d_hough->detect(d_edges, d_dx, d_dy, posAndVotes);
-
-        const cv::gpu::GpuMat positions(1, posAndVotes.cols, CV_32FC4, posAndVotes.data);
         GPU_SANITY_CHECK(positions);
     }
     else
     {
+        cv::Ptr<cv::GeneralizedHoughGuil> alg = cv::createGeneralizedHoughGuil();
+        alg->setMaxAngle(90.0);
+        alg->setAngleStep(2.0);
+
         cv::Mat positions;
 
-        cv::Ptr<cv::GeneralizedHough> hough = cv::GeneralizedHough::create(method);
-        if (method & GHT_ROTATION)
-        {
-            hough->set("maxAngle", 90.0);
-            hough->set("angleStep", 2.0);
-        }
+        alg->setTemplate(templ);
 
-        hough->setTemplate(templ);
-
-        TEST_CYCLE() hough->detect(edges, dx, dy, positions);
+        TEST_CYCLE() alg->detect(edges, dx, dy, positions);
 
         CPU_SANITY_CHECK(positions);
     }
diff --git a/modules/gpuimgproc/src/color.cpp b/modules/gpuimgproc/src/color.cpp
index 006274742e..3d714b6287 100644
--- a/modules/gpuimgproc/src/color.cpp
+++ b/modules/gpuimgproc/src/color.cpp
@@ -187,7 +187,7 @@ namespace
         _dst.create(src.size(), CV_8UC2);
         GpuMat dst = _dst.getGpuMat();
 
-        cudev::bgr_to_bgr555(src, dst, StreamAccessor::getStream(stream));
+        cv::gpu::cudev::bgr_to_bgr555(src, dst, StreamAccessor::getStream(stream));
     }
 
     void bgr_to_bgr565(InputArray _src, OutputArray _dst, int, Stream& stream)
@@ -200,7 +200,7 @@ namespace
         _dst.create(src.size(), CV_8UC2);
         GpuMat dst = _dst.getGpuMat();
 
-        cudev::bgr_to_bgr565(src, dst, StreamAccessor::getStream(stream));
+        cv::gpu::cudev::bgr_to_bgr565(src, dst, StreamAccessor::getStream(stream));
     }
 
     void rgb_to_bgr555(InputArray _src, OutputArray _dst, int, Stream& stream)
@@ -213,7 +213,7 @@ namespace
         _dst.create(src.size(), CV_8UC2);
         GpuMat dst = _dst.getGpuMat();
 
-        cudev::rgb_to_bgr555(src, dst, StreamAccessor::getStream(stream));
+        cv::gpu::cudev::rgb_to_bgr555(src, dst, StreamAccessor::getStream(stream));
     }
 
     void rgb_to_bgr565(InputArray _src, OutputArray _dst, int, Stream& stream)
@@ -226,7 +226,7 @@ namespace
         _dst.create(src.size(), CV_8UC2);
         GpuMat dst = _dst.getGpuMat();
 
-        cudev::rgb_to_bgr565(src, dst, StreamAccessor::getStream(stream));
+        cv::gpu::cudev::rgb_to_bgr565(src, dst, StreamAccessor::getStream(stream));
     }
 
     void bgra_to_bgr555(InputArray _src, OutputArray _dst, int, Stream& stream)
@@ -239,7 +239,7 @@ namespace
         _dst.create(src.size(), CV_8UC2);
         GpuMat dst = _dst.getGpuMat();
 
-        cudev::bgra_to_bgr555(src, dst, StreamAccessor::getStream(stream));
+        cv::gpu::cudev::bgra_to_bgr555(src, dst, StreamAccessor::getStream(stream));
     }
 
     void bgra_to_bgr565(InputArray _src, OutputArray _dst, int, Stream& stream)
@@ -252,7 +252,7 @@ namespace
         _dst.create(src.size(), CV_8UC2);
         GpuMat dst = _dst.getGpuMat();
 
-        cudev::bgra_to_bgr565(src, dst, StreamAccessor::getStream(stream));
+        cv::gpu::cudev::bgra_to_bgr565(src, dst, StreamAccessor::getStream(stream));
     }
 
     void rgba_to_bgr555(InputArray _src, OutputArray _dst, int, Stream& stream)
@@ -265,7 +265,7 @@ namespace
         _dst.create(src.size(), CV_8UC2);
         GpuMat dst = _dst.getGpuMat();
 
-        cudev::rgba_to_bgr555(src, dst, StreamAccessor::getStream(stream));
+        cv::gpu::cudev::rgba_to_bgr555(src, dst, StreamAccessor::getStream(stream));
     }
 
     void rgba_to_bgr565(InputArray _src, OutputArray _dst, int, Stream& stream)
@@ -278,7 +278,7 @@ namespace
         _dst.create(src.size(), CV_8UC2);
         GpuMat dst = _dst.getGpuMat();
 
-        cudev::rgba_to_bgr565(src, dst, StreamAccessor::getStream(stream));
+        cv::gpu::cudev::rgba_to_bgr565(src, dst, StreamAccessor::getStream(stream));
     }
 
     void bgr555_to_rgb(InputArray _src, OutputArray _dst, int, Stream& stream)
@@ -291,7 +291,7 @@ namespace
         _dst.create(src.size(), CV_8UC3);
         GpuMat dst = _dst.getGpuMat();
 
-        cudev::bgr555_to_rgb(src, dst, StreamAccessor::getStream(stream));
+        cv::gpu::cudev::bgr555_to_rgb(src, dst, StreamAccessor::getStream(stream));
     }
 
     void bgr565_to_rgb(InputArray _src, OutputArray _dst, int, Stream& stream)
@@ -304,7 +304,7 @@ namespace
         _dst.create(src.size(), CV_8UC3);
         GpuMat dst = _dst.getGpuMat();
 
-        cudev::bgr565_to_rgb(src, dst, StreamAccessor::getStream(stream));
+        cv::gpu::cudev::bgr565_to_rgb(src, dst, StreamAccessor::getStream(stream));
     }
 
     void bgr555_to_bgr(InputArray _src, OutputArray _dst, int, Stream& stream)
@@ -317,7 +317,7 @@ namespace
         _dst.create(src.size(), CV_8UC3);
         GpuMat dst = _dst.getGpuMat();
 
-        cudev::bgr555_to_bgr(src, dst, StreamAccessor::getStream(stream));
+        cv::gpu::cudev::bgr555_to_bgr(src, dst, StreamAccessor::getStream(stream));
     }
 
     void bgr565_to_bgr(InputArray _src, OutputArray _dst, int, Stream& stream)
@@ -330,7 +330,7 @@ namespace
         _dst.create(src.size(), CV_8UC3);
         GpuMat dst = _dst.getGpuMat();
 
-        cudev::bgr565_to_bgr(src, dst, StreamAccessor::getStream(stream));
+        cv::gpu::cudev::bgr565_to_bgr(src, dst, StreamAccessor::getStream(stream));
     }
 
     void bgr555_to_rgba(InputArray _src, OutputArray _dst, int, Stream& stream)
@@ -343,7 +343,7 @@ namespace
         _dst.create(src.size(), CV_8UC4);
         GpuMat dst = _dst.getGpuMat();
 
-        cudev::bgr555_to_rgba(src, dst, StreamAccessor::getStream(stream));
+        cv::gpu::cudev::bgr555_to_rgba(src, dst, StreamAccessor::getStream(stream));
     }
 
     void bgr565_to_rgba(InputArray _src, OutputArray _dst, int, Stream& stream)
@@ -356,7 +356,7 @@ namespace
         _dst.create(src.size(), CV_8UC4);
         GpuMat dst = _dst.getGpuMat();
 
-        cudev::bgr565_to_rgba(src, dst, StreamAccessor::getStream(stream));
+        cv::gpu::cudev::bgr565_to_rgba(src, dst, StreamAccessor::getStream(stream));
     }
 
     void bgr555_to_bgra(InputArray _src, OutputArray _dst, int, Stream& stream)
@@ -369,7 +369,7 @@ namespace
         _dst.create(src.size(), CV_8UC4);
         GpuMat dst = _dst.getGpuMat();
 
-        cudev::bgr555_to_bgra(src, dst, StreamAccessor::getStream(stream));
+        cv::gpu::cudev::bgr555_to_bgra(src, dst, StreamAccessor::getStream(stream));
     }
 
     void bgr565_to_bgra(InputArray _src, OutputArray _dst, int, Stream& stream)
@@ -382,7 +382,7 @@ namespace
         _dst.create(src.size(), CV_8UC4);
         GpuMat dst = _dst.getGpuMat();
 
-        cudev::bgr565_to_bgra(src, dst, StreamAccessor::getStream(stream));
+        cv::gpu::cudev::bgr565_to_bgra(src, dst, StreamAccessor::getStream(stream));
     }
 
     void gray_to_bgr(InputArray _src, OutputArray _dst, int, Stream& stream)
@@ -427,7 +427,7 @@ namespace
         _dst.create(src.size(), CV_8UC2);
         GpuMat dst = _dst.getGpuMat();
 
-        cudev::gray_to_bgr555(src, dst, StreamAccessor::getStream(stream));
+        cv::gpu::cudev::gray_to_bgr555(src, dst, StreamAccessor::getStream(stream));
     }
 
     void gray_to_bgr565(InputArray _src, OutputArray _dst, int, Stream& stream)
@@ -440,7 +440,7 @@ namespace
         _dst.create(src.size(), CV_8UC2);
         GpuMat dst = _dst.getGpuMat();
 
-        cudev::gray_to_bgr565(src, dst, StreamAccessor::getStream(stream));
+        cv::gpu::cudev::gray_to_bgr565(src, dst, StreamAccessor::getStream(stream));
     }
 
     void bgr555_to_gray(InputArray _src, OutputArray _dst, int, Stream& stream)
@@ -453,7 +453,7 @@ namespace
         _dst.create(src.size(), CV_8UC1);
         GpuMat dst = _dst.getGpuMat();
 
-        cudev::bgr555_to_gray(src, dst, StreamAccessor::getStream(stream));
+        cv::gpu::cudev::bgr555_to_gray(src, dst, StreamAccessor::getStream(stream));
     }
 
     void bgr565_to_gray(InputArray _src, OutputArray _dst, int, Stream& stream)
@@ -466,7 +466,7 @@ namespace
         _dst.create(src.size(), CV_8UC1);
         GpuMat dst = _dst.getGpuMat();
 
-        cudev::bgr565_to_gray(src, dst, StreamAccessor::getStream(stream));
+        cv::gpu::cudev::bgr565_to_gray(src, dst, StreamAccessor::getStream(stream));
     }
 
     void rgb_to_gray(InputArray _src, OutputArray _dst, int, Stream& stream)
@@ -2145,9 +2145,9 @@ void cv::gpu::demosaicing(InputArray _src, OutputArray _dst, int code, int dcn,
                                         code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGR2BGR_MHT ? 0 : 1);
 
         if (dcn == 3)
-            cudev::MHCdemosaic<3>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
+            cv::gpu::cudev::MHCdemosaic<3>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
         else
-            cudev::MHCdemosaic<4>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
+            cv::gpu::cudev::MHCdemosaic<4>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
 
         break;
     }
@@ -2172,7 +2172,7 @@ void cv::gpu::demosaicing(InputArray _src, OutputArray _dst, int code, int dcn,
         const int2 firstRed = make_int2(code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGB2BGR_MHT ? 0 : 1,
                                         code == COLOR_BayerRG2BGR_MHT || code == COLOR_BayerGR2BGR_MHT ? 0 : 1);
 
-        cudev::MHCdemosaic<1>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
+        cv::gpu::cudev::MHCdemosaic<1>(srcWhole, make_int2(ofs.x, ofs.y), dst, firstRed, StreamAccessor::getStream(stream));
 
         break;
     }
diff --git a/modules/gpuimgproc/src/cuda/generalized_hough.cu b/modules/gpuimgproc/src/cuda/generalized_hough.cu
index 14c8600104..fdf691ff4a 100644
--- a/modules/gpuimgproc/src/cuda/generalized_hough.cu
+++ b/modules/gpuimgproc/src/cuda/generalized_hough.cu
@@ -307,268 +307,6 @@ namespace cv { namespace gpu { namespace cudev
             return totalCount;
         }
 
-        ////////////////////////////////////////////////////////////////////////
-        // Ballard_PosScale
-
-        __global__ void Ballard_PosScale_calcHist(const unsigned int* coordList, const float* thetaList,
-                                                  PtrStep<short2> r_table, const int* r_sizes,
-                                                  PtrStepi hist, const int rows, const int cols,
-                                                  const float minScale, const float scaleStep, const int scaleRange,
-                                                  const float idp, const float thetaScale)
-        {
-            const unsigned int coord = coordList[blockIdx.x];
-            float2 p;
-            p.x = (coord & 0xFFFF);
-            p.y = (coord >> 16) & 0xFFFF;
-
-            const float theta = thetaList[blockIdx.x];
-            const int n = __float2int_rn(theta * thetaScale);
-
-            const short2* r_row = r_table.ptr(n);
-            const int r_row_size = r_sizes[n];
-
-            for (int j = 0; j < r_row_size; ++j)
-            {
-                const float2 d = saturate_cast<float2>(r_row[j]);
-
-                for (int s = threadIdx.x; s < scaleRange; s += blockDim.x)
-                {
-                    const float scale = minScale + s * scaleStep;
-
-                    float2 c = p - scale * d;
-
-                    c.x *= idp;
-                    c.y *= idp;
-
-                    if (c.x >= 0 && c.x < cols && c.y >= 0 && c.y < rows)
-                        ::atomicAdd(hist.ptr((s + 1) * (rows + 2) + __float2int_rn(c.y + 1)) + __float2int_rn(c.x + 1), 1);
-                }
-            }
-        }
-
-        void Ballard_PosScale_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
-                                           PtrStepSz<short2> r_table, const int* r_sizes,
-                                           PtrStepi hist, int rows, int cols,
-                                           float minScale, float scaleStep, int scaleRange,
-                                           float dp, int levels)
-        {
-            const dim3 block(256);
-            const dim3 grid(pointsCount);
-
-            const float idp = 1.0f / dp;
-            const float thetaScale = levels / (2.0f * CV_PI_F);
-
-            Ballard_PosScale_calcHist<<<grid, block>>>(coordList, thetaList,
-                                                       r_table, r_sizes,
-                                                       hist, rows, cols,
-                                                       minScale, scaleStep, scaleRange,
-                                                       idp, thetaScale);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        __global__ void Ballard_PosScale_findPosInHist(const PtrStepi hist, const int rows, const int cols, const int scaleRange,
-                                                       float4* out, int3* votes, const int maxSize,
-                                                       const float minScale, const float scaleStep, const float dp, const int threshold)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x >= cols || y >= rows)
-                return;
-
-            for (int s = 0; s < scaleRange; ++s)
-            {
-                const float scale = minScale + s * scaleStep;
-
-                const int prevScaleIdx = (s) * (rows + 2);
-                const int curScaleIdx = (s + 1) * (rows + 2);
-                const int nextScaleIdx = (s + 2) * (rows + 2);
-
-                const int curVotes = hist(curScaleIdx + y + 1, x + 1);
-
-                if (curVotes > threshold &&
-                    curVotes >  hist(curScaleIdx + y + 1, x) &&
-                    curVotes >= hist(curScaleIdx + y + 1, x + 2) &&
-                    curVotes >  hist(curScaleIdx + y, x + 1) &&
-                    curVotes >= hist(curScaleIdx + y + 2, x + 1) &&
-                    curVotes >  hist(prevScaleIdx + y + 1, x + 1) &&
-                    curVotes >= hist(nextScaleIdx + y + 1, x + 1))
-                {
-                    const int ind = ::atomicAdd(&g_counter, 1);
-
-                    if (ind < maxSize)
-                    {
-                        out[ind] = make_float4(x * dp, y * dp, scale, 0.0f);
-                        votes[ind] = make_int3(curVotes, curVotes, 0);
-                    }
-                }
-            }
-        }
-
-        int Ballard_PosScale_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int scaleRange, float4* out, int3* votes, int maxSize,
-                                               float minScale, float scaleStep, float dp, int threshold)
-        {
-            void* counterPtr;
-            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
-
-            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
-
-            const dim3 block(32, 8);
-            const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
-
-            cudaSafeCall( cudaFuncSetCacheConfig(Ballard_PosScale_findPosInHist, cudaFuncCachePreferL1) );
-
-            Ballard_PosScale_findPosInHist<<<grid, block>>>(hist, rows, cols, scaleRange, out, votes,
-                                                            maxSize, minScale, scaleStep, dp, threshold);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-
-            int totalCount;
-            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
-
-            totalCount = ::min(totalCount, maxSize);
-
-            return totalCount;
-        }
-
-        ////////////////////////////////////////////////////////////////////////
-        // Ballard_PosRotation
-
-        __global__ void Ballard_PosRotation_calcHist(const unsigned int* coordList, const float* thetaList,
-                                                     PtrStep<short2> r_table, const int* r_sizes,
-                                                     PtrStepi hist, const int rows, const int cols,
-                                                     const float minAngle, const float angleStep, const int angleRange,
-                                                     const float idp, const float thetaScale)
-        {
-            const unsigned int coord = coordList[blockIdx.x];
-            float2 p;
-            p.x = (coord & 0xFFFF);
-            p.y = (coord >> 16) & 0xFFFF;
-
-            const float thetaVal = thetaList[blockIdx.x];
-
-            for (int a = threadIdx.x; a < angleRange; a += blockDim.x)
-            {
-                const float angle = (minAngle + a * angleStep) * (CV_PI_F / 180.0f);
-                float sinA, cosA;
-                sincosf(angle, &sinA, &cosA);
-
-                float theta = thetaVal - angle;
-                if (theta < 0)
-                    theta += 2.0f * CV_PI_F;
-
-                const int n = __float2int_rn(theta * thetaScale);
-
-                const short2* r_row = r_table.ptr(n);
-                const int r_row_size = r_sizes[n];
-
-                for (int j = 0; j < r_row_size; ++j)
-                {
-                    const float2 d = saturate_cast<float2>(r_row[j]);
-
-                    const float2 dr = make_float2(d.x * cosA - d.y * sinA, d.x * sinA + d.y * cosA);
-
-                    float2 c = make_float2(p.x - dr.x, p.y - dr.y);
-                    c.x *= idp;
-                    c.y *= idp;
-
-                    if (c.x >= 0 && c.x < cols && c.y >= 0 && c.y < rows)
-                        ::atomicAdd(hist.ptr((a + 1) * (rows + 2) + __float2int_rn(c.y + 1)) + __float2int_rn(c.x + 1), 1);
-                }
-            }
-        }
-
-        void Ballard_PosRotation_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
-                                              PtrStepSz<short2> r_table, const int* r_sizes,
-                                              PtrStepi hist, int rows, int cols,
-                                              float minAngle, float angleStep, int angleRange,
-                                              float dp, int levels)
-        {
-            const dim3 block(256);
-            const dim3 grid(pointsCount);
-
-            const float idp = 1.0f / dp;
-            const float thetaScale = levels / (2.0f * CV_PI_F);
-
-            Ballard_PosRotation_calcHist<<<grid, block>>>(coordList, thetaList,
-                                                          r_table, r_sizes,
-                                                          hist, rows, cols,
-                                                          minAngle, angleStep, angleRange,
-                                                          idp, thetaScale);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-        }
-
-        __global__ void Ballard_PosRotation_findPosInHist(const PtrStepi hist, const int rows, const int cols, const int angleRange,
-                                                          float4* out, int3* votes, const int maxSize,
-                                                          const float minAngle, const float angleStep, const float dp, const int threshold)
-        {
-            const int x = blockIdx.x * blockDim.x + threadIdx.x;
-            const int y = blockIdx.y * blockDim.y + threadIdx.y;
-
-            if (x >= cols || y >= rows)
-                return;
-
-            for (int a = 0; a < angleRange; ++a)
-            {
-                const float angle = minAngle + a * angleStep;
-
-                const int prevAngleIdx = (a) * (rows + 2);
-                const int curAngleIdx = (a + 1) * (rows + 2);
-                const int nextAngleIdx = (a + 2) * (rows + 2);
-
-                const int curVotes = hist(curAngleIdx + y + 1, x + 1);
-
-                if (curVotes > threshold &&
-                    curVotes >  hist(curAngleIdx + y + 1, x) &&
-                    curVotes >= hist(curAngleIdx + y + 1, x + 2) &&
-                    curVotes >  hist(curAngleIdx + y, x + 1) &&
-                    curVotes >= hist(curAngleIdx + y + 2, x + 1) &&
-                    curVotes >  hist(prevAngleIdx + y + 1, x + 1) &&
-                    curVotes >= hist(nextAngleIdx + y + 1, x + 1))
-                {
-                    const int ind = ::atomicAdd(&g_counter, 1);
-
-                    if (ind < maxSize)
-                    {
-                        out[ind] = make_float4(x * dp, y * dp, 1.0f, angle);
-                        votes[ind] = make_int3(curVotes, 0, curVotes);
-                    }
-                }
-            }
-        }
-
-        int Ballard_PosRotation_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int angleRange, float4* out, int3* votes, int maxSize,
-                                                  float minAngle, float angleStep, float dp, int threshold)
-        {
-            void* counterPtr;
-            cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
-
-            cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
-
-            const dim3 block(32, 8);
-            const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
-
-            cudaSafeCall( cudaFuncSetCacheConfig(Ballard_PosRotation_findPosInHist, cudaFuncCachePreferL1) );
-
-            Ballard_PosRotation_findPosInHist<<<grid, block>>>(hist, rows, cols, angleRange, out, votes,
-                                                               maxSize, minAngle, angleStep, dp, threshold);
-            cudaSafeCall( cudaGetLastError() );
-
-            cudaSafeCall( cudaDeviceSynchronize() );
-
-            int totalCount;
-            cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
-
-            totalCount = ::min(totalCount, maxSize);
-
-            return totalCount;
-        }
-
         ////////////////////////////////////////////////////////////////////////
         // Guil_Full
 
diff --git a/modules/gpuimgproc/src/generalized_hough.cpp b/modules/gpuimgproc/src/generalized_hough.cpp
index 0d01301744..6adfcb7a26 100644
--- a/modules/gpuimgproc/src/generalized_hough.cpp
+++ b/modules/gpuimgproc/src/generalized_hough.cpp
@@ -47,7 +47,9 @@ using namespace cv::gpu;
 
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER) || !defined(HAVE_OPENCV_GPUARITHM)
 
-Ptr<gpu::GeneralizedHough> cv::gpu::GeneralizedHough::create(int) { throw_no_cuda(); return Ptr<GeneralizedHough>(); }
+Ptr<GeneralizedHoughBallard> cv::gpu::createGeneralizedHoughBallard() { throw_no_cuda(); return Ptr<GeneralizedHoughBallard>(); }
+
+Ptr<GeneralizedHoughGuil> cv::gpu::createGeneralizedHoughGuil() { throw_no_cuda(); return Ptr<GeneralizedHoughGuil>(); }
 
 #else /* !defined (HAVE_CUDA) */
 
@@ -67,22 +69,6 @@ namespace cv { namespace gpu { namespace cudev
                                       float dp, int levels);
         int Ballard_Pos_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int maxSize, float dp, int threshold);
 
-        void Ballard_PosScale_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
-                                           PtrStepSz<short2> r_table, const int* r_sizes,
-                                           PtrStepi hist, int rows, int cols,
-                                           float minScale, float scaleStep, int scaleRange,
-                                           float dp, int levels);
-        int Ballard_PosScale_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int scaleRange, float4* out, int3* votes, int maxSize,
-                                               float minScale, float scaleStep, float dp, int threshold);
-
-        void Ballard_PosRotation_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
-                                              PtrStepSz<short2> r_table, const int* r_sizes,
-                                              PtrStepi hist, int rows, int cols,
-                                              float minAngle, float angleStep, int angleRange,
-                                              float dp, int levels);
-        int Ballard_PosRotation_findPosInHist_gpu(PtrStepi hist, int rows, int cols, int angleRange, float4* out, int3* votes, int maxSize,
-                                                  float minAngle, float angleStep, float dp, int threshold);
-
         void Guil_Full_setTemplFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2);
         void Guil_Full_setImageFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2);
         void Guil_Full_buildTemplFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
@@ -110,278 +96,207 @@ namespace cv { namespace gpu { namespace cudev
     }
 }}}
 
+// common
+
 namespace
 {
-    /////////////////////////////////////
-    // GeneralizedHoughBase
-
-    class GeneralizedHoughBase : public gpu::GeneralizedHough
+    class GeneralizedHoughBase
     {
-    public:
-        GeneralizedHoughBase();
-
-        void setTemplate(InputArray templ, int cannyThreshold = 100, Point templCenter = Point(-1, -1));
-        void setTemplate(InputArray edges, InputArray dx, InputArray dy, Point templCenter = Point(-1, -1));
-
-        void detect(InputArray image, OutputArray positions, int cannyThreshold = 100);
-        void detect(InputArray edges, InputArray dx, InputArray dy, OutputArray positions);
-
-        void downloadResults(InputArray d_positions, OutputArray h_positions, OutputArray h_votes = noArray());
-
     protected:
-        virtual void setTemplateImpl(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Point templCenter) = 0;
-        virtual void detectImpl(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, OutputArray positions) = 0;
+        GeneralizedHoughBase();
+        virtual ~GeneralizedHoughBase() {}
+
+        void setTemplateImpl(InputArray templ, Point templCenter);
+        void setTemplateImpl(InputArray edges, InputArray dx, InputArray dy, Point templCenter);
+
+        void detectImpl(InputArray image, OutputArray positions, OutputArray votes);
+        void detectImpl(InputArray edges, InputArray dx, InputArray dy, OutputArray positions, OutputArray votes);
+
+        void buildEdgePointList(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy);
+
+        virtual void processTempl() = 0;
+        virtual void processImage() = 0;
+
+        int cannyLowThresh_;
+        int cannyHighThresh_;
+        double minDist_;
+        double dp_;
+        int maxBufferSize_;
+
+        Size templSize_;
+        Point templCenter_;
+        GpuMat templEdges_;
+        GpuMat templDx_;
+        GpuMat templDy_;
+
+        Size imageSize_;
+        GpuMat imageEdges_;
+        GpuMat imageDx_;
+        GpuMat imageDy_;
+
+        GpuMat edgePointList_;
+
+        GpuMat outBuf_;
+        int posCount_;
 
     private:
 #ifdef HAVE_OPENCV_GPUFILTERS
-        GpuMat dx_, dy_;
-        GpuMat edges_;
+        void calcEdges(InputArray src, GpuMat& edges, GpuMat& dx, GpuMat& dy);
+#endif
+
+        void filterMinDist();
+        void convertTo(OutputArray positions, OutputArray votes);
+
+#ifdef HAVE_OPENCV_GPUFILTERS
         Ptr<gpu::CannyEdgeDetector> canny_;
         Ptr<gpu::Filter> filterDx_;
         Ptr<gpu::Filter> filterDy_;
 #endif
+
+        std::vector<float4> oldPosBuf_;
+        std::vector<int3> oldVoteBuf_;
+        std::vector<float4> newPosBuf_;
+        std::vector<int3> newVoteBuf_;
+        std::vector<int> indexies_;
     };
 
     GeneralizedHoughBase::GeneralizedHoughBase()
     {
+        cannyLowThresh_ = 50;
+        cannyHighThresh_ = 100;
+        minDist_ = 1.0;
+        dp_ = 1.0;
+
+        maxBufferSize_ = 10000;
+
 #ifdef HAVE_OPENCV_GPUFILTERS
-        canny_ = gpu::createCannyEdgeDetector(50, 100);
+        canny_ = gpu::createCannyEdgeDetector(cannyLowThresh_, cannyHighThresh_);
         filterDx_ = gpu::createSobelFilter(CV_8UC1, CV_32S, 1, 0);
         filterDy_ = gpu::createSobelFilter(CV_8UC1, CV_32S, 0, 1);
 #endif
     }
 
-    void GeneralizedHoughBase::setTemplate(InputArray _templ, int cannyThreshold, Point templCenter)
+#ifdef HAVE_OPENCV_GPUFILTERS
+    void GeneralizedHoughBase::calcEdges(InputArray _src, GpuMat& edges, GpuMat& dx, GpuMat& dy)
+    {
+        GpuMat src = _src.getGpuMat();
+
+        CV_Assert( src.type() == CV_8UC1 );
+        CV_Assert( cannyLowThresh_ > 0 && cannyLowThresh_ < cannyHighThresh_ );
+
+        ensureSizeIsEnough(src.size(), CV_32SC1, dx);
+        ensureSizeIsEnough(src.size(), CV_32SC1, dy);
+
+        filterDx_->apply(src, dx);
+        filterDy_->apply(src, dy);
+
+        ensureSizeIsEnough(src.size(), CV_8UC1, edges);
+
+        canny_->setLowThreshold(cannyLowThresh_);
+        canny_->setHighThreshold(cannyHighThresh_);
+        canny_->detect(dx, dy, edges);
+    }
+#endif
+
+    void GeneralizedHoughBase::setTemplateImpl(InputArray templ, Point templCenter)
     {
 #ifndef HAVE_OPENCV_GPUFILTERS
-        (void) _templ;
-        (void) cannyThreshold;
+        (void) templ;
         (void) templCenter;
         throw_no_cuda();
 #else
-        GpuMat templ = _templ.getGpuMat();
-
-        CV_Assert( templ.type() == CV_8UC1 );
-        CV_Assert( cannyThreshold > 0 );
-
-        ensureSizeIsEnough(templ.size(), CV_32SC1, dx_);
-        ensureSizeIsEnough(templ.size(), CV_32SC1, dy_);
-
-        filterDx_->apply(templ, dx_);
-        filterDy_->apply(templ, dy_);
-
-        ensureSizeIsEnough(templ.size(), CV_8UC1, edges_);
-
-        canny_->setLowThreshold(cannyThreshold / 2);
-        canny_->setHighThreshold(cannyThreshold);
-        canny_->detect(dx_, dy_, edges_);
+        calcEdges(templ, templEdges_, templDx_, templDy_);
 
         if (templCenter == Point(-1, -1))
-            templCenter = Point(templ.cols / 2, templ.rows / 2);
+            templCenter = Point(templEdges_.cols / 2, templEdges_.rows / 2);
 
-        setTemplateImpl(edges_, dx_, dy_, templCenter);
+        templSize_ = templEdges_.size();
+        templCenter_ = templCenter;
+
+        processTempl();
 #endif
     }
 
-    void GeneralizedHoughBase::setTemplate(InputArray _edges, InputArray _dx, InputArray _dy, Point templCenter)
+    void GeneralizedHoughBase::setTemplateImpl(InputArray edges, InputArray dx, InputArray dy, Point templCenter)
     {
-        GpuMat edges = _edges.getGpuMat();
-        GpuMat dx = _dx.getGpuMat();
-        GpuMat dy = _dy.getGpuMat();
+        edges.getGpuMat().copyTo(templEdges_);
+        dx.getGpuMat().copyTo(templDx_);
+        dy.getGpuMat().copyTo(templDy_);
+
+        CV_Assert( templEdges_.type() == CV_8UC1 );
+        CV_Assert( templDx_.type() == CV_32FC1 && templDx_.size() == templEdges_.size() );
+        CV_Assert( templDy_.type() == templDx_.type() && templDy_.size() == templEdges_.size() );
 
         if (templCenter == Point(-1, -1))
-            templCenter = Point(edges.cols / 2, edges.rows / 2);
+            templCenter = Point(templEdges_.cols / 2, templEdges_.rows / 2);
 
-        setTemplateImpl(edges, dx, dy, templCenter);
-    }
-
-    void GeneralizedHoughBase::detect(InputArray _image, OutputArray positions, int cannyThreshold)
-    {
-#ifndef HAVE_OPENCV_GPUFILTERS
-        (void) _image;
-        (void) positions;
-        (void) cannyThreshold;
-        throw_no_cuda();
-#else
-        GpuMat image = _image.getGpuMat();
-
-        CV_Assert( image.type() == CV_8UC1 );
-        CV_Assert( cannyThreshold > 0 );
-
-        ensureSizeIsEnough(image.size(), CV_32SC1, dx_);
-        ensureSizeIsEnough(image.size(), CV_32SC1, dy_);
-
-        filterDx_->apply(image, dx_);
-        filterDy_->apply(image, dy_);
-
-        ensureSizeIsEnough(image.size(), CV_8UC1, edges_);
-
-        canny_->setLowThreshold(cannyThreshold / 2);
-        canny_->setHighThreshold(cannyThreshold);
-        canny_->detect(dx_, dy_, edges_);
-
-        detectImpl(edges_, dx_, dy_, positions);
-#endif
-    }
-
-    void GeneralizedHoughBase::detect(InputArray _edges, InputArray _dx, InputArray _dy, OutputArray positions)
-    {
-        GpuMat edges = _edges.getGpuMat();
-        GpuMat dx = _dx.getGpuMat();
-        GpuMat dy = _dy.getGpuMat();
-
-        detectImpl(edges, dx, dy, positions);
-    }
-
-    void GeneralizedHoughBase::downloadResults(InputArray _d_positions, OutputArray h_positions, OutputArray h_votes)
-    {
-        GpuMat d_positions = _d_positions.getGpuMat();
-
-        if (d_positions.empty())
-        {
-            h_positions.release();
-            if (h_votes.needed())
-                h_votes.release();
-            return;
-        }
-
-        CV_Assert( d_positions.rows == 2 && d_positions.type() == CV_32FC4 );
-
-        d_positions.row(0).download(h_positions);
-
-        if (h_votes.needed())
-        {
-            GpuMat d_votes(1, d_positions.cols, CV_32SC3, d_positions.ptr<int3>(1));
-            d_votes.download(h_votes);
-        }
-    }
-
-    /////////////////////////////////////
-    // GHT_Pos
-
-    template <typename T, class A> void releaseVector(std::vector<T, A>& v)
-    {
-        std::vector<T, A> empty;
-        empty.swap(v);
-    }
-
-    class GHT_Pos : public GeneralizedHoughBase
-    {
-    public:
-        GHT_Pos();
-
-    protected:
-        void setTemplateImpl(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Point templCenter);
-        void detectImpl(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, OutputArray positions);
-        void releaseImpl();
-
-        virtual void processTempl() = 0;
-        virtual void processImage() = 0;
-
-        void buildEdgePointList(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy);
-        void filterMinDist();
-        void convertTo(OutputArray positions);
-
-        int maxSize;
-        double minDist;
-
-        Size templSize;
-        Point templCenter;
-        GpuMat templEdges;
-        GpuMat templDx;
-        GpuMat templDy;
-
-        Size imageSize;
-        GpuMat imageEdges;
-        GpuMat imageDx;
-        GpuMat imageDy;
-
-        GpuMat edgePointList;
-
-        GpuMat outBuf;
-        int posCount;
-
-        std::vector<float4> oldPosBuf;
-        std::vector<int3> oldVoteBuf;
-        std::vector<float4> newPosBuf;
-        std::vector<int3> newVoteBuf;
-        std::vector<int> indexies;
-    };
-
-    GHT_Pos::GHT_Pos()
-    {
-        maxSize = 10000;
-        minDist = 1.0;
-    }
-
-    void GHT_Pos::setTemplateImpl(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Point templCenter_)
-    {
-        templSize = edges.size();
-        templCenter = templCenter_;
-
-        ensureSizeIsEnough(templSize, edges.type(), templEdges);
-        ensureSizeIsEnough(templSize, dx.type(), templDx);
-        ensureSizeIsEnough(templSize, dy.type(), templDy);
-
-        edges.copyTo(templEdges);
-        dx.copyTo(templDx);
-        dy.copyTo(templDy);
+        templSize_ = templEdges_.size();
+        templCenter_ = templCenter;
 
         processTempl();
     }
 
-    void GHT_Pos::detectImpl(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, OutputArray positions)
+    void GeneralizedHoughBase::detectImpl(InputArray image, OutputArray positions, OutputArray votes)
     {
-        imageSize = edges.size();
+#ifndef HAVE_OPENCV_GPUFILTERS
+        (void) image; // this overload's own parameters are unused on the no-gpufilters path
+        (void) positions; (void) votes;
+        throw_no_cuda();
+#else
+        calcEdges(image, imageEdges_, imageDx_, imageDy_);
 
-        ensureSizeIsEnough(imageSize, edges.type(), imageEdges);
-        ensureSizeIsEnough(imageSize, dx.type(), imageDx);
-        ensureSizeIsEnough(imageSize, dy.type(), imageDy);
+        imageSize_ = imageEdges_.size();
 
-        edges.copyTo(imageEdges);
-        dx.copyTo(imageDx);
-        dy.copyTo(imageDy);
-
-        posCount = 0;
+        posCount_ = 0;
 
         processImage();
 
-        if (posCount == 0)
+        if (posCount_ == 0)
+        {
             positions.release();
+            if (votes.needed())
+                votes.release();
+        }
         else
         {
-            if (minDist > 1)
+            if (minDist_ > 1)
                 filterMinDist();
-            convertTo(positions);
+            convertTo(positions, votes);
+        }
+#endif
+    }
+
+    void GeneralizedHoughBase::detectImpl(InputArray edges, InputArray dx, InputArray dy, OutputArray positions, OutputArray votes)
+    {
+        edges.getGpuMat().copyTo(imageEdges_);
+        dx.getGpuMat().copyTo(imageDx_);
+        dy.getGpuMat().copyTo(imageDy_);
+
+        CV_Assert( imageEdges_.type() == CV_8UC1 );
+        CV_Assert( imageDx_.type() == CV_32FC1 && imageDx_.size() == imageEdges_.size() );
+        CV_Assert( imageDy_.type() == imageDx_.type() && imageDy_.size() == imageEdges_.size() );
+
+        imageSize_ = imageEdges_.size();
+
+        posCount_ = 0;
+
+        processImage();
+
+        if (posCount_ == 0)
+        {
+            positions.release();
+            if (votes.needed())
+                votes.release();
+        }
+        else
+        {
+            if (minDist_ > 1)
+                filterMinDist();
+            convertTo(positions, votes);
         }
     }
 
-    void GHT_Pos::releaseImpl()
-    {
-        templSize = Size();
-        templCenter = Point(-1, -1);
-        templEdges.release();
-        templDx.release();
-        templDy.release();
-
-        imageSize = Size();
-        imageEdges.release();
-        imageDx.release();
-        imageDy.release();
-
-        edgePointList.release();
-
-        outBuf.release();
-        posCount = 0;
-
-        releaseVector(oldPosBuf);
-        releaseVector(oldVoteBuf);
-        releaseVector(newPosBuf);
-        releaseVector(newVoteBuf);
-        releaseVector(indexies);
-    }
-
-    void GHT_Pos::buildEdgePointList(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy)
+    void GeneralizedHoughBase::buildEdgePointList(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy)
     {
         using namespace cv::gpu::cudev::ght;
 
@@ -397,17 +312,17 @@ namespace
             0
         };
 
-        CV_Assert(edges.type() == CV_8UC1);
-        CV_Assert(dx.size() == edges.size());
-        CV_Assert(dy.type() == dx.type() && dy.size() == edges.size());
+        CV_Assert( edges.type() == CV_8UC1 );
+        CV_Assert( dx.size() == edges.size() );
+        CV_Assert( dy.type() == dx.type() && dy.size() == edges.size() );
 
         const func_t func = funcs[dx.depth()];
-        CV_Assert(func != 0);
+        CV_Assert( func != 0 );
 
-        edgePointList.cols = (int) (edgePointList.step / sizeof(int));
-        ensureSizeIsEnough(2, edges.size().area(), CV_32SC1, edgePointList);
+        edgePointList_.cols = (int) (edgePointList_.step / sizeof(int));
+        ensureSizeIsEnough(2, edges.size().area(), CV_32SC1, edgePointList_);
 
-        edgePointList.cols = func(edges, dx, dy, edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1));
+        edgePointList_.cols = func(edges, dx, dy, edgePointList_.ptr<unsigned int>(0), edgePointList_.ptr<float>(1));
     }
 
     struct IndexCmp
@@ -422,37 +337,37 @@ namespace
         }
     };
 
-    void GHT_Pos::filterMinDist()
+    void GeneralizedHoughBase::filterMinDist()
     {
-        oldPosBuf.resize(posCount);
-        oldVoteBuf.resize(posCount);
+        oldPosBuf_.resize(posCount_);
+        oldVoteBuf_.resize(posCount_);
 
-        cudaSafeCall( cudaMemcpy(&oldPosBuf[0], outBuf.ptr(0), posCount * sizeof(float4), cudaMemcpyDeviceToHost) );
-        cudaSafeCall( cudaMemcpy(&oldVoteBuf[0], outBuf.ptr(1), posCount * sizeof(int3), cudaMemcpyDeviceToHost) );
+        cudaSafeCall( cudaMemcpy(&oldPosBuf_[0], outBuf_.ptr(0), posCount_ * sizeof(float4), cudaMemcpyDeviceToHost) );
+        cudaSafeCall( cudaMemcpy(&oldVoteBuf_[0], outBuf_.ptr(1), posCount_ * sizeof(int3), cudaMemcpyDeviceToHost) );
 
-        indexies.resize(posCount);
-        for (int i = 0; i < posCount; ++i)
-            indexies[i] = i;
-        std::sort(indexies.begin(), indexies.end(), IndexCmp(&oldVoteBuf[0]));
+        indexies_.resize(posCount_);
+        for (int i = 0; i < posCount_; ++i)
+            indexies_[i] = i;
+        std::sort(indexies_.begin(), indexies_.end(), IndexCmp(&oldVoteBuf_[0]));
 
-        newPosBuf.clear();
-        newVoteBuf.clear();
-        newPosBuf.reserve(posCount);
-        newVoteBuf.reserve(posCount);
+        newPosBuf_.clear();
+        newVoteBuf_.clear();
+        newPosBuf_.reserve(posCount_);
+        newVoteBuf_.reserve(posCount_);
 
-        const int cellSize = cvRound(minDist);
-        const int gridWidth = (imageSize.width + cellSize - 1) / cellSize;
-        const int gridHeight = (imageSize.height + cellSize - 1) / cellSize;
+        const int cellSize = cvRound(minDist_);
+        const int gridWidth = (imageSize_.width + cellSize - 1) / cellSize;
+        const int gridHeight = (imageSize_.height + cellSize - 1) / cellSize;
 
         std::vector< std::vector<Point2f> > grid(gridWidth * gridHeight);
 
-        const double minDist2 = minDist * minDist;
+        const double minDist2 = minDist_ * minDist_;
 
-        for (int i = 0; i < posCount; ++i)
+        for (int i = 0; i < posCount_; ++i)
         {
-            const int ind = indexies[i];
+            const int ind = indexies_[i];
 
-            Point2f p(oldPosBuf[ind].x, oldPosBuf[ind].y);
+            Point2f p(oldPosBuf_[ind].x, oldPosBuf_[ind].y);
 
             bool good = true;
 
@@ -495,319 +410,279 @@ namespace
             {
                 grid[yCell * gridWidth + xCell].push_back(p);
 
-                newPosBuf.push_back(oldPosBuf[ind]);
-                newVoteBuf.push_back(oldVoteBuf[ind]);
+                newPosBuf_.push_back(oldPosBuf_[ind]);
+                newVoteBuf_.push_back(oldVoteBuf_[ind]);
             }
         }
 
-        posCount = static_cast<int>(newPosBuf.size());
-        cudaSafeCall( cudaMemcpy(outBuf.ptr(0), &newPosBuf[0], posCount * sizeof(float4), cudaMemcpyHostToDevice) );
-        cudaSafeCall( cudaMemcpy(outBuf.ptr(1), &newVoteBuf[0], posCount * sizeof(int3), cudaMemcpyHostToDevice) );
+        posCount_ = static_cast<int>(newPosBuf_.size());
+        cudaSafeCall( cudaMemcpy(outBuf_.ptr(0), &newPosBuf_[0], posCount_ * sizeof(float4), cudaMemcpyHostToDevice) );
+        cudaSafeCall( cudaMemcpy(outBuf_.ptr(1), &newVoteBuf_[0], posCount_ * sizeof(int3), cudaMemcpyHostToDevice) );
     }
 
-    void GHT_Pos::convertTo(OutputArray positions)
+    void GeneralizedHoughBase::convertTo(OutputArray positions, OutputArray votes)
     {
-        ensureSizeIsEnough(2, posCount, CV_32FC4, positions);
-        GpuMat(2, posCount, CV_32FC4, outBuf.data, outBuf.step).copyTo(positions);
+        ensureSizeIsEnough(1, posCount_, CV_32FC4, positions);
+        GpuMat(1, posCount_, CV_32FC4, outBuf_.ptr(0), outBuf_.step).copyTo(positions);
+
+        if (votes.needed())
+        {
+            ensureSizeIsEnough(1, posCount_, CV_32SC3, votes); // row 1 of outBuf_ holds int3 vote triples
+            GpuMat(1, posCount_, CV_32SC3, outBuf_.ptr(1), outBuf_.step).copyTo(votes); // header type must match the int3 storage
+        }
     }
+}
 
-    /////////////////////////////////////
-    // POSITION Ballard
+// GeneralizedHoughBallard
 
-    class GHT_Ballard_Pos : public GHT_Pos
+namespace
+{
+    class GeneralizedHoughBallardImpl : public GeneralizedHoughBallard, private GeneralizedHoughBase
     {
     public:
-        AlgorithmInfo* info() const;
+        GeneralizedHoughBallardImpl();
 
-        GHT_Ballard_Pos();
+        void setTemplate(InputArray templ, Point templCenter) { setTemplateImpl(templ, templCenter); }
+        void setTemplate(InputArray edges, InputArray dx, InputArray dy, Point templCenter) { setTemplateImpl(edges, dx, dy, templCenter); }
 
-    protected:
-        void releaseImpl();
+        void detect(InputArray image, OutputArray positions, OutputArray votes) { detectImpl(image, positions, votes); }
+        void detect(InputArray edges, InputArray dx, InputArray dy, OutputArray positions, OutputArray votes) { detectImpl(edges, dx, dy, positions, votes); }
 
+        void setCannyLowThresh(int cannyLowThresh) { cannyLowThresh_ = cannyLowThresh; }
+        int getCannyLowThresh() const { return cannyLowThresh_; }
+
+        void setCannyHighThresh(int cannyHighThresh) { cannyHighThresh_ = cannyHighThresh; }
+        int getCannyHighThresh() const { return cannyHighThresh_; }
+
+        void setMinDist(double minDist) { minDist_ = minDist; }
+        double getMinDist() const { return minDist_; }
+
+        void setDp(double dp) { dp_ = dp; }
+        double getDp() const { return dp_; }
+
+        void setMaxBufferSize(int maxBufferSize) { maxBufferSize_ = maxBufferSize; }
+        int getMaxBufferSize() const { return maxBufferSize_; }
+
+        void setLevels(int levels) { levels_ = levels; }
+        int getLevels() const { return levels_; }
+
+        void setVotesThreshold(int votesThreshold) { votesThreshold_ = votesThreshold; }
+        int getVotesThreshold() const { return votesThreshold_; }
+
+    private:
         void processTempl();
         void processImage();
 
-        virtual void calcHist();
-        virtual void findPosInHist();
+        void calcHist();
+        void findPosInHist();
 
-        int levels;
-        int votesThreshold;
-        double dp;
+        int levels_;
+        int votesThreshold_;
 
-        GpuMat r_table;
-        GpuMat r_sizes;
+        GpuMat r_table_;
+        GpuMat r_sizes_;
 
-        GpuMat hist;
+        GpuMat hist_;
     };
 
-    CV_INIT_ALGORITHM(GHT_Ballard_Pos, "GeneralizedHough_GPU.POSITION",
-                      obj.info()->addParam(obj, "maxSize", obj.maxSize, false, 0, 0,
-                                           "Maximal size of inner buffers.");
-                      obj.info()->addParam(obj, "minDist", obj.minDist, false, 0, 0,
-                                           "Minimum distance between the centers of the detected objects.");
-                      obj.info()->addParam(obj, "levels", obj.levels, false, 0, 0,
-                                           "R-Table levels.");
-                      obj.info()->addParam(obj, "votesThreshold", obj.votesThreshold, false, 0, 0,
-                                           "The accumulator threshold for the template centers at the detection stage. The smaller it is, the more false positions may be detected.");
-                      obj.info()->addParam(obj, "dp", obj.dp, false, 0, 0,
-                                           "Inverse ratio of the accumulator resolution to the image resolution."));
-
-    GHT_Ballard_Pos::GHT_Ballard_Pos()
+    GeneralizedHoughBallardImpl::GeneralizedHoughBallardImpl()
     {
-        levels = 360;
-        votesThreshold = 100;
-        dp = 1.0;
+        levels_ = 360;
+        votesThreshold_ = 100;
     }
 
-    void GHT_Ballard_Pos::releaseImpl()
-    {
-        GHT_Pos::releaseImpl();
-
-        r_table.release();
-        r_sizes.release();
-
-        hist.release();
-    }
-
-    void GHT_Ballard_Pos::processTempl()
+    void GeneralizedHoughBallardImpl::processTempl()
     {
         using namespace cv::gpu::cudev::ght;
 
-        CV_Assert(levels > 0);
+        CV_Assert( levels_ > 0 );
 
-        buildEdgePointList(templEdges, templDx, templDy);
+        buildEdgePointList(templEdges_, templDx_, templDy_);
 
-        ensureSizeIsEnough(levels + 1, maxSize, CV_16SC2, r_table);
-        ensureSizeIsEnough(1, levels + 1, CV_32SC1, r_sizes);
-        r_sizes.setTo(Scalar::all(0));
+        ensureSizeIsEnough(levels_ + 1, maxBufferSize_, CV_16SC2, r_table_);
+        ensureSizeIsEnough(1, levels_ + 1, CV_32SC1, r_sizes_);
+        r_sizes_.setTo(Scalar::all(0));
 
-        if (edgePointList.cols > 0)
+        if (edgePointList_.cols > 0)
         {
-            buildRTable_gpu(edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1), edgePointList.cols,
-                            r_table, r_sizes.ptr<int>(), make_short2(templCenter.x, templCenter.y), levels);
-            gpu::min(r_sizes, maxSize, r_sizes);
+            buildRTable_gpu(edgePointList_.ptr<unsigned int>(0), edgePointList_.ptr<float>(1), edgePointList_.cols,
+                            r_table_, r_sizes_.ptr<int>(), make_short2(templCenter_.x, templCenter_.y), levels_);
+            gpu::min(r_sizes_, maxBufferSize_, r_sizes_);
         }
     }
 
-    void GHT_Ballard_Pos::processImage()
+    void GeneralizedHoughBallardImpl::processImage()
     {
         calcHist();
         findPosInHist();
     }
 
-    void GHT_Ballard_Pos::calcHist()
+    void GeneralizedHoughBallardImpl::calcHist()
     {
         using namespace cv::gpu::cudev::ght;
 
-        CV_Assert(levels > 0 && r_table.rows == (levels + 1) && r_sizes.cols == (levels + 1));
-        CV_Assert(dp > 0.0);
+        CV_Assert( levels_ > 0 && r_table_.rows == (levels_ + 1) && r_sizes_.cols == (levels_ + 1) );
+        CV_Assert( dp_ > 0.0);
 
-        const double idp = 1.0 / dp;
+        const double idp = 1.0 / dp_;
 
-        buildEdgePointList(imageEdges, imageDx, imageDy);
+        buildEdgePointList(imageEdges_, imageDx_, imageDy_);
 
-        ensureSizeIsEnough(cvCeil(imageSize.height * idp) + 2, cvCeil(imageSize.width * idp) + 2, CV_32SC1, hist);
-        hist.setTo(Scalar::all(0));
+        ensureSizeIsEnough(cvCeil(imageSize_.height * idp) + 2, cvCeil(imageSize_.width * idp) + 2, CV_32SC1, hist_);
+        hist_.setTo(Scalar::all(0));
 
-        if (edgePointList.cols > 0)
+        if (edgePointList_.cols > 0)
         {
-            Ballard_Pos_calcHist_gpu(edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1), edgePointList.cols,
-                                     r_table, r_sizes.ptr<int>(),
-                                     hist,
-                                     (float)dp, levels);
+            Ballard_Pos_calcHist_gpu(edgePointList_.ptr<unsigned int>(0), edgePointList_.ptr<float>(1), edgePointList_.cols,
+                                     r_table_, r_sizes_.ptr<int>(),
+                                     hist_,
+                                     (float)dp_, levels_);
         }
     }
 
-    void GHT_Ballard_Pos::findPosInHist()
+    void GeneralizedHoughBallardImpl::findPosInHist()
     {
         using namespace cv::gpu::cudev::ght;
 
-        CV_Assert(votesThreshold > 0);
+        CV_Assert( votesThreshold_ > 0 );
 
-        ensureSizeIsEnough(2, maxSize, CV_32FC4, outBuf);
+        ensureSizeIsEnough(2, maxBufferSize_, CV_32FC4, outBuf_);
 
-        posCount = Ballard_Pos_findPosInHist_gpu(hist, outBuf.ptr<float4>(0), outBuf.ptr<int3>(1), maxSize, (float)dp, votesThreshold);
+        posCount_ = Ballard_Pos_findPosInHist_gpu(hist_, outBuf_.ptr<float4>(0), outBuf_.ptr<int3>(1), maxBufferSize_, (float)dp_, votesThreshold_);
     }
+}
 
-    /////////////////////////////////////
-    // POSITION & SCALE
+Ptr<GeneralizedHoughBallard> cv::gpu::createGeneralizedHoughBallard()
+{
+    return new GeneralizedHoughBallardImpl;
+}
 
-    class GHT_Ballard_PosScale : public GHT_Ballard_Pos
+// GeneralizedHoughGuil
+
+namespace
+{
+    class GeneralizedHoughGuilImpl : public GeneralizedHoughGuil, private GeneralizedHoughBase
     {
     public:
-        AlgorithmInfo* info() const;
+        GeneralizedHoughGuilImpl();
 
-        GHT_Ballard_PosScale();
+        void setTemplate(InputArray templ, Point templCenter) { setTemplateImpl(templ, templCenter); }
+        void setTemplate(InputArray edges, InputArray dx, InputArray dy, Point templCenter) { setTemplateImpl(edges, dx, dy, templCenter); }
 
-    protected:
-        void calcHist();
-        void findPosInHist();
+        void detect(InputArray image, OutputArray positions, OutputArray votes) { detectImpl(image, positions, votes); }
+        void detect(InputArray edges, InputArray dx, InputArray dy, OutputArray positions, OutputArray votes) { detectImpl(edges, dx, dy, positions, votes); }
 
-        double minScale;
-        double maxScale;
-        double scaleStep;
-    };
+        void setCannyLowThresh(int cannyLowThresh) { cannyLowThresh_ = cannyLowThresh; }
+        int getCannyLowThresh() const { return cannyLowThresh_; }
 
-    CV_INIT_ALGORITHM(GHT_Ballard_PosScale, "GeneralizedHough_GPU.POSITION_SCALE",
-                      obj.info()->addParam(obj, "maxSize", obj.maxSize, false, 0, 0,
-                                           "Maximal size of inner buffers.");
-                      obj.info()->addParam(obj, "minDist", obj.minDist, false, 0, 0,
-                                           "Minimum distance between the centers of the detected objects.");
-                      obj.info()->addParam(obj, "levels", obj.levels, false, 0, 0,
-                                           "R-Table levels.");
-                      obj.info()->addParam(obj, "votesThreshold", obj.votesThreshold, false, 0, 0,
-                                           "The accumulator threshold for the template centers at the detection stage. The smaller it is, the more false positions may be detected.");
-                      obj.info()->addParam(obj, "dp", obj.dp, false, 0, 0,
-                                           "Inverse ratio of the accumulator resolution to the image resolution.");
-                      obj.info()->addParam(obj, "minScale", obj.minScale, false, 0, 0,
-                                           "Minimal scale to detect.");
-                      obj.info()->addParam(obj, "maxScale", obj.maxScale, false, 0, 0,
-                                           "Maximal scale to detect.");
-                      obj.info()->addParam(obj, "scaleStep", obj.scaleStep, false, 0, 0,
-                                           "Scale step."));
+        void setCannyHighThresh(int cannyHighThresh) { cannyHighThresh_ = cannyHighThresh; }
+        int getCannyHighThresh() const { return cannyHighThresh_; }
 
-    GHT_Ballard_PosScale::GHT_Ballard_PosScale()
-    {
-        minScale = 0.5;
-        maxScale = 2.0;
-        scaleStep = 0.05;
-    }
+        void setMinDist(double minDist) { minDist_ = minDist; }
+        double getMinDist() const { return minDist_; }
 
-    void GHT_Ballard_PosScale::calcHist()
-    {
-        using namespace cv::gpu::cudev::ght;
+        void setDp(double dp) { dp_ = dp; }
+        double getDp() const { return dp_; }
 
-        CV_Assert(levels > 0 && r_table.rows == (levels + 1) && r_sizes.cols == (levels + 1));
-        CV_Assert(dp > 0.0);
-        CV_Assert(minScale > 0.0 && minScale < maxScale);
-        CV_Assert(scaleStep > 0.0);
+        void setMaxBufferSize(int maxBufferSize) { maxBufferSize_ = maxBufferSize; }
+        int getMaxBufferSize() const { return maxBufferSize_; }
 
-        const double idp = 1.0 / dp;
-        const int scaleRange = cvCeil((maxScale - minScale) / scaleStep);
-        const int rows = cvCeil(imageSize.height * idp);
-        const int cols = cvCeil(imageSize.width * idp);
+        void setXi(double xi) { xi_ = xi; }
+        double getXi() const { return xi_; }
 
-        buildEdgePointList(imageEdges, imageDx, imageDy);
+        void setLevels(int levels) { levels_ = levels; }
+        int getLevels() const { return levels_; }
 
-        ensureSizeIsEnough((scaleRange + 2) * (rows + 2), cols + 2, CV_32SC1, hist);
-        hist.setTo(Scalar::all(0));
+        void setAngleEpsilon(double angleEpsilon) { angleEpsilon_ = angleEpsilon; }
+        double getAngleEpsilon() const { return angleEpsilon_; }
 
-        if (edgePointList.cols > 0)
+        void setMinAngle(double minAngle) { minAngle_ = minAngle; }
+        double getMinAngle() const { return minAngle_; }
+
+        void setMaxAngle(double maxAngle) { maxAngle_ = maxAngle; }
+        double getMaxAngle() const { return maxAngle_; }
+
+        void setAngleStep(double angleStep) { angleStep_ = angleStep; }
+        double getAngleStep() const { return angleStep_; }
+
+        void setAngleThresh(int angleThresh) { angleThresh_ = angleThresh; }
+        int getAngleThresh() const { return angleThresh_; }
+
+        void setMinScale(double minScale) { minScale_ = minScale; }
+        double getMinScale() const { return minScale_; }
+
+        void setMaxScale(double maxScale) { maxScale_ = maxScale; }
+        double getMaxScale() const { return maxScale_; }
+
+        void setScaleStep(double scaleStep) { scaleStep_ = scaleStep; }
+        double getScaleStep() const { return scaleStep_; }
+
+        void setScaleThresh(int scaleThresh) { scaleThresh_ = scaleThresh; }
+        int getScaleThresh() const { return scaleThresh_; }
+
+        void setPosThresh(int posThresh) { posThresh_ = posThresh; }
+        int getPosThresh() const { return posThresh_; }
+
+    private:
+        void processTempl();
+        void processImage();
+
+        double xi_;
+        int levels_;
+        double angleEpsilon_;
+
+        double minAngle_;
+        double maxAngle_;
+        double angleStep_;
+        int angleThresh_;
+
+        double minScale_;
+        double maxScale_;
+        double scaleStep_;
+        int scaleThresh_;
+
+        int posThresh_;
+
+        struct Feature
         {
-            Ballard_PosScale_calcHist_gpu(edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1), edgePointList.cols,
-                                          r_table, r_sizes.ptr<int>(),
-                                          hist, rows, cols,
-                                          (float)minScale, (float)scaleStep, scaleRange, (float)dp, levels);
-        }
-    }
+            GpuMat p1_pos;
+            GpuMat p1_theta;
+            GpuMat p2_pos;
 
-    void GHT_Ballard_PosScale::findPosInHist()
-    {
-        using namespace cv::gpu::cudev::ght;
+            GpuMat d12;
 
-        CV_Assert(votesThreshold > 0);
+            GpuMat r1;
+            GpuMat r2;
 
-        const double idp = 1.0 / dp;
-        const int scaleRange = cvCeil((maxScale - minScale) / scaleStep);
-        const int rows = cvCeil(imageSize.height * idp);
-        const int cols = cvCeil(imageSize.width * idp);
+            GpuMat sizes;
+            int maxSize;
 
-        ensureSizeIsEnough(2, maxSize, CV_32FC4, outBuf);
+            void create(int levels, int maxCapacity, bool isTempl);
+        };
 
-        posCount =  Ballard_PosScale_findPosInHist_gpu(hist, rows, cols, scaleRange, outBuf.ptr<float4>(0), outBuf.ptr<int3>(1), maxSize, (float)minScale, (float)scaleStep, (float)dp, votesThreshold);
-    }
+        typedef void (*set_func_t)(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2);
+        typedef void (*build_func_t)(const unsigned int* coordList, const float* thetaList, int pointsCount,
+                                     int* sizes, int maxSize,
+                                     float xi, float angleEpsilon, int levels,
+                                     float2 center, float maxDist);
 
-    /////////////////////////////////////
-    // POSITION & Rotation
+        void buildFeatureList(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Feature& features,
+                              set_func_t set_func, build_func_t build_func, bool isTempl, Point2d center = Point2d());
 
-    class GHT_Ballard_PosRotation : public GHT_Ballard_Pos
-    {
-    public:
-        AlgorithmInfo* info() const;
+        void calcOrientation();
+        void calcScale(double angle);
+        void calcPosition(double angle, int angleVotes, double scale, int scaleVotes);
 
-        GHT_Ballard_PosRotation();
+        Feature templFeatures_;
+        Feature imageFeatures_;
 
-    protected:
-        void calcHist();
-        void findPosInHist();
+        std::vector< std::pair<double, int> > angles_;
+        std::vector< std::pair<double, int> > scales_;
 
-        double minAngle;
-        double maxAngle;
-        double angleStep;
+        GpuMat hist_;
+        std::vector<int> h_buf_;
     };
 
-    CV_INIT_ALGORITHM(GHT_Ballard_PosRotation, "GeneralizedHough_GPU.POSITION_ROTATION",
-                      obj.info()->addParam(obj, "maxSize", obj.maxSize, false, 0, 0,
-                                           "Maximal size of inner buffers.");
-                      obj.info()->addParam(obj, "minDist", obj.minDist, false, 0, 0,
-                                           "Minimum distance between the centers of the detected objects.");
-                      obj.info()->addParam(obj, "levels", obj.levels, false, 0, 0,
-                                           "R-Table levels.");
-                      obj.info()->addParam(obj, "votesThreshold", obj.votesThreshold, false, 0, 0,
-                                           "The accumulator threshold for the template centers at the detection stage. The smaller it is, the more false positions may be detected.");
-                      obj.info()->addParam(obj, "dp", obj.dp, false, 0, 0,
-                                           "Inverse ratio of the accumulator resolution to the image resolution.");
-                      obj.info()->addParam(obj, "minAngle", obj.minAngle, false, 0, 0,
-                                           "Minimal rotation angle to detect in degrees.");
-                      obj.info()->addParam(obj, "maxAngle", obj.maxAngle, false, 0, 0,
-                                           "Maximal rotation angle to detect in degrees.");
-                      obj.info()->addParam(obj, "angleStep", obj.angleStep, false, 0, 0,
-                                           "Angle step in degrees."));
-
-    GHT_Ballard_PosRotation::GHT_Ballard_PosRotation()
-    {
-        minAngle = 0.0;
-        maxAngle = 360.0;
-        angleStep = 1.0;
-    }
-
-    void GHT_Ballard_PosRotation::calcHist()
-    {
-        using namespace cv::gpu::cudev::ght;
-
-        CV_Assert(levels > 0 && r_table.rows == (levels + 1) && r_sizes.cols == (levels + 1));
-        CV_Assert(dp > 0.0);
-        CV_Assert(minAngle >= 0.0 && minAngle < maxAngle && maxAngle <= 360.0);
-        CV_Assert(angleStep > 0.0 && angleStep < 360.0);
-
-        const double idp = 1.0 / dp;
-        const int angleRange = cvCeil((maxAngle - minAngle) / angleStep);
-        const int rows = cvCeil(imageSize.height * idp);
-        const int cols = cvCeil(imageSize.width * idp);
-
-        buildEdgePointList(imageEdges, imageDx, imageDy);
-
-        ensureSizeIsEnough((angleRange + 2) * (rows + 2), cols + 2, CV_32SC1, hist);
-        hist.setTo(Scalar::all(0));
-
-        if (edgePointList.cols > 0)
-        {
-            Ballard_PosRotation_calcHist_gpu(edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1), edgePointList.cols,
-                                             r_table, r_sizes.ptr<int>(),
-                                             hist, rows, cols,
-                                             (float)minAngle, (float)angleStep, angleRange, (float)dp, levels);
-        }
-    }
-
-    void GHT_Ballard_PosRotation::findPosInHist()
-    {
-        using namespace cv::gpu::cudev::ght;
-
-        CV_Assert(votesThreshold > 0);
-
-        const double idp = 1.0 / dp;
-        const int angleRange = cvCeil((maxAngle - minAngle) / angleStep);
-        const int rows = cvCeil(imageSize.height * idp);
-        const int cols = cvCeil(imageSize.width * idp);
-
-        ensureSizeIsEnough(2, maxSize, CV_32FC4, outBuf);
-
-        posCount = Ballard_PosRotation_findPosInHist_gpu(hist, rows, cols, angleRange, outBuf.ptr<float4>(0), outBuf.ptr<int3>(1), maxSize, (float)minAngle, (float)angleStep, (float)dp, votesThreshold);
-    }
-
-    /////////////////////////////////////////
-    // POSITION & SCALE & ROTATION
-
     double toRad(double a)
     {
         return a * CV_PI / 180.0;
@@ -830,211 +705,94 @@ namespace
         return (fabs(clampAngle(a - b)) <= eps);
     }
 
-    class GHT_Guil_Full : public GHT_Pos
+    GeneralizedHoughGuilImpl::GeneralizedHoughGuilImpl()
     {
-    public:
-        AlgorithmInfo* info() const;
+        maxBufferSize_ = 1000;
 
-        GHT_Guil_Full();
+        xi_ = 90.0;
+        levels_ = 360;
+        angleEpsilon_ = 1.0;
 
-    protected:
-        void releaseImpl();
+        minAngle_ = 0.0;
+        maxAngle_ = 360.0;
+        angleStep_ = 1.0;
+        angleThresh_ = 15000;
 
-        void processTempl();
-        void processImage();
+        minScale_ = 0.5;
+        maxScale_ = 2.0;
+        scaleStep_ = 0.05;
+        scaleThresh_ = 1000;
 
-        struct Feature
-        {
-            GpuMat p1_pos;
-            GpuMat p1_theta;
-            GpuMat p2_pos;
-
-            GpuMat d12;
-
-            GpuMat r1;
-            GpuMat r2;
-
-            GpuMat sizes;
-            int maxSize;
-
-            void create(int levels, int maxCapacity, bool isTempl);
-            void release();
-        };
-
-        typedef void (*set_func_t)(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2);
-        typedef void (*build_func_t)(const unsigned int* coordList, const float* thetaList, int pointsCount,
-                                     int* sizes, int maxSize,
-                                     float xi, float angleEpsilon, int levels,
-                                     float2 center, float maxDist);
-
-        void buildFeatureList(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Feature& features,
-                              set_func_t set_func, build_func_t build_func, bool isTempl, Point2d center = Point2d());
-
-        void calcOrientation();
-        void calcScale(double angle);
-        void calcPosition(double angle, int angleVotes, double scale, int scaleVotes);
-
-        double xi;
-        int levels;
-        double angleEpsilon;
-
-        double minAngle;
-        double maxAngle;
-        double angleStep;
-        int angleThresh;
-
-        double minScale;
-        double maxScale;
-        double scaleStep;
-        int scaleThresh;
-
-        double dp;
-        int posThresh;
-
-        Feature templFeatures;
-        Feature imageFeatures;
-
-        std::vector< std::pair<double, int> > angles;
-        std::vector< std::pair<double, int> > scales;
-
-        GpuMat hist;
-        std::vector<int> h_buf;
-    };
-
-    CV_INIT_ALGORITHM(GHT_Guil_Full, "GeneralizedHough_GPU.POSITION_SCALE_ROTATION",
-                      obj.info()->addParam(obj, "minDist", obj.minDist, false, 0, 0,
-                                           "Minimum distance between the centers of the detected objects.");
-                      obj.info()->addParam(obj, "maxSize", obj.maxSize, false, 0, 0,
-                                           "Maximal size of inner buffers.");
-                      obj.info()->addParam(obj, "xi", obj.xi, false, 0, 0,
-                                           "Angle difference in degrees between two points in feature.");
-                      obj.info()->addParam(obj, "levels", obj.levels, false, 0, 0,
-                                           "Feature table levels.");
-                      obj.info()->addParam(obj, "angleEpsilon", obj.angleEpsilon, false, 0, 0,
-                                           "Maximal difference between angles that treated as equal.");
-                      obj.info()->addParam(obj, "minAngle", obj.minAngle, false, 0, 0,
-                                           "Minimal rotation angle to detect in degrees.");
-                      obj.info()->addParam(obj, "maxAngle", obj.maxAngle, false, 0, 0,
-                                           "Maximal rotation angle to detect in degrees.");
-                      obj.info()->addParam(obj, "angleStep", obj.angleStep, false, 0, 0,
-                                           "Angle step in degrees.");
-                      obj.info()->addParam(obj, "angleThresh", obj.angleThresh, false, 0, 0,
-                                           "Angle threshold.");
-                      obj.info()->addParam(obj, "minScale", obj.minScale, false, 0, 0,
-                                           "Minimal scale to detect.");
-                      obj.info()->addParam(obj, "maxScale", obj.maxScale, false, 0, 0,
-                                           "Maximal scale to detect.");
-                      obj.info()->addParam(obj, "scaleStep", obj.scaleStep, false, 0, 0,
-                                           "Scale step.");
-                      obj.info()->addParam(obj, "scaleThresh", obj.scaleThresh, false, 0, 0,
-                                           "Scale threshold.");
-                      obj.info()->addParam(obj, "dp", obj.dp, false, 0, 0,
-                                           "Inverse ratio of the accumulator resolution to the image resolution.");
-                      obj.info()->addParam(obj, "posThresh", obj.posThresh, false, 0, 0,
-                                           "Position threshold."));
-
-    GHT_Guil_Full::GHT_Guil_Full()
-    {
-        maxSize = 1000;
-        xi = 90.0;
-        levels = 360;
-        angleEpsilon = 1.0;
-
-        minAngle = 0.0;
-        maxAngle = 360.0;
-        angleStep = 1.0;
-        angleThresh = 15000;
-
-        minScale = 0.5;
-        maxScale = 2.0;
-        scaleStep = 0.05;
-        scaleThresh = 1000;
-
-        dp = 1.0;
-        posThresh = 100;
+        posThresh_ = 100;
     }
 
-    void GHT_Guil_Full::releaseImpl()
-    {
-        GHT_Pos::releaseImpl();
-
-        templFeatures.release();
-        imageFeatures.release();
-
-        releaseVector(angles);
-        releaseVector(scales);
-
-        hist.release();
-        releaseVector(h_buf);
-    }
-
-    void GHT_Guil_Full::processTempl()
+    void GeneralizedHoughGuilImpl::processTempl()
     {
         using namespace cv::gpu::cudev::ght;
 
-        buildFeatureList(templEdges, templDx, templDy, templFeatures,
+        buildFeatureList(templEdges_, templDx_, templDy_, templFeatures_,
             Guil_Full_setTemplFeatures, Guil_Full_buildTemplFeatureList_gpu,
-            true, templCenter);
+            true, templCenter_);
 
-        h_buf.resize(templFeatures.sizes.cols);
-        cudaSafeCall( cudaMemcpy(&h_buf[0], templFeatures.sizes.data, h_buf.size() * sizeof(int), cudaMemcpyDeviceToHost) );
-        templFeatures.maxSize = *max_element(h_buf.begin(), h_buf.end());
+        h_buf_.resize(templFeatures_.sizes.cols);
+        cudaSafeCall( cudaMemcpy(&h_buf_[0], templFeatures_.sizes.data, h_buf_.size() * sizeof(int), cudaMemcpyDeviceToHost) );
+        templFeatures_.maxSize = *std::max_element(h_buf_.begin(), h_buf_.end());
     }
 
-    void GHT_Guil_Full::processImage()
+    void GeneralizedHoughGuilImpl::processImage()
     {
         using namespace cv::gpu::cudev::ght;
 
-        CV_Assert(levels > 0);
-        CV_Assert(templFeatures.sizes.cols == levels + 1);
-        CV_Assert(minAngle >= 0.0 && minAngle < maxAngle && maxAngle <= 360.0);
-        CV_Assert(angleStep > 0.0 && angleStep < 360.0);
-        CV_Assert(angleThresh > 0);
-        CV_Assert(minScale > 0.0 && minScale < maxScale);
-        CV_Assert(scaleStep > 0.0);
-        CV_Assert(scaleThresh > 0);
-        CV_Assert(dp > 0.0);
-        CV_Assert(posThresh > 0);
+        CV_Assert( levels_ > 0 );
+        CV_Assert( templFeatures_.sizes.cols == levels_ + 1 );
+        CV_Assert( minAngle_ >= 0.0 && minAngle_ < maxAngle_ && maxAngle_ <= 360.0 );
+        CV_Assert( angleStep_ > 0.0 && angleStep_ < 360.0 );
+        CV_Assert( angleThresh_ > 0 );
+        CV_Assert( minScale_ > 0.0 && minScale_ < maxScale_ );
+        CV_Assert( scaleStep_ > 0.0 );
+        CV_Assert( scaleThresh_ > 0 );
+        CV_Assert( dp_ > 0.0 );
+        CV_Assert( posThresh_ > 0 );
 
-        const double iAngleStep = 1.0 / angleStep;
-        const int angleRange = cvCeil((maxAngle - minAngle) * iAngleStep);
+        const double iAngleStep = 1.0 / angleStep_;
+        const int angleRange = cvCeil((maxAngle_ - minAngle_) * iAngleStep);
 
-        const double iScaleStep = 1.0 / scaleStep;
-        const int scaleRange = cvCeil((maxScale - minScale) * iScaleStep);
+        const double iScaleStep = 1.0 / scaleStep_;
+        const int scaleRange = cvCeil((maxScale_ - minScale_) * iScaleStep);
 
-        const double idp = 1.0 / dp;
-        const int histRows = cvCeil(imageSize.height * idp);
-        const int histCols = cvCeil(imageSize.width * idp);
+        const double idp = 1.0 / dp_;
+        const int histRows = cvCeil(imageSize_.height * idp);
+        const int histCols = cvCeil(imageSize_.width * idp);
 
-        ensureSizeIsEnough(histRows + 2, std::max(angleRange + 1, std::max(scaleRange + 1, histCols + 2)), CV_32SC1, hist);
-        h_buf.resize(std::max(angleRange + 1, scaleRange + 1));
+        ensureSizeIsEnough(histRows + 2, std::max(angleRange + 1, std::max(scaleRange + 1, histCols + 2)), CV_32SC1, hist_);
+        h_buf_.resize(std::max(angleRange + 1, scaleRange + 1));
 
-        ensureSizeIsEnough(2, maxSize, CV_32FC4, outBuf);
+        ensureSizeIsEnough(2, maxBufferSize_, CV_32FC4, outBuf_);
 
-        buildFeatureList(imageEdges, imageDx, imageDy, imageFeatures,
+        buildFeatureList(imageEdges_, imageDx_, imageDy_, imageFeatures_,
             Guil_Full_setImageFeatures, Guil_Full_buildImageFeatureList_gpu,
             false);
 
         calcOrientation();
 
-        for (size_t i = 0; i < angles.size(); ++i)
+        for (size_t i = 0; i < angles_.size(); ++i)
         {
-            const double angle = angles[i].first;
-            const int angleVotes = angles[i].second;
+            const double angle = angles_[i].first;
+            const int angleVotes = angles_[i].second;
 
             calcScale(angle);
 
-            for (size_t j = 0; j < scales.size(); ++j)
+            for (size_t j = 0; j < scales_.size(); ++j)
             {
-                const double scale = scales[j].first;
-                const int scaleVotes = scales[j].second;
+                const double scale = scales_[j].first;
+                const int scaleVotes = scales_[j].second;
 
                 calcPosition(angle, angleVotes, scale, scaleVotes);
             }
         }
     }
 
-    void GHT_Guil_Full::Feature::create(int levels, int maxCapacity, bool isTempl)
+    void GeneralizedHoughGuilImpl::Feature::create(int levels, int maxCapacity, bool isTempl)
     {
         if (!isTempl)
         {
@@ -1058,128 +816,91 @@ namespace
         maxSize = 0;
     }
 
-    void GHT_Guil_Full::Feature::release()
+    void GeneralizedHoughGuilImpl::buildFeatureList(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Feature& features,
+                                                    set_func_t set_func, build_func_t build_func, bool isTempl, Point2d center)
     {
-        p1_pos.release();
-        p1_theta.release();
-        p2_pos.release();
+        CV_Assert( levels_ > 0 );
 
-        d12.release();
+        const double maxDist = sqrt((double) templSize_.width * templSize_.width + templSize_.height * templSize_.height) * maxScale_;
 
-        r1.release();
-        r2.release();
-
-        sizes.release();
-
-        maxSize = 0;
-    }
-
-    void GHT_Guil_Full::buildFeatureList(const GpuMat& edges, const GpuMat& dx, const GpuMat& dy, Feature& features,
-                                         set_func_t set_func, build_func_t build_func, bool isTempl, Point2d center)
-    {
-        CV_Assert(levels > 0);
-
-        const double maxDist = sqrt((double) templSize.width * templSize.width + templSize.height * templSize.height) * maxScale;
-
-        features.create(levels, maxSize, isTempl);
+        features.create(levels_, maxBufferSize_, isTempl);
         set_func(features.p1_pos, features.p1_theta, features.p2_pos, features.d12, features.r1, features.r2);
 
         buildEdgePointList(edges, dx, dy);
 
-        if (edgePointList.cols > 0)
+        if (edgePointList_.cols > 0)
         {
-            build_func(edgePointList.ptr<unsigned int>(0), edgePointList.ptr<float>(1), edgePointList.cols,
-                features.sizes.ptr<int>(), maxSize, (float)xi, (float)angleEpsilon, levels, make_float2((float)center.x, (float)center.y), (float)maxDist);
+            build_func(edgePointList_.ptr<unsigned int>(0), edgePointList_.ptr<float>(1), edgePointList_.cols,
+                features.sizes.ptr<int>(), maxBufferSize_, (float)xi_, (float)angleEpsilon_, levels_, make_float2((float)center.x, (float)center.y), (float)maxDist);
         }
     }
 
-    void GHT_Guil_Full::calcOrientation()
+    void GeneralizedHoughGuilImpl::calcOrientation()
     {
         using namespace cv::gpu::cudev::ght;
 
-        const double iAngleStep = 1.0 / angleStep;
-        const int angleRange = cvCeil((maxAngle - minAngle) * iAngleStep);
+        const double iAngleStep = 1.0 / angleStep_;
+        const int angleRange = cvCeil((maxAngle_ - minAngle_) * iAngleStep);
 
-        hist.setTo(Scalar::all(0));
-        Guil_Full_calcOHist_gpu(templFeatures.sizes.ptr<int>(), imageFeatures.sizes.ptr<int>(0), hist.ptr<int>(),
-                                (float)minAngle, (float)maxAngle, (float)angleStep, angleRange, levels, templFeatures.maxSize);
-        cudaSafeCall( cudaMemcpy(&h_buf[0], hist.data, h_buf.size() * sizeof(int), cudaMemcpyDeviceToHost) );
+        hist_.setTo(Scalar::all(0));
+        Guil_Full_calcOHist_gpu(templFeatures_.sizes.ptr<int>(), imageFeatures_.sizes.ptr<int>(0), hist_.ptr<int>(),
+                                (float)minAngle_, (float)maxAngle_, (float)angleStep_, angleRange, levels_, templFeatures_.maxSize);
+        cudaSafeCall( cudaMemcpy(&h_buf_[0], hist_.data, h_buf_.size() * sizeof(int), cudaMemcpyDeviceToHost) );
 
-        angles.clear();
+        angles_.clear();
 
         for (int n = 0; n < angleRange; ++n)
         {
-            if (h_buf[n] >= angleThresh)
+            if (h_buf_[n] >= angleThresh_)
             {
-                const double angle = minAngle + n * angleStep;
-                angles.push_back(std::make_pair(angle, h_buf[n]));
+                const double angle = minAngle_ + n * angleStep_;
+                angles_.push_back(std::make_pair(angle, h_buf_[n]));
             }
         }
     }
 
-    void GHT_Guil_Full::calcScale(double angle)
+    void GeneralizedHoughGuilImpl::calcScale(double angle)
     {
         using namespace cv::gpu::cudev::ght;
 
-        const double iScaleStep = 1.0 / scaleStep;
-        const int scaleRange = cvCeil((maxScale - minScale) * iScaleStep);
+        const double iScaleStep = 1.0 / scaleStep_;
+        const int scaleRange = cvCeil((maxScale_ - minScale_) * iScaleStep);
 
-        hist.setTo(Scalar::all(0));
-        Guil_Full_calcSHist_gpu(templFeatures.sizes.ptr<int>(), imageFeatures.sizes.ptr<int>(0), hist.ptr<int>(),
-                                (float)angle, (float)angleEpsilon, (float)minScale, (float)maxScale,
-                                (float)iScaleStep, scaleRange, levels, templFeatures.maxSize);
-        cudaSafeCall( cudaMemcpy(&h_buf[0], hist.data, h_buf.size() * sizeof(int), cudaMemcpyDeviceToHost) );
+        hist_.setTo(Scalar::all(0));
+        Guil_Full_calcSHist_gpu(templFeatures_.sizes.ptr<int>(), imageFeatures_.sizes.ptr<int>(0), hist_.ptr<int>(),
+                                (float)angle, (float)angleEpsilon_, (float)minScale_, (float)maxScale_,
+                                (float)iScaleStep, scaleRange, levels_, templFeatures_.maxSize);
+        cudaSafeCall( cudaMemcpy(&h_buf_[0], hist_.data, h_buf_.size() * sizeof(int), cudaMemcpyDeviceToHost) );
 
-        scales.clear();
+        scales_.clear();
 
         for (int s = 0; s < scaleRange; ++s)
         {
-            if (h_buf[s] >= scaleThresh)
+            if (h_buf_[s] >= scaleThresh_)
             {
-                const double scale = minScale + s * scaleStep;
-                scales.push_back(std::make_pair(scale, h_buf[s]));
+                const double scale = minScale_ + s * scaleStep_;
+                scales_.push_back(std::make_pair(scale, h_buf_[s]));
             }
         }
     }
 
-    void GHT_Guil_Full::calcPosition(double angle, int angleVotes, double scale, int scaleVotes)
+    void GeneralizedHoughGuilImpl::calcPosition(double angle, int angleVotes, double scale, int scaleVotes)
     {
         using namespace cv::gpu::cudev::ght;
 
-        hist.setTo(Scalar::all(0));
-        Guil_Full_calcPHist_gpu(templFeatures.sizes.ptr<int>(), imageFeatures.sizes.ptr<int>(0), hist,
-                                (float)angle, (float)angleEpsilon, (float)scale, (float)dp, levels, templFeatures.maxSize);
+        hist_.setTo(Scalar::all(0));
+        Guil_Full_calcPHist_gpu(templFeatures_.sizes.ptr<int>(), imageFeatures_.sizes.ptr<int>(0), hist_,
+                                (float)angle, (float)angleEpsilon_, (float)scale, (float)dp_, levels_, templFeatures_.maxSize);
 
-        posCount = Guil_Full_findPosInHist_gpu(hist, outBuf.ptr<float4>(0), outBuf.ptr<int3>(1),
-                                               posCount, maxSize, (float)angle, angleVotes,
-                                               (float)scale, scaleVotes, (float)dp, posThresh);
+        posCount_ = Guil_Full_findPosInHist_gpu(hist_, outBuf_.ptr<float4>(0), outBuf_.ptr<int3>(1),
+                                                posCount_, maxBufferSize_, (float)angle, angleVotes,
+                                                (float)scale, scaleVotes, (float)dp_, posThresh_);
     }
 }
 
-Ptr<gpu::GeneralizedHough> cv::gpu::GeneralizedHough::create(int method)
+Ptr<GeneralizedHoughGuil> cv::gpu::createGeneralizedHoughGuil()
 {
-    switch (method)
-    {
-    case cv::GeneralizedHough::GHT_POSITION:
-        CV_Assert( !GHT_Ballard_Pos_info_auto.name().empty() );
-        return new GHT_Ballard_Pos();
-
-    case (cv::GeneralizedHough::GHT_POSITION | cv::GeneralizedHough::GHT_SCALE):
-        CV_Assert( !GHT_Ballard_PosScale_info_auto.name().empty() );
-        return new GHT_Ballard_PosScale();
-
-    case (cv::GeneralizedHough::GHT_POSITION | cv::GeneralizedHough::GHT_ROTATION):
-        CV_Assert( !GHT_Ballard_PosRotation_info_auto.name().empty() );
-        return new GHT_Ballard_PosRotation();
-
-    case (cv::GeneralizedHough::GHT_POSITION | cv::GeneralizedHough::GHT_SCALE | cv::GeneralizedHough::GHT_ROTATION):
-        CV_Assert( !GHT_Guil_Full_info_auto.name().empty() );
-        return new GHT_Guil_Full();
-
-    default:
-        CV_Error(Error::StsBadArg, "Unsupported method");
-        return Ptr<GeneralizedHough>();
-    }
+    return new GeneralizedHoughGuilImpl;
 }
 
 #endif /* !defined (HAVE_CUDA) */
diff --git a/modules/gpuimgproc/test/test_hough.cpp b/modules/gpuimgproc/test/test_hough.cpp
index e4319bd219..969899d8b6 100644
--- a/modules/gpuimgproc/test/test_hough.cpp
+++ b/modules/gpuimgproc/test/test_hough.cpp
@@ -193,7 +193,7 @@ PARAM_TEST_CASE(GeneralizedHough, cv::gpu::DeviceInfo, UseRoi)
 {
 };
 
-GPU_TEST_P(GeneralizedHough, POSITION)
+GPU_TEST_P(GeneralizedHough, Ballard)
 {
     const cv::gpu::DeviceInfo devInfo = GET_PARAM(0);
     cv::gpu::setDevice(devInfo.deviceID());
@@ -218,16 +218,16 @@ GPU_TEST_P(GeneralizedHough, POSITION)
         templ.copyTo(imageROI);
     }
 
-    cv::Ptr<cv::gpu::GeneralizedHough> hough = cv::gpu::GeneralizedHough::create(cv::GeneralizedHough::GHT_POSITION);
-    hough->set("votesThreshold", 200);
+    cv::Ptr<cv::GeneralizedHoughBallard> alg = cv::gpu::createGeneralizedHoughBallard();
+    alg->setVotesThreshold(200);
 
-    hough->setTemplate(loadMat(templ, useRoi));
+    alg->setTemplate(loadMat(templ, useRoi));
 
     cv::gpu::GpuMat d_pos;
-    hough->detect(loadMat(image, useRoi), d_pos);
+    alg->detect(loadMat(image, useRoi), d_pos);
 
     std::vector<cv::Vec4f> pos;
-    hough->downloadResults(d_pos, pos);
+    d_pos.download(pos);
 
     ASSERT_EQ(gold_count, pos.size());
 
diff --git a/modules/gpuwarping/src/pyramids.cpp b/modules/gpuwarping/src/pyramids.cpp
index 577ed85677..0e8445df2c 100644
--- a/modules/gpuwarping/src/pyramids.cpp
+++ b/modules/gpuwarping/src/pyramids.cpp
@@ -181,7 +181,7 @@ namespace
 
             const GpuMat& prevLayer = i == 0 ? layer0_ : pyramid_[i - 1];
 
-            cudev::pyramid::downsampleX2(prevLayer, pyramid_[i], img.depth(), img.channels(), StreamAccessor::getStream(stream));
+            cv::gpu::cudev::pyramid::downsampleX2(prevLayer, pyramid_[i], img.depth(), img.channels(), StreamAccessor::getStream(stream));
 
             szLastLayer = szCurLayer;
         }
@@ -222,7 +222,7 @@ namespace
             lastLayer = curLayer;
         }
 
-        cudev::pyramid::interpolateFrom1(lastLayer, outImg, outImg.depth(), outImg.channels(), StreamAccessor::getStream(stream));
+        cv::gpu::cudev::pyramid::interpolateFrom1(lastLayer, outImg, outImg.depth(), outImg.channels(), StreamAccessor::getStream(stream));
     }
 }
 
diff --git a/modules/highgui/doc/user_interface.rst b/modules/highgui/doc/user_interface.rst
index 8957898490..0ffd69ac5e 100644
--- a/modules/highgui/doc/user_interface.rst
+++ b/modules/highgui/doc/user_interface.rst
@@ -81,6 +81,9 @@ The function ``imshow`` displays an image in the specified window. If the window
 
 If window was created with OpenGL support, ``imshow`` also support :ocv:class:`ogl::Buffer` ,  :ocv:class:`ogl::Texture2D` and  :ocv:class:`gpu::GpuMat` as input.
 
+.. note:: This function should be followed by a call to the ``waitKey`` function, which displays the image for the specified number of milliseconds. Otherwise, the image will not be displayed.
+
+
 namedWindow
 ---------------
 Creates a window.
diff --git a/modules/imgproc/doc/filtering.rst b/modules/imgproc/doc/filtering.rst
index d55ee1f740..5803145c5c 100755
--- a/modules/imgproc/doc/filtering.rst
+++ b/modules/imgproc/doc/filtering.rst
@@ -759,7 +759,7 @@ Dilates an image by using a specific structuring element.
 
     :param dst: output image of the same size and type as ``src``.
 
-    :param element: structuring element used for dilation; if  ``element=Mat()`` , a  ``3 x 3`` rectangular structuring element is used.
+    :param kernel: structuring element used for dilation; if  ``kernel=Mat()`` , a  ``3 x 3`` rectangular structuring element is used. The kernel can be created using :ocv:func:`getStructuringElement`.
 
     :param anchor: position of the anchor within the element; default value ``(-1, -1)`` means that the anchor is at the element center.
 
@@ -782,11 +782,16 @@ The function supports the in-place mode. Dilation can be applied several ( ``ite
     :ocv:func:`erode`,
     :ocv:func:`morphologyEx`,
     :ocv:func:`createMorphologyFilter`
+    :ocv:func:`getStructuringElement`
+
 
 .. note::
 
    * An example using the morphological dilate operation can be found at opencv_source_code/samples/cpp/morphology2.cpp
 
+    
+
+
 erode
 -----
 Erodes an image by using a specific structuring element.
@@ -801,7 +806,7 @@ Erodes an image by using a specific structuring element.
 
     :param dst: output image of the same size and type as ``src``.
 
-    :param element: structuring element used for erosion; if  ``element=Mat()`` , a  ``3 x 3``  rectangular structuring element is used.
+    :param kernel: structuring element used for erosion; if  ``kernel=Mat()`` , a  ``3 x 3``  rectangular structuring element is used. The kernel can be created using :ocv:func:`getStructuringElement`.
 
     :param anchor: position of the anchor within the element; default value  ``(-1, -1)``  means that the anchor is at the element center.
 
@@ -823,7 +828,8 @@ The function supports the in-place mode. Erosion can be applied several ( ``iter
 
     :ocv:func:`dilate`,
     :ocv:func:`morphologyEx`,
-    :ocv:func:`createMorphologyFilter`
+    :ocv:func:`createMorphologyFilter`,
+    :ocv:func:`getStructuringElement`
 
 .. note::
 
@@ -956,7 +962,7 @@ Returns Gaussian filter coefficients.
     :param ksize: Aperture size. It should be odd ( :math:`\texttt{ksize} \mod 2 = 1` ) and positive.
 
     :param sigma: Gaussian standard deviation. If it is non-positive, it is computed from  ``ksize``  as  \ ``sigma = 0.3*((ksize-1)*0.5 - 1) + 0.8`` .
-    :param ktype: Type of filter coefficients. It can be  ``CV_32f``  or  ``CV_64F`` .
+    :param ktype: Type of filter coefficients. It can be  ``CV_32F``  or  ``CV_64F`` .
 
 The function computes and returns the
 :math:`\texttt{ksize} \times 1` matrix of Gaussian filter coefficients:
@@ -985,6 +991,32 @@ Two of such generated kernels can be passed to
 
 
 
+getGaborKernel
+-----------------
+Returns Gabor filter coefficients.
+
+.. ocv:function:: Mat getGaborKernel( Size ksize, double sigma, double theta, double lambd, double gamma, double psi = CV_PI*0.5, int ktype = CV_64F )
+
+.. ocv:pyfunction:: cv2.getGaborKernel(ksize, sigma, theta, lambd, gamma[, psi[, ktype]]) -> retval
+
+    :param ksize: Size of the filter returned.
+    
+    :param sigma: Standard deviation of the Gaussian envelope.
+    
+    :param theta: Orientation of the normal to the parallel stripes of a Gabor function.
+    
+    :param lambd: Wavelength of the sinusoidal factor.
+    
+    :param gamma: Spatial aspect ratio.
+    
+    :param psi: Phase offset.
+    
+    :param ktype: Type of filter coefficients. It can be  ``CV_32F``  or  ``CV_64F`` .
+
+For more details about Gabor filter equations and parameters, see: `Gabor Filter <http://en.wikipedia.org/wiki/Gabor_filter>`_.
+
+
+
 getKernelType
 -------------
 Returns the kernel type.
@@ -1099,7 +1131,9 @@ Performs advanced morphological transformations.
 
     :param dst: Destination image of the same size and type as  ``src`` .
 
-    :param element: Structuring element.
+    :param kernel: Structuring element. It can be created using :ocv:func:`getStructuringElement`.
+    
+    :param anchor: Anchor position within the kernel. Negative values mean that the anchor is at the kernel center.
 
     :param op: Type of a morphological operation that can be one of the following:
 
@@ -1157,7 +1191,8 @@ Any of the operations can be done in-place. In case of multi-channel images, eac
 
     :ocv:func:`dilate`,
     :ocv:func:`erode`,
-    :ocv:func:`createMorphologyFilter`
+    :ocv:func:`createMorphologyFilter`,
+    :ocv:func:`getStructuringElement`
 
 .. note::
 
diff --git a/modules/imgproc/doc/miscellaneous_transformations.rst b/modules/imgproc/doc/miscellaneous_transformations.rst
index 16fcf5372b..47de0b442d 100644
--- a/modules/imgproc/doc/miscellaneous_transformations.rst
+++ b/modules/imgproc/doc/miscellaneous_transformations.rst
@@ -799,7 +799,6 @@ See the sample ``grabcut.cpp`` to learn how to use the function.
 
 .. [Meyer92] Meyer, F. *Color Image Segmentation*, ICIP92, 1992
 
-.. [Telea04] Alexandru Telea, *An Image Inpainting Technique Based on the Fast Marching Method*. Journal of Graphics, GPU, and Game Tools 9 1, pp 23-34 (2004)
 
 .. note::
 
diff --git a/modules/imgproc/include/opencv2/imgproc.hpp b/modules/imgproc/include/opencv2/imgproc.hpp
index bf8f795ac4..c2a488aac2 100644
--- a/modules/imgproc/include/opencv2/imgproc.hpp
+++ b/modules/imgproc/include/opencv2/imgproc.hpp
@@ -694,39 +694,104 @@ public:
 
 
 //! finds arbitrary template in the grayscale image using Generalized Hough Transform
-//! Ballard, D.H. (1981). Generalizing the Hough transform to detect arbitrary shapes. Pattern Recognition 13 (2): 111-122.
-//! Guil, N., González-Linares, J.M. and Zapata, E.L. (1999). Bidimensional shape detection using an invariant approach. Pattern Recognition 32 (6): 1025-1038.
 class CV_EXPORTS GeneralizedHough : public Algorithm
 {
 public:
-    enum { GHT_POSITION = 0,
-           GHT_SCALE    = 1,
-           GHT_ROTATION = 2
-         };
-
-    static Ptr<GeneralizedHough> create(int method);
-
-    virtual ~GeneralizedHough();
-
     //! set template to search
-    void setTemplate(InputArray templ, int cannyThreshold = 100, Point templCenter = Point(-1, -1));
-    void setTemplate(InputArray edges, InputArray dx, InputArray dy, Point templCenter = Point(-1, -1));
+    virtual void setTemplate(InputArray templ, Point templCenter = Point(-1, -1)) = 0;
+    virtual void setTemplate(InputArray edges, InputArray dx, InputArray dy, Point templCenter = Point(-1, -1)) = 0;
 
     //! find template on image
-    void detect(InputArray image, OutputArray positions, OutputArray votes = cv::noArray(), int cannyThreshold = 100);
-    void detect(InputArray edges, InputArray dx, InputArray dy, OutputArray positions, OutputArray votes = cv::noArray());
+    virtual void detect(InputArray image, OutputArray positions, OutputArray votes = noArray()) = 0;
+    virtual void detect(InputArray edges, InputArray dx, InputArray dy, OutputArray positions, OutputArray votes = noArray()) = 0;
 
-    void release();
+    //! Canny low threshold.
+    virtual void setCannyLowThresh(int cannyLowThresh) = 0;
+    virtual int getCannyLowThresh() const = 0;
 
-protected:
-    virtual void setTemplateImpl(const Mat& edges, const Mat& dx, const Mat& dy, Point templCenter) = 0;
-    virtual void detectImpl(const Mat& edges, const Mat& dx, const Mat& dy, OutputArray positions, OutputArray votes) = 0;
-    virtual void releaseImpl() = 0;
+    //! Canny high threshold.
+    virtual void setCannyHighThresh(int cannyHighThresh) = 0;
+    virtual int getCannyHighThresh() const = 0;
 
-private:
-    Mat edges_;
-    Mat dx_;
-    Mat dy_;
+    //! Minimum distance between the centers of the detected objects.
+    virtual void setMinDist(double minDist) = 0;
+    virtual double getMinDist() const = 0;
+
+    //! Inverse ratio of the accumulator resolution to the image resolution.
+    virtual void setDp(double dp) = 0;
+    virtual double getDp() const = 0;
+
+    //! Maximal size of inner buffers.
+    virtual void setMaxBufferSize(int maxBufferSize) = 0;
+    virtual int getMaxBufferSize() const = 0;
+};
+
+//! Ballard, D.H. (1981). Generalizing the Hough transform to detect arbitrary shapes. Pattern Recognition 13 (2): 111-122.
+//! Detects position only without translation and rotation
+class CV_EXPORTS GeneralizedHoughBallard : public GeneralizedHough
+{
+public:
+    //! R-Table levels.
+    virtual void setLevels(int levels) = 0;
+    virtual int getLevels() const = 0;
+
+    //! The accumulator threshold for the template centers at the detection stage. The smaller it is, the more false positions may be detected.
+    virtual void setVotesThreshold(int votesThreshold) = 0;
+    virtual int getVotesThreshold() const = 0;
+};
+
+//! Guil, N., González-Linares, J.M. and Zapata, E.L. (1999). Bidimensional shape detection using an invariant approach. Pattern Recognition 32 (6): 1025-1038.
+//! Detects position, translation and rotation
+class CV_EXPORTS GeneralizedHoughGuil : public GeneralizedHough
+{
+public:
+    //! Angle difference in degrees between two points in feature.
+    virtual void setXi(double xi) = 0;
+    virtual double getXi() const = 0;
+
+    //! Feature table levels.
+    virtual void setLevels(int levels) = 0;
+    virtual int getLevels() const = 0;
+
+    //! Maximal difference between angles that treated as equal.
+    virtual void setAngleEpsilon(double angleEpsilon) = 0;
+    virtual double getAngleEpsilon() const = 0;
+
+    //! Minimal rotation angle to detect in degrees.
+    virtual void setMinAngle(double minAngle) = 0;
+    virtual double getMinAngle() const = 0;
+
+    //! Maximal rotation angle to detect in degrees.
+    virtual void setMaxAngle(double maxAngle) = 0;
+    virtual double getMaxAngle() const = 0;
+
+    //! Angle step in degrees.
+    virtual void setAngleStep(double angleStep) = 0;
+    virtual double getAngleStep() const = 0;
+
+    //! Angle votes threshold.
+    virtual void setAngleThresh(int angleThresh) = 0;
+    virtual int getAngleThresh() const = 0;
+
+    //! Minimal scale to detect.
+    virtual void setMinScale(double minScale) = 0;
+    virtual double getMinScale() const = 0;
+
+    //! Maximal scale to detect.
+    virtual void setMaxScale(double maxScale) = 0;
+    virtual double getMaxScale() const = 0;
+
+    //! Scale step.
+    virtual void setScaleStep(double scaleStep) = 0;
+    virtual double getScaleStep() const = 0;
+
+    //! Scale votes threshold.
+    virtual void setScaleThresh(int scaleThresh) = 0;
+    virtual int getScaleThresh() const = 0;
+
+    //! Position votes threshold.
+    virtual void setPosThresh(int posThresh) = 0;
+    virtual int getPosThresh() const = 0;
 };
 
 
@@ -1416,6 +1481,14 @@ CV_EXPORTS_W double pointPolygonTest( InputArray contour, Point2f pt, bool measu
 
 CV_EXPORTS Ptr<CLAHE> createCLAHE(double clipLimit = 40.0, Size tileGridSize = Size(8, 8));
 
+//! Ballard, D.H. (1981). Generalizing the Hough transform to detect arbitrary shapes. Pattern Recognition 13 (2): 111-122.
+//! Detects position only without translation and rotation
+CV_EXPORTS Ptr<GeneralizedHoughBallard> createGeneralizedHoughBallard();
+
+//! Guil, N., González-Linares, J.M. and Zapata, E.L. (1999). Bidimensional shape detection using an invariant approach. Pattern Recognition 32 (6): 1025-1038.
+//! Detects position, translation and rotation
+CV_EXPORTS Ptr<GeneralizedHoughGuil> createGeneralizedHoughGuil();
+
 } // cv
 
 #endif
diff --git a/modules/imgproc/src/generalized_hough.cpp b/modules/imgproc/src/generalized_hough.cpp
index 8eadff200b..7ee3b700da 100644
--- a/modules/imgproc/src/generalized_hough.cpp
+++ b/modules/imgproc/src/generalized_hough.cpp
@@ -45,17 +45,10 @@
 
 using namespace cv;
 
+// common
+
 namespace
 {
-    /////////////////////////////////////
-    // Common
-
-    template <typename T, class A> void releaseVector(std::vector<T, A>& v)
-    {
-        std::vector<T, A> empty;
-        empty.swap(v);
-    }
-
     double toRad(double a)
     {
         return a * CV_PI / 180.0;
@@ -66,70 +59,112 @@ namespace
         return fabs(v) > std::numeric_limits<float>::epsilon();
     }
 
-    class GHT_Pos : public GeneralizedHough
+    class GeneralizedHoughBase
     {
-    public:
-        GHT_Pos();
-
     protected:
-        void setTemplateImpl(const Mat& edges, const Mat& dx, const Mat& dy, Point templCenter);
-        void detectImpl(const Mat& edges, const Mat& dx, const Mat& dy, OutputArray positions, OutputArray votes);
-        void releaseImpl();
+        GeneralizedHoughBase();
+        virtual ~GeneralizedHoughBase() {}
+
+        void setTemplateImpl(InputArray templ, Point templCenter);
+        void setTemplateImpl(InputArray edges, InputArray dx, InputArray dy, Point templCenter);
+
+        void detectImpl(InputArray image, OutputArray positions, OutputArray votes);
+        void detectImpl(InputArray edges, InputArray dx, InputArray dy, OutputArray positions, OutputArray votes);
 
         virtual void processTempl() = 0;
         virtual void processImage() = 0;
 
+        int cannyLowThresh_;
+        int cannyHighThresh_;
+        double minDist_;
+        double dp_;
+
+        Size templSize_;
+        Point templCenter_;
+        Mat templEdges_;
+        Mat templDx_;
+        Mat templDy_;
+
+        Size imageSize_;
+        Mat imageEdges_;
+        Mat imageDx_;
+        Mat imageDy_;
+
+        std::vector<Vec4f> posOutBuf_;
+        std::vector<Vec3i> voteOutBuf_;
+
+    private:
+        void calcEdges(InputArray src, Mat& edges, Mat& dx, Mat& dy);
         void filterMinDist();
         void convertTo(OutputArray positions, OutputArray votes);
-
-        double minDist;
-
-        Size templSize;
-        Point templCenter;
-        Mat templEdges;
-        Mat templDx;
-        Mat templDy;
-
-        Size imageSize;
-        Mat imageEdges;
-        Mat imageDx;
-        Mat imageDy;
-
-        std::vector<Vec4f> posOutBuf;
-        std::vector<Vec3i> voteOutBuf;
     };
 
-    GHT_Pos::GHT_Pos()
+    GeneralizedHoughBase::GeneralizedHoughBase()
     {
-        minDist = 1.0;
+        cannyLowThresh_ = 50;
+        cannyHighThresh_ = 100;
+        minDist_ = 1.0;
+        dp_ = 1.0;
     }
 
-    void GHT_Pos::setTemplateImpl(const Mat& edges, const Mat& dx, const Mat& dy, Point templCenter_)
+    void GeneralizedHoughBase::calcEdges(InputArray _src, Mat& edges, Mat& dx, Mat& dy)
     {
-        templSize = edges.size();
-        templCenter = templCenter_;
-        edges.copyTo(templEdges);
-        dx.copyTo(templDx);
-        dy.copyTo(templDy);
+        Mat src = _src.getMat();
+
+        CV_Assert( src.type() == CV_8UC1 );
+        CV_Assert( cannyLowThresh_ > 0 && cannyLowThresh_ < cannyHighThresh_ );
+
+        Canny(src, edges, cannyLowThresh_, cannyHighThresh_);
+        Sobel(src, dx, CV_32F, 1, 0);
+        Sobel(src, dy, CV_32F, 0, 1);
+    }
+
+    void GeneralizedHoughBase::setTemplateImpl(InputArray templ, Point templCenter)
+    {
+        calcEdges(templ, templEdges_, templDx_, templDy_);
+
+        if (templCenter == Point(-1, -1))
+            templCenter = Point(templEdges_.cols / 2, templEdges_.rows / 2);
+
+        templSize_ = templEdges_.size();
+        templCenter_ = templCenter;
 
         processTempl();
     }
 
-    void GHT_Pos::detectImpl(const Mat& edges, const Mat& dx, const Mat& dy, OutputArray positions, OutputArray votes)
+    void GeneralizedHoughBase::setTemplateImpl(InputArray edges, InputArray dx, InputArray dy, Point templCenter)
     {
-        imageSize = edges.size();
-        edges.copyTo(imageEdges);
-        dx.copyTo(imageDx);
-        dy.copyTo(imageDy);
+        edges.getMat().copyTo(templEdges_);
+        dx.getMat().copyTo(templDx_);
+        dy.getMat().copyTo(templDy_);
 
-        posOutBuf.clear();
-        voteOutBuf.clear();
+        CV_Assert( templEdges_.type() == CV_8UC1 );
+        CV_Assert( templDx_.type() == CV_32FC1 && templDx_.size() == templEdges_.size() );
+        CV_Assert( templDy_.type() == templDx_.type() && templDy_.size() == templEdges_.size() );
+
+        if (templCenter == Point(-1, -1))
+            templCenter = Point(templEdges_.cols / 2, templEdges_.rows / 2);
+
+        templSize_ = templEdges_.size();
+        templCenter_ = templCenter;
+
+        processTempl();
+    }
+
+    void GeneralizedHoughBase::detectImpl(InputArray image, OutputArray positions, OutputArray votes)
+    {
+        calcEdges(image, imageEdges_, imageDx_, imageDy_);
+
+        imageSize_ = imageEdges_.size();
+
+        posOutBuf_.clear();
+        voteOutBuf_.clear();
 
         processImage();
 
-        if (!posOutBuf.empty())
+        if (!posOutBuf_.empty())
         {
-            if (minDist > 1)
+            if (minDist_ > 1)
                 filterMinDist();
             convertTo(positions, votes);
         }
@@ -141,21 +176,35 @@ namespace
         }
     }
 
-    void GHT_Pos::releaseImpl()
+    void GeneralizedHoughBase::detectImpl(InputArray edges, InputArray dx, InputArray dy, OutputArray positions, OutputArray votes)
     {
-        templSize = Size();
-        templCenter = Point(-1, -1);
-        templEdges.release();
-        templDx.release();
-        templDy.release();
+        edges.getMat().copyTo(imageEdges_);
+        dx.getMat().copyTo(imageDx_);
+        dy.getMat().copyTo(imageDy_);
 
-        imageSize = Size();
-        imageEdges.release();
-        imageDx.release();
-        imageDy.release();
+        CV_Assert( imageEdges_.type() == CV_8UC1 );
+        CV_Assert( imageDx_.type() == CV_32FC1 && imageDx_.size() == imageEdges_.size() );
+        CV_Assert( imageDy_.type() == imageDx_.type() && imageDy_.size() == imageEdges_.size() );
 
-        releaseVector(posOutBuf);
-        releaseVector(voteOutBuf);
+        imageSize_ = imageEdges_.size();
+
+        posOutBuf_.clear();
+        voteOutBuf_.clear();
+
+        processImage();
+
+        if (!posOutBuf_.empty())
+        {
+            if (minDist_ > 1)
+                filterMinDist();
+            convertTo(positions, votes);
+        }
+        else
+        {
+            positions.release();
+            if (votes.needed())
+                votes.release();
+        }
     }
 
     class Vec3iGreaterThanIdx
@@ -166,31 +215,31 @@ namespace
         const Vec3i* arr;
     };
 
-    void GHT_Pos::filterMinDist()
+    void GeneralizedHoughBase::filterMinDist()
     {
-        size_t oldSize = posOutBuf.size();
-        const bool hasVotes = !voteOutBuf.empty();
+        size_t oldSize = posOutBuf_.size();
+        const bool hasVotes = !voteOutBuf_.empty();
 
-        CV_Assert(!hasVotes || voteOutBuf.size() == oldSize);
+        CV_Assert( !hasVotes || voteOutBuf_.size() == oldSize );
 
-        std::vector<Vec4f> oldPosBuf(posOutBuf);
-        std::vector<Vec3i> oldVoteBuf(voteOutBuf);
+        std::vector<Vec4f> oldPosBuf(posOutBuf_);
+        std::vector<Vec3i> oldVoteBuf(voteOutBuf_);
 
         std::vector<size_t> indexies(oldSize);
         for (size_t i = 0; i < oldSize; ++i)
             indexies[i] = i;
         std::sort(indexies.begin(), indexies.end(), Vec3iGreaterThanIdx(&oldVoteBuf[0]));
 
-        posOutBuf.clear();
-        voteOutBuf.clear();
+        posOutBuf_.clear();
+        voteOutBuf_.clear();
 
-        const int cellSize = cvRound(minDist);
-        const int gridWidth = (imageSize.width + cellSize - 1) / cellSize;
-        const int gridHeight = (imageSize.height + cellSize - 1) / cellSize;
+        const int cellSize = cvRound(minDist_);
+        const int gridWidth = (imageSize_.width + cellSize - 1) / cellSize;
+        const int gridHeight = (imageSize_.height + cellSize - 1) / cellSize;
 
         std::vector< std::vector<Point2f> > grid(gridWidth * gridHeight);
 
-        const double minDist2 = minDist * minDist;
+        const double minDist2 = minDist_ * minDist_;
 
         for (size_t i = 0; i < oldSize; ++i)
         {
@@ -239,108 +288,112 @@ namespace
             {
                 grid[yCell * gridWidth + xCell].push_back(p);
 
-                posOutBuf.push_back(oldPosBuf[ind]);
+                posOutBuf_.push_back(oldPosBuf[ind]);
                 if (hasVotes)
-                    voteOutBuf.push_back(oldVoteBuf[ind]);
+                    voteOutBuf_.push_back(oldVoteBuf[ind]);
             }
         }
     }
 
-    void GHT_Pos::convertTo(OutputArray _positions, OutputArray _votes)
+    void GeneralizedHoughBase::convertTo(OutputArray _positions, OutputArray _votes)
     {
-        const int total = static_cast<int>(posOutBuf.size());
-        const bool hasVotes = !voteOutBuf.empty();
+        const int total = static_cast<int>(posOutBuf_.size());
+        const bool hasVotes = !voteOutBuf_.empty();
 
-        CV_Assert(!hasVotes || voteOutBuf.size() == posOutBuf.size());
+        CV_Assert( !hasVotes || voteOutBuf_.size() == posOutBuf_.size() );
 
         _positions.create(1, total, CV_32FC4);
         Mat positions = _positions.getMat();
-        Mat(1, total, CV_32FC4, &posOutBuf[0]).copyTo(positions);
+        Mat(1, total, CV_32FC4, &posOutBuf_[0]).copyTo(positions);
 
         if (_votes.needed())
         {
             if (!hasVotes)
+            {
                 _votes.release();
+            }
             else
             {
                 _votes.create(1, total, CV_32SC3);
                 Mat votes = _votes.getMat();
-                Mat(1, total, CV_32SC3, &voteOutBuf[0]).copyTo(votes);
+                Mat(1, total, CV_32SC3, &voteOutBuf_[0]).copyTo(votes);
             }
         }
     }
+}
 
-    /////////////////////////////////////
-    // POSITION Ballard
+// GeneralizedHoughBallard
 
-    class GHT_Ballard_Pos : public GHT_Pos
+namespace
+{
+    class GeneralizedHoughBallardImpl : public GeneralizedHoughBallard, private GeneralizedHoughBase
     {
     public:
-        AlgorithmInfo* info() const;
+        GeneralizedHoughBallardImpl();
 
-        GHT_Ballard_Pos();
+        void setTemplate(InputArray templ, Point templCenter) { setTemplateImpl(templ, templCenter); }
+        void setTemplate(InputArray edges, InputArray dx, InputArray dy, Point templCenter) { setTemplateImpl(edges, dx, dy, templCenter); }
 
-    protected:
-        void releaseImpl();
+        void detect(InputArray image, OutputArray positions, OutputArray votes) { detectImpl(image, positions, votes); }
+        void detect(InputArray edges, InputArray dx, InputArray dy, OutputArray positions, OutputArray votes) { detectImpl(edges, dx, dy, positions, votes); }
 
+        void setCannyLowThresh(int cannyLowThresh) { cannyLowThresh_ = cannyLowThresh; }
+        int getCannyLowThresh() const { return cannyLowThresh_; }
+
+        void setCannyHighThresh(int cannyHighThresh) { cannyHighThresh_ = cannyHighThresh; }
+        int getCannyHighThresh() const { return cannyHighThresh_; }
+
+        void setMinDist(double minDist) { minDist_ = minDist; }
+        double getMinDist() const { return minDist_; }
+
+        void setDp(double dp) { dp_ = dp; }
+        double getDp() const { return dp_; }
+
+        void setMaxBufferSize(int) {  }
+        int getMaxBufferSize() const { return 0; }
+
+        void setLevels(int levels) { levels_ = levels; }
+        int getLevels() const { return levels_; }
+
+        void setVotesThreshold(int votesThreshold) { votesThreshold_ = votesThreshold; }
+        int getVotesThreshold() const { return votesThreshold_; }
+
+    private:
         void processTempl();
         void processImage();
 
-        virtual void calcHist();
-        virtual void findPosInHist();
+        void calcHist();
+        void findPosInHist();
 
-        int levels;
-        int votesThreshold;
-        double dp;
+        int levels_;
+        int votesThreshold_;
 
-        std::vector< std::vector<Point> > r_table;
-        Mat hist;
+        std::vector< std::vector<Point> > r_table_;
+        Mat hist_;
     };
 
-    CV_INIT_ALGORITHM(GHT_Ballard_Pos, "GeneralizedHough.POSITION",
-                      obj.info()->addParam(obj, "minDist", obj.minDist, false, 0, 0,
-                                           "Minimum distance between the centers of the detected objects.");
-                      obj.info()->addParam(obj, "levels", obj.levels, false, 0, 0,
-                                           "R-Table levels.");
-                      obj.info()->addParam(obj, "votesThreshold", obj.votesThreshold, false, 0, 0,
-                                           "The accumulator threshold for the template centers at the detection stage. The smaller it is, the more false positions may be detected.");
-                      obj.info()->addParam(obj, "dp", obj.dp, false, 0, 0,
-                                           "Inverse ratio of the accumulator resolution to the image resolution."));
-
-    GHT_Ballard_Pos::GHT_Ballard_Pos()
+    GeneralizedHoughBallardImpl::GeneralizedHoughBallardImpl()
     {
-        levels = 360;
-        votesThreshold = 100;
-        dp = 1.0;
+        levels_ = 360;
+        votesThreshold_ = 100;
     }
 
-    void GHT_Ballard_Pos::releaseImpl()
+    void GeneralizedHoughBallardImpl::processTempl()
     {
-        GHT_Pos::releaseImpl();
+        CV_Assert( levels_ > 0 );
 
-        releaseVector(r_table);
-        hist.release();
-    }
+        const double thetaScale = levels_ / 360.0;
 
-    void GHT_Ballard_Pos::processTempl()
-    {
-        CV_Assert(templEdges.type() == CV_8UC1);
-        CV_Assert(templDx.type() == CV_32FC1 && templDx.size() == templSize);
-        CV_Assert(templDy.type() == templDx.type() && templDy.size() == templSize);
-        CV_Assert(levels > 0);
+        r_table_.resize(levels_ + 1);
+        std::for_each(r_table_.begin(), r_table_.end(), std::mem_fun_ref(&std::vector<Point>::clear));
 
-        const double thetaScale = levels / 360.0;
-
-        r_table.resize(levels + 1);
-        for_each(r_table.begin(), r_table.end(), mem_fun_ref(&std::vector<Point>::clear));
-
-        for (int y = 0; y < templSize.height; ++y)
+        for (int y = 0; y < templSize_.height; ++y)
         {
-            const uchar* edgesRow = templEdges.ptr(y);
-            const float* dxRow = templDx.ptr<float>(y);
-            const float* dyRow = templDy.ptr<float>(y);
+            const uchar* edgesRow = templEdges_.ptr(y);
+            const float* dxRow = templDx_.ptr<float>(y);
+            const float* dyRow = templDy_.ptr<float>(y);
 
-            for (int x = 0; x < templSize.width; ++x)
+            for (int x = 0; x < templSize_.width; ++x)
             {
                 const Point p(x, y);
 
@@ -348,42 +401,42 @@ namespace
                 {
                     const float theta = fastAtan2(dyRow[x], dxRow[x]);
                     const int n = cvRound(theta * thetaScale);
-                    r_table[n].push_back(p - templCenter);
+                    r_table_[n].push_back(p - templCenter_);
                 }
             }
         }
     }
 
-    void GHT_Ballard_Pos::processImage()
+    void GeneralizedHoughBallardImpl::processImage()
     {
         calcHist();
         findPosInHist();
     }
 
-    void GHT_Ballard_Pos::calcHist()
+    void GeneralizedHoughBallardImpl::calcHist()
     {
-        CV_Assert(imageEdges.type() == CV_8UC1);
-        CV_Assert(imageDx.type() == CV_32FC1 && imageDx.size() == imageSize);
-        CV_Assert(imageDy.type() == imageDx.type() && imageDy.size() == imageSize);
-        CV_Assert(levels > 0 && r_table.size() == static_cast<size_t>(levels + 1));
-        CV_Assert(dp > 0.0);
+        CV_Assert( imageEdges_.type() == CV_8UC1 );
+        CV_Assert( imageDx_.type() == CV_32FC1 && imageDx_.size() == imageSize_);
+        CV_Assert( imageDy_.type() == imageDx_.type() && imageDy_.size() == imageSize_);
+        CV_Assert( levels_ > 0 && r_table_.size() == static_cast<size_t>(levels_ + 1) );
+        CV_Assert( dp_ > 0.0 );
 
-        const double thetaScale = levels / 360.0;
-        const double idp = 1.0 / dp;
+        const double thetaScale = levels_ / 360.0;
+        const double idp = 1.0 / dp_;
 
-        hist.create(cvCeil(imageSize.height * idp) + 2, cvCeil(imageSize.width * idp) + 2, CV_32SC1);
-        hist.setTo(0);
+        hist_.create(cvCeil(imageSize_.height * idp) + 2, cvCeil(imageSize_.width * idp) + 2, CV_32SC1);
+        hist_.setTo(0);
 
-        const int rows = hist.rows - 2;
-        const int cols = hist.cols - 2;
+        const int rows = hist_.rows - 2;
+        const int cols = hist_.cols - 2;
 
-        for (int y = 0; y < imageSize.height; ++y)
+        for (int y = 0; y < imageSize_.height; ++y)
         {
-            const uchar* edgesRow = imageEdges.ptr(y);
-            const float* dxRow = imageDx.ptr<float>(y);
-            const float* dyRow = imageDy.ptr<float>(y);
+            const uchar* edgesRow = imageEdges_.ptr(y);
+            const float* dxRow = imageDx_.ptr<float>(y);
+            const float* dyRow = imageDy_.ptr<float>(y);
 
-            for (int x = 0; x < imageSize.width; ++x)
+            for (int x = 0; x < imageSize_.width; ++x)
             {
                 const Point p(x, y);
 
@@ -392,7 +445,7 @@ namespace
                     const float theta = fastAtan2(dyRow[x], dxRow[x]);
                     const int n = cvRound(theta * thetaScale);
 
-                    const std::vector<Point>& r_row = r_table[n];
+                    const std::vector<Point>& r_row = r_table_[n];
 
                     for (size_t j = 0; j < r_row.size(); ++j)
                     {
@@ -402,407 +455,132 @@ namespace
                         c.y = cvRound(c.y * idp);
 
                         if (c.x >= 0 && c.x < cols && c.y >= 0 && c.y < rows)
-                            ++hist.at<int>(c.y + 1, c.x + 1);
+                            ++hist_.at<int>(c.y + 1, c.x + 1);
                     }
                 }
             }
         }
     }
 
-    void GHT_Ballard_Pos::findPosInHist()
+    void GeneralizedHoughBallardImpl::findPosInHist()  // Scans the vote accumulator hist_ for local maxima and appends them to posOutBuf_ / voteOutBuf_.
     {
-        CV_Assert(votesThreshold > 0);
+        CV_Assert( votesThreshold_ > 0 );  // a non-positive threshold is rejected before any scanning
 
-        const int histRows = hist.rows - 2;
-        const int histCols = hist.cols - 2;
+        const int histRows = hist_.rows - 2;  // interior height: cells are read at (y + 1, x + 1), so the y - 1 / y + 1 neighbours stay inside hist_
+        const int histCols = hist_.cols - 2;  // interior width, same one-cell margin on both sides
 
         for(int y = 0; y < histRows; ++y)
         {
-            const int* prevRow = hist.ptr<int>(y);
-            const int* curRow = hist.ptr<int>(y + 1);
-            const int* nextRow = hist.ptr<int>(y + 2);
+            const int* prevRow = hist_.ptr<int>(y);  // accumulator row above the candidate cell
+            const int* curRow = hist_.ptr<int>(y + 1);  // accumulator row containing the candidate cell
+            const int* nextRow = hist_.ptr<int>(y + 2);  // accumulator row below the candidate cell
 
             for(int x = 0; x < histCols; ++x)
             {
                 const int votes = curRow[x + 1];
 
-                if (votes > votesThreshold && votes > curRow[x] && votes >= curRow[x + 2] && votes > prevRow[x + 1] && votes >= nextRow[x + 1])
+                if (votes > votesThreshold_ && votes > curRow[x] && votes >= curRow[x + 2] && votes > prevRow[x + 1] && votes >= nextRow[x + 1])  // 4-neighbour peak test: a tie with the right/lower neighbour is accepted, a tie with the left/upper one is not
                 {
-                    posOutBuf.push_back(Vec4f(static_cast<float>(x * dp), static_cast<float>(y * dp), 1.0f, 0.0f));
-                    voteOutBuf.push_back(Vec3i(votes, 0, 0));
+                    posOutBuf_.push_back(Vec4f(static_cast<float>(x * dp_), static_cast<float>(y * dp_), 1.0f, 0.0f));  // cell index scaled by dp_; the last two slots are fixed at 1.0f and 0.0f (presumably scale and angle — confirm against the consumer)
+                    voteOutBuf_.push_back(Vec3i(votes, 0, 0));  // vote count goes in the first component, the other two stay 0
                 }
             }
         }
     }
+}
 
-    /////////////////////////////////////
-    // POSITION & SCALE
+Ptr<GeneralizedHoughBallard> cv::createGeneralizedHoughBallard()  // Factory: hands out a newly allocated GeneralizedHoughBallardImpl as Ptr<GeneralizedHoughBallard>.
+{
+    return new GeneralizedHoughBallardImpl;  // the raw pointer is converted to the returned Ptr<>; NOTE(review): presumably the Ptr takes ownership — confirm
+}
 
-    class GHT_Ballard_PosScale : public GHT_Ballard_Pos
+// GeneralizedHoughGuil
+
+namespace
+{
+    class GeneralizedHoughGuilImpl : public GeneralizedHoughGuil, private GeneralizedHoughBase  // Implements the GeneralizedHoughGuil interface; GeneralizedHoughBase is reused through private inheritance.
     {
     public:
-        AlgorithmInfo* info() const;
+        GeneralizedHoughGuilImpl();  // assigns the default value of every parameter declared in this class
 
-        GHT_Ballard_PosScale();
+        void setTemplate(InputArray templ, Point templCenter) { setTemplateImpl(templ, templCenter); }  // plain forwarder to setTemplateImpl (not declared in this class; presumably from GeneralizedHoughBase)
+        void setTemplate(InputArray edges, InputArray dx, InputArray dy, Point templCenter) { setTemplateImpl(edges, dx, dy, templCenter); }  // overload taking edges, dx and dy instead of a single template array
 
-    protected:
-        void calcHist();
-        void findPosInHist();
+        void detect(InputArray image, OutputArray positions, OutputArray votes) { detectImpl(image, positions, votes); }  // plain forwarder to detectImpl
+        void detect(InputArray edges, InputArray dx, InputArray dy, OutputArray positions, OutputArray votes) { detectImpl(edges, dx, dy, positions, votes); }  // overload taking edges, dx and dy instead of a single image
 
-        double minScale;
-        double maxScale;
-        double scaleStep;
+        void setCannyLowThresh(int cannyLowThresh) { cannyLowThresh_ = cannyLowThresh; }  // stored unchecked; cannyLowThresh_ is not declared in this class (presumably a GeneralizedHoughBase member)
+        int getCannyLowThresh() const { return cannyLowThresh_; }  // returns the stored value as-is
 
-        class Worker;
-        friend class Worker;
-    };
+        void setCannyHighThresh(int cannyHighThresh) { cannyHighThresh_ = cannyHighThresh; }  // stored unchecked; member presumably lives in GeneralizedHoughBase
+        int getCannyHighThresh() const { return cannyHighThresh_; }  // returns the stored value as-is
 
-    CV_INIT_ALGORITHM(GHT_Ballard_PosScale, "GeneralizedHough.POSITION_SCALE",
-                      obj.info()->addParam(obj, "minDist", obj.minDist, false, 0, 0,
-                                           "Minimum distance between the centers of the detected objects.");
-                      obj.info()->addParam(obj, "levels", obj.levels, false, 0, 0,
-                                           "R-Table levels.");
-                      obj.info()->addParam(obj, "votesThreshold", obj.votesThreshold, false, 0, 0,
-                                           "The accumulator threshold for the template centers at the detection stage. The smaller it is, the more false positions may be detected.");
-                      obj.info()->addParam(obj, "dp", obj.dp, false, 0, 0,
-                                           "Inverse ratio of the accumulator resolution to the image resolution.");
-                      obj.info()->addParam(obj, "minScale", obj.minScale, false, 0, 0,
-                                           "Minimal scale to detect.");
-                      obj.info()->addParam(obj, "maxScale", obj.maxScale, false, 0, 0,
-                                           "Maximal scale to detect.");
-                      obj.info()->addParam(obj, "scaleStep", obj.scaleStep, false, 0, 0,
-                                           "Scale step."));
+        void setMinDist(double minDist) { minDist_ = minDist; }  // minimum distance between the centers of detected objects; stored unchecked
+        double getMinDist() const { return minDist_; }  // returns the stored value as-is
 
-    GHT_Ballard_PosScale::GHT_Ballard_PosScale()
-    {
-        minScale = 0.5;
-        maxScale = 2.0;
-        scaleStep = 0.05;
-    }
+        void setDp(double dp) { dp_ = dp; }  // inverse ratio of accumulator resolution to image resolution; checked (> 0) only later via CV_Assert in calcPosition
+        double getDp() const { return dp_; }  // returns the stored value as-is
 
-    class GHT_Ballard_PosScale::Worker : public ParallelLoopBody
-    {
-    public:
-        explicit Worker(GHT_Ballard_PosScale* base_) : base(base_) {}
+        void setMaxBufferSize(int maxBufferSize) { maxBufferSize_ = maxBufferSize; }  // cap on the number of features kept per level in buildFeatureList
+        int getMaxBufferSize() const { return maxBufferSize_; }  // returns the stored value as-is
 
-        void operator ()(const Range& range) const;
+        void setXi(double xi) { xi_ = xi; }  // angle difference in degrees between the two points of a feature
+        double getXi() const { return xi_; }  // returns the stored value as-is
+
+        void setLevels(int levels) { levels_ = levels; }  // feature table levels; the processing steps assert levels_ > 0
+        int getLevels() const { return levels_; }  // returns the stored value as-is
+
+        void setAngleEpsilon(double angleEpsilon) { angleEpsilon_ = angleEpsilon; }  // tolerance passed to angleEq when comparing angles
+        double getAngleEpsilon() const { return angleEpsilon_; }  // returns the stored value as-is
+
+        void setMinAngle(double minAngle) { minAngle_ = minAngle; }  // minimal rotation angle in degrees; calcOrientation asserts 0 <= minAngle_ < maxAngle_ <= 360
+        double getMinAngle() const { return minAngle_; }  // returns the stored value as-is
+
+        void setMaxAngle(double maxAngle) { maxAngle_ = maxAngle; }  // maximal rotation angle in degrees; same assertion as minAngle_
+        double getMaxAngle() const { return maxAngle_; }  // returns the stored value as-is
+
+        void setAngleStep(double angleStep) { angleStep_ = angleStep; }  // rotation bin width in degrees; calcOrientation asserts 0 < angleStep_ < 360
+        double getAngleStep() const { return angleStep_; }  // returns the stored value as-is
+
+        void setAngleThresh(int angleThresh) { angleThresh_ = angleThresh; }  // minimum votes for a rotation bin to be kept; asserted > 0 in calcOrientation
+        int getAngleThresh() const { return angleThresh_; }  // returns the stored value as-is
+
+        void setMinScale(double minScale) { minScale_ = minScale; }  // minimal scale to detect; calcScale asserts 0 < minScale_ < maxScale_
+        double getMinScale() const { return minScale_; }  // returns the stored value as-is
+
+        void setMaxScale(double maxScale) { maxScale_ = maxScale; }  // maximal scale to detect; also scales maxDist in buildFeatureList
+        double getMaxScale() const { return maxScale_; }  // returns the stored value as-is
+
+        void setScaleStep(double scaleStep) { scaleStep_ = scaleStep; }  // scale bin width; calcScale asserts scaleStep_ > 0
+        double getScaleStep() const { return scaleStep_; }  // returns the stored value as-is
+
+        void setScaleThresh(int scaleThresh) { scaleThresh_ = scaleThresh; }  // minimum votes for a scale bin to be kept; asserted > 0 in calcScale
+        int getScaleThresh() const { return scaleThresh_; }  // returns the stored value as-is
+
+        void setPosThresh(int posThresh) { posThresh_ = posThresh; }  // position threshold; asserted > 0 in calcPosition
+        int getPosThresh() const { return posThresh_; }  // returns the stored value as-is
 
     private:
-        GHT_Ballard_PosScale* base;
-    };
-
-    void GHT_Ballard_PosScale::Worker::operator ()(const Range& range) const
-    {
-        const double thetaScale = base->levels / 360.0;
-        const double idp = 1.0 / base->dp;
-
-        for (int s = range.start; s < range.end; ++s)
-        {
-            const double scale = base->minScale + s * base->scaleStep;
-
-            Mat curHist(base->hist.size[1], base->hist.size[2], CV_32SC1, base->hist.ptr(s + 1), base->hist.step[1]);
-
-            for (int y = 0; y < base->imageSize.height; ++y)
-            {
-                const uchar* edgesRow = base->imageEdges.ptr(y);
-                const float* dxRow = base->imageDx.ptr<float>(y);
-                const float* dyRow = base->imageDy.ptr<float>(y);
-
-                for (int x = 0; x < base->imageSize.width; ++x)
-                {
-                    const Point2d p(x, y);
-
-                    if (edgesRow[x] && (notNull(dyRow[x]) || notNull(dxRow[x])))
-                    {
-                        const float theta = fastAtan2(dyRow[x], dxRow[x]);
-                        const int n = cvRound(theta * thetaScale);
-
-                        const std::vector<Point>& r_row = base->r_table[n];
-
-                        for (size_t j = 0; j < r_row.size(); ++j)
-                        {
-                            Point2d d = r_row[j];
-                            Point2d c = p - d * scale;
-
-                            c.x *= idp;
-                            c.y *= idp;
-
-                            if (c.x >= 0 && c.x < base->hist.size[2] - 2 && c.y >= 0 && c.y < base->hist.size[1] - 2)
-                                ++curHist.at<int>(cvRound(c.y + 1), cvRound(c.x + 1));
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    void GHT_Ballard_PosScale::calcHist()
-    {
-        CV_Assert(imageEdges.type() == CV_8UC1);
-        CV_Assert(imageDx.type() == CV_32FC1 && imageDx.size() == imageSize);
-        CV_Assert(imageDy.type() == imageDx.type() && imageDy.size() == imageSize);
-        CV_Assert(levels > 0 && r_table.size() == static_cast<size_t>(levels + 1));
-        CV_Assert(dp > 0.0);
-        CV_Assert(minScale > 0.0 && minScale < maxScale);
-        CV_Assert(scaleStep > 0.0);
-
-        const double idp = 1.0 / dp;
-        const int scaleRange = cvCeil((maxScale - minScale) / scaleStep);
-
-        const int sizes[] = {scaleRange + 2, cvCeil(imageSize.height * idp) + 2, cvCeil(imageSize.width * idp) + 2};
-        hist.create(3, sizes, CV_32SC1);
-        hist.setTo(0);
-
-        parallel_for_(Range(0, scaleRange), Worker(this));
-    }
-
-    void GHT_Ballard_PosScale::findPosInHist()
-    {
-        CV_Assert(votesThreshold > 0);
-
-        const int scaleRange = hist.size[0] - 2;
-        const int histRows = hist.size[1] - 2;
-        const int histCols = hist.size[2] - 2;
-
-        for (int s = 0; s < scaleRange; ++s)
-        {
-            const float scale = static_cast<float>(minScale + s * scaleStep);
-
-            const Mat prevHist(histRows + 2, histCols + 2, CV_32SC1, hist.ptr(s), hist.step[1]);
-            const Mat curHist(histRows + 2, histCols + 2, CV_32SC1, hist.ptr(s + 1), hist.step[1]);
-            const Mat nextHist(histRows + 2, histCols + 2, CV_32SC1, hist.ptr(s + 2), hist.step[1]);
-
-            for(int y = 0; y < histRows; ++y)
-            {
-                const int* prevHistRow = prevHist.ptr<int>(y + 1);
-                const int* prevRow = curHist.ptr<int>(y);
-                const int* curRow = curHist.ptr<int>(y + 1);
-                const int* nextRow = curHist.ptr<int>(y + 2);
-                const int* nextHistRow = nextHist.ptr<int>(y + 1);
-
-                for(int x = 0; x < histCols; ++x)
-                {
-                    const int votes = curRow[x + 1];
-
-                    if (votes > votesThreshold &&
-                        votes > curRow[x] &&
-                        votes >= curRow[x + 2] &&
-                        votes > prevRow[x + 1] &&
-                        votes >= nextRow[x + 1] &&
-                        votes > prevHistRow[x + 1] &&
-                        votes >= nextHistRow[x + 1])
-                    {
-                        posOutBuf.push_back(Vec4f(static_cast<float>(x * dp), static_cast<float>(y * dp), scale, 0.0f));
-                        voteOutBuf.push_back(Vec3i(votes, votes, 0));
-                    }
-                }
-            }
-        }
-    }
-
-    /////////////////////////////////////
-    // POSITION & ROTATION
-
-    class GHT_Ballard_PosRotation : public GHT_Ballard_Pos
-    {
-    public:
-        AlgorithmInfo* info() const;
-
-        GHT_Ballard_PosRotation();
-
-    protected:
-        void calcHist();
-        void findPosInHist();
-
-        double minAngle;
-        double maxAngle;
-        double angleStep;
-
-        class Worker;
-        friend class Worker;
-    };
-
-    CV_INIT_ALGORITHM(GHT_Ballard_PosRotation, "GeneralizedHough.POSITION_ROTATION",
-                      obj.info()->addParam(obj, "minDist", obj.minDist, false, 0, 0,
-                                           "Minimum distance between the centers of the detected objects.");
-                      obj.info()->addParam(obj, "levels", obj.levels, false, 0, 0,
-                                           "R-Table levels.");
-                      obj.info()->addParam(obj, "votesThreshold", obj.votesThreshold, false, 0, 0,
-                                           "The accumulator threshold for the template centers at the detection stage. The smaller it is, the more false positions may be detected.");
-                      obj.info()->addParam(obj, "dp", obj.dp, false, 0, 0,
-                                           "Inverse ratio of the accumulator resolution to the image resolution.");
-                      obj.info()->addParam(obj, "minAngle", obj.minAngle, false, 0, 0,
-                                           "Minimal rotation angle to detect in degrees.");
-                      obj.info()->addParam(obj, "maxAngle", obj.maxAngle, false, 0, 0,
-                                           "Maximal rotation angle to detect in degrees.");
-                      obj.info()->addParam(obj, "angleStep", obj.angleStep, false, 0, 0,
-                                           "Angle step in degrees."));
-
-    GHT_Ballard_PosRotation::GHT_Ballard_PosRotation()
-    {
-        minAngle = 0.0;
-        maxAngle = 360.0;
-        angleStep = 1.0;
-    }
-
-    class GHT_Ballard_PosRotation::Worker : public ParallelLoopBody
-    {
-    public:
-        explicit Worker(GHT_Ballard_PosRotation* base_) : base(base_) {}
-
-        void operator ()(const Range& range) const;
-
-    private:
-        GHT_Ballard_PosRotation* base;
-    };
-
-    void GHT_Ballard_PosRotation::Worker::operator ()(const Range& range) const
-    {
-        const double thetaScale = base->levels / 360.0;
-        const double idp = 1.0 / base->dp;
-
-        for (int a = range.start; a < range.end; ++a)
-        {
-            const double angle = base->minAngle + a * base->angleStep;
-
-            const double sinA = ::sin(toRad(angle));
-            const double cosA = ::cos(toRad(angle));
-
-            Mat curHist(base->hist.size[1], base->hist.size[2], CV_32SC1, base->hist.ptr(a + 1), base->hist.step[1]);
-
-            for (int y = 0; y < base->imageSize.height; ++y)
-            {
-                const uchar* edgesRow = base->imageEdges.ptr(y);
-                const float* dxRow = base->imageDx.ptr<float>(y);
-                const float* dyRow = base->imageDy.ptr<float>(y);
-
-                for (int x = 0; x < base->imageSize.width; ++x)
-                {
-                    const Point2d p(x, y);
-
-                    if (edgesRow[x] && (notNull(dyRow[x]) || notNull(dxRow[x])))
-                    {
-                        double theta = fastAtan2(dyRow[x], dxRow[x]) - angle;
-                        if (theta < 0)
-                            theta += 360.0;
-                        const int n = cvRound(theta * thetaScale);
-
-                        const std::vector<Point>& r_row = base->r_table[n];
-
-                        for (size_t j = 0; j < r_row.size(); ++j)
-                        {
-                            Point2d d = r_row[j];
-                            Point2d c = p - Point2d(d.x * cosA - d.y * sinA, d.x * sinA + d.y * cosA);
-
-                            c.x *= idp;
-                            c.y *= idp;
-
-                            if (c.x >= 0 && c.x < base->hist.size[2] - 2 && c.y >= 0 && c.y < base->hist.size[1] - 2)
-                                ++curHist.at<int>(cvRound(c.y + 1), cvRound(c.x + 1));
-                        }
-                    }
-                }
-            }
-        }
-    }
-
-    void GHT_Ballard_PosRotation::calcHist()
-    {
-        CV_Assert(imageEdges.type() == CV_8UC1);
-        CV_Assert(imageDx.type() == CV_32FC1 && imageDx.size() == imageSize);
-        CV_Assert(imageDy.type() == imageDx.type() && imageDy.size() == imageSize);
-        CV_Assert(levels > 0 && r_table.size() == static_cast<size_t>(levels + 1));
-        CV_Assert(dp > 0.0);
-        CV_Assert(minAngle >= 0.0 && minAngle < maxAngle && maxAngle <= 360.0);
-        CV_Assert(angleStep > 0.0 && angleStep < 360.0);
-
-        const double idp = 1.0 / dp;
-        const int angleRange = cvCeil((maxAngle - minAngle) / angleStep);
-
-        const int sizes[] = {angleRange + 2, cvCeil(imageSize.height * idp) + 2, cvCeil(imageSize.width * idp) + 2};
-        hist.create(3, sizes, CV_32SC1);
-        hist.setTo(0);
-
-        parallel_for_(Range(0, angleRange), Worker(this));
-    }
-
-    void GHT_Ballard_PosRotation::findPosInHist()
-    {
-        CV_Assert(votesThreshold > 0);
-
-        const int angleRange = hist.size[0] - 2;
-        const int histRows = hist.size[1] - 2;
-        const int histCols = hist.size[2] - 2;
-
-        for (int a = 0; a < angleRange; ++a)
-        {
-            const float angle = static_cast<float>(minAngle + a * angleStep);
-
-            const Mat prevHist(histRows + 2, histCols + 2, CV_32SC1, hist.ptr(a), hist.step[1]);
-            const Mat curHist(histRows + 2, histCols + 2, CV_32SC1, hist.ptr(a + 1), hist.step[1]);
-            const Mat nextHist(histRows + 2, histCols + 2, CV_32SC1, hist.ptr(a + 2), hist.step[1]);
-
-            for(int y = 0; y < histRows; ++y)
-            {
-                const int* prevHistRow = prevHist.ptr<int>(y + 1);
-                const int* prevRow = curHist.ptr<int>(y);
-                const int* curRow = curHist.ptr<int>(y + 1);
-                const int* nextRow = curHist.ptr<int>(y + 2);
-                const int* nextHistRow = nextHist.ptr<int>(y + 1);
-
-                for(int x = 0; x < histCols; ++x)
-                {
-                    const int votes = curRow[x + 1];
-
-                    if (votes > votesThreshold &&
-                        votes > curRow[x] &&
-                        votes >= curRow[x + 2] &&
-                        votes > prevRow[x + 1] &&
-                        votes >= nextRow[x + 1] &&
-                        votes > prevHistRow[x + 1] &&
-                        votes >= nextHistRow[x + 1])
-                    {
-                        posOutBuf.push_back(Vec4f(static_cast<float>(x * dp), static_cast<float>(y * dp), 1.0f, angle));
-                        voteOutBuf.push_back(Vec3i(votes, 0, votes));
-                    }
-                }
-            }
-        }
-    }
-
-    /////////////////////////////////////////
-    // POSITION & SCALE & ROTATION
-
-    double clampAngle(double a)
-    {
-        double res = a;
-
-        while (res > 360.0)
-            res -= 360.0;
-        while (res < 0)
-            res += 360.0;
-
-        return res;
-    }
-
-    bool angleEq(double a, double b, double eps = 1.0)
-    {
-        return (fabs(clampAngle(a - b)) <= eps);
-    }
-
-    class GHT_Guil_Full : public GHT_Pos
-    {
-    public:
-        AlgorithmInfo* info() const;
-
-        GHT_Guil_Full();
-
-    protected:
-        void releaseImpl();
-
         void processTempl();
         void processImage();
 
+        int maxBufferSize_;  // per-level cap on stored features, also used as the reserve size; constructor default 1000
+        double xi_;  // required direction difference (degrees) between the two points of a feature; default 90.0
+        int levels_;  // number of feature-table levels (tables hold levels_ + 1 rows); default 360
+        double angleEpsilon_;  // tolerance handed to angleEq; default 1.0
+
+        double minAngle_;  // lower bound of the rotation search, degrees; default 0.0
+        double maxAngle_;  // upper bound of the rotation search, degrees; default 360.0
+        double angleStep_;  // rotation histogram bin width, degrees; default 1.0
+        int angleThresh_;  // votes a rotation bin needs to be kept; default 15000
+
+        double minScale_;  // lower bound of the scale search; default 0.5
+        double maxScale_;  // upper bound of the scale search; default 2.0
+        double scaleStep_;  // scale histogram bin width; default 0.05
+        int scaleThresh_;  // votes a scale bin needs to be kept; default 1000
+
+        int posThresh_;  // position threshold checked (> 0) in calcPosition; default 100
+
         struct ContourPoint
         {
             Point2d pos;
@@ -828,137 +606,92 @@ namespace
         void calcScale(double angle);
         void calcPosition(double angle, int angleVotes, double scale, int scaleVotes);
 
-        int maxSize;
-        double xi;
-        int levels;
-        double angleEpsilon;
+        std::vector< std::vector<Feature> > templFeatures_;  // template features, one bucket per level; filled by processTempl via buildFeatureList
+        std::vector< std::vector<Feature> > imageFeatures_;  // image features, same layout; filled by processImage via buildFeatureList
 
-        double minAngle;
-        double maxAngle;
-        double angleStep;
-        int angleThresh;
-
-        double minScale;
-        double maxScale;
-        double scaleStep;
-        int scaleThresh;
-
-        double dp;
-        int posThresh;
-
-        std::vector< std::vector<Feature> > templFeatures;
-        std::vector< std::vector<Feature> > imageFeatures;
-
-        std::vector< std::pair<double, int> > angles;
-        std::vector< std::pair<double, int> > scales;
+        std::vector< std::pair<double, int> > angles_;  // (rotation angle, votes) candidates produced by calcOrientation
+        std::vector< std::pair<double, int> > scales_;  // (scale, votes) candidates produced by calcScale; rebuilt on every calcScale call
     };
 
-    CV_INIT_ALGORITHM(GHT_Guil_Full, "GeneralizedHough.POSITION_SCALE_ROTATION",
-                      obj.info()->addParam(obj, "minDist", obj.minDist, false, 0, 0,
-                                           "Minimum distance between the centers of the detected objects.");
-                      obj.info()->addParam(obj, "maxSize", obj.maxSize, false, 0, 0,
-                                           "Maximal size of inner buffers.");
-                      obj.info()->addParam(obj, "xi", obj.xi, false, 0, 0,
-                                           "Angle difference in degrees between two points in feature.");
-                      obj.info()->addParam(obj, "levels", obj.levels, false, 0, 0,
-                                           "Feature table levels.");
-                      obj.info()->addParam(obj, "angleEpsilon", obj.angleEpsilon, false, 0, 0,
-                                           "Maximal difference between angles that treated as equal.");
-                      obj.info()->addParam(obj, "minAngle", obj.minAngle, false, 0, 0,
-                                           "Minimal rotation angle to detect in degrees.");
-                      obj.info()->addParam(obj, "maxAngle", obj.maxAngle, false, 0, 0,
-                                           "Maximal rotation angle to detect in degrees.");
-                      obj.info()->addParam(obj, "angleStep", obj.angleStep, false, 0, 0,
-                                           "Angle step in degrees.");
-                      obj.info()->addParam(obj, "angleThresh", obj.angleThresh, false, 0, 0,
-                                           "Angle threshold.");
-                      obj.info()->addParam(obj, "minScale", obj.minScale, false, 0, 0,
-                                           "Minimal scale to detect.");
-                      obj.info()->addParam(obj, "maxScale", obj.maxScale, false, 0, 0,
-                                           "Maximal scale to detect.");
-                      obj.info()->addParam(obj, "scaleStep", obj.scaleStep, false, 0, 0,
-                                           "Scale step.");
-                      obj.info()->addParam(obj, "scaleThresh", obj.scaleThresh, false, 0, 0,
-                                           "Scale threshold.");
-                      obj.info()->addParam(obj, "dp", obj.dp, false, 0, 0,
-                                           "Inverse ratio of the accumulator resolution to the image resolution.");
-                      obj.info()->addParam(obj, "posThresh", obj.posThresh, false, 0, 0,
-                                           "Position threshold."));
-
-    GHT_Guil_Full::GHT_Guil_Full()
+    double clampAngle(double a)  // Wraps an angle in degrees into the closed range [0, 360] by adding or subtracting whole turns.
     {
-        maxSize = 1000;
-        xi = 90.0;
-        levels = 360;
-        angleEpsilon = 1.0;
+        double res = a;  // working copy; the argument itself is left untouched
 
-        minAngle = 0.0;
-        maxAngle = 360.0;
-        angleStep = 1.0;
-        angleThresh = 15000;
+        while (res > 360.0)  // strict comparison: an input of exactly 360 is returned as 360, not folded to 0
+            res -= 360.0;  // remove one full turn per iteration
+        while (res < 0)  // negative angles are lifted into range
+            res += 360.0;  // add one full turn per iteration
 
-        minScale = 0.5;
-        maxScale = 2.0;
-        scaleStep = 0.05;
-        scaleThresh = 1000;
-
-        dp = 1.0;
-        posThresh = 100;
+        return res;  // always within [0, 360] for finite input
     }
 
-    void GHT_Guil_Full::releaseImpl()
+    bool angleEq(double a, double b, double eps = 1.0)  // True when the wrapped difference a - b (degrees) is at most eps.
     {
-        GHT_Pos::releaseImpl();
-
-        releaseVector(templFeatures);
-        releaseVector(imageFeatures);
-
-        releaseVector(angles);
-        releaseVector(scales);
+        return (fabs(clampAngle(a - b)) <= eps);  // clampAngle is never negative, so fabs is a no-op; NOTE(review): a slightly below b wraps to ~360 and compares unequal, i.e. the tolerance is one-sided — confirm this is intended
     }
 
-    void GHT_Guil_Full::processTempl()
+    GeneralizedHoughGuilImpl::GeneralizedHoughGuilImpl()  // Sets the defaults of the parameters declared in this class; NOTE(review): dp_ is not assigned here, presumably GeneralizedHoughBase initialises it — confirm
     {
-        buildFeatureList(templEdges, templDx, templDy, templFeatures, templCenter);
+        maxBufferSize_ = 1000;  // per-level feature cap
+        xi_ = 90.0;  // degrees between the two point directions of a feature
+        levels_ = 360;  // with 360 levels the degree-to-level factor levels_ / 360.0 is 1
+        angleEpsilon_ = 1.0;  // angle comparison tolerance
+
+        minAngle_ = 0.0;  // rotation search covers the whole [0, 360] range by default
+        maxAngle_ = 360.0;  // upper end of the rotation search
+        angleStep_ = 1.0;  // rotation bin width, degrees
+        angleThresh_ = 15000;  // votes needed to keep a rotation bin
+
+        minScale_ = 0.5;  // scale search covers [0.5, 2.0] by default
+        maxScale_ = 2.0;  // upper end of the scale search
+        scaleStep_ = 0.05;  // scale bin width
+        scaleThresh_ = 1000;  // votes needed to keep a scale bin
+
+        posThresh_ = 100;  // position threshold used by calcPosition
     }
 
-    void GHT_Guil_Full::processImage()
+    void GeneralizedHoughGuilImpl::processTempl()  // Builds the template feature table from the stored template edges and gradients.
     {
-        buildFeatureList(imageEdges, imageDx, imageDy, imageFeatures);
+        buildFeatureList(templEdges_, templDx_, templDy_, templFeatures_, templCenter_);  // unlike processImage, the template center is passed explicitly
+    }
+
+    void GeneralizedHoughGuilImpl::processImage()  // Detection driver: image features, then rotation candidates, then scale candidates per rotation, then positions per (rotation, scale).
+    {
+        buildFeatureList(imageEdges_, imageDx_, imageDy_, imageFeatures_);  // no center argument: relies on a default for the last parameter (declaration not visible here — confirm)
 
         calcOrientation();
 
-        for (size_t i = 0; i < angles.size(); ++i)
+        for (size_t i = 0; i < angles_.size(); ++i)  // angles_ was just filled by calcOrientation
         {
-            const double angle = angles[i].first;
-            const int angleVotes = angles[i].second;
+            const double angle = angles_[i].first;  // candidate rotation
+            const int angleVotes = angles_[i].second;  // votes that rotation bin received
 
             calcScale(angle);
 
-            for (size_t j = 0; j < scales.size(); ++j)
+            for (size_t j = 0; j < scales_.size(); ++j)  // scales_ is cleared and refilled by the calcScale call above, so it is specific to this angle
             {
-                const double scale = scales[j].first;
-                const int scaleVotes = scales[j].second;
+                const double scale = scales_[j].first;  // candidate scale
+                const int scaleVotes = scales_[j].second;  // votes that scale bin received
 
                 calcPosition(angle, angleVotes, scale, scaleVotes);
             }
         }
     }
 
-    void GHT_Guil_Full::buildFeatureList(const Mat& edges, const Mat& dx, const Mat& dy, std::vector< std::vector<Feature> >& features, Point2d center)
+    void GeneralizedHoughGuilImpl::buildFeatureList(const Mat& edges, const Mat& dx, const Mat& dy, std::vector< std::vector<Feature> >& features, Point2d center)  // Fills `features` (one bucket per level) with point-pair features taken from the contour points of `edges`.
     {
-        CV_Assert(levels > 0);
+        CV_Assert( levels_ > 0 );  // levels_ sizes the table and scales the bucket index below
 
-        const double maxDist = sqrt((double) templSize.width * templSize.width + templSize.height * templSize.height) * maxScale;
+        const double maxDist = sqrt((double) templSize_.width * templSize_.width + templSize_.height * templSize_.height) * maxScale_;  // template diagonal times maxScale_; NOTE(review): the (double) cast covers only the width product, height * height is evaluated in int
 
-        const double alphaScale = levels / 360.0;
+        const double alphaScale = levels_ / 360.0;  // converts an angle in degrees to a bucket index
 
         std::vector<ContourPoint> points;
         getContourPoints(edges, dx, dy, points);
 
-        features.resize(levels + 1);
-        for_each(features.begin(), features.end(), mem_fun_ref(&std::vector<Feature>::clear));
-        for_each(features.begin(), features.end(), bind2nd(mem_fun_ref(&std::vector<Feature>::reserve), maxSize));
+        features.resize(levels_ + 1);  // levels_ + 1 buckets, so index levels_ itself is valid
+        std::for_each(features.begin(), features.end(), std::mem_fun_ref(&std::vector<Feature>::clear));  // empty every bucket; resize alone keeps contents from a previous call
+        std::for_each(features.begin(), features.end(), std::bind2nd(std::mem_fun_ref(&std::vector<Feature>::reserve), maxBufferSize_));  // reserve maxBufferSize_ slots in each bucket up front; NOTE(review): bind2nd / mem_fun_ref were removed in C++17
 
         for (size_t i = 0; i < points.size(); ++i)
         {
@@ -968,7 +701,7 @@ namespace
             {
                 ContourPoint p2 = points[j];
 
-                if (angleEq(p1.theta - p2.theta, xi, angleEpsilon))
+                if (angleEq(p1.theta - p2.theta, xi_, angleEpsilon_))  // keep only pairs whose direction difference matches xi_ within angleEpsilon_
                 {
                     const Point2d d = p1.pos - p2.pos;
 
@@ -988,18 +721,18 @@ namespace
 
                     const int n = cvRound(f.alpha12 * alphaScale);
 
-                    if (features[n].size() < static_cast<size_t>(maxSize))
+                    if (features[n].size() < static_cast<size_t>(maxBufferSize_))  // a full bucket silently drops further features for that level
                         features[n].push_back(f);
                 }
             }
         }
     }
 
-    void GHT_Guil_Full::getContourPoints(const Mat& edges, const Mat& dx, const Mat& dy, std::vector<ContourPoint>& points)
+    void GeneralizedHoughGuilImpl::getContourPoints(const Mat& edges, const Mat& dx, const Mat& dy, std::vector<ContourPoint>& points)  // Rebuilds `points` from the edge map and its gradients (the collecting loop is outside the visible hunk).
     {
-        CV_Assert(edges.type() == CV_8UC1);
-        CV_Assert(dx.type() == CV_32FC1 && dx.size == edges.size);
-        CV_Assert(dy.type() == dx.type() && dy.size == edges.size);
+        CV_Assert( edges.type() == CV_8UC1 );  // edge map must be single-channel 8-bit
+        CV_Assert( dx.type() == CV_32FC1 && dx.size == edges.size );  // x-gradient must be single-channel float with the same dimensions as edges
+        CV_Assert( dy.type() == dx.type() && dy.size == edges.size );  // y-gradient must match dx in type and edges in dimensions
 
         points.clear();
         points.reserve(edges.size().area());
@@ -1025,23 +758,23 @@ namespace
         }
     }
 
-    void GHT_Guil_Full::calcOrientation()
+    void GeneralizedHoughGuilImpl::calcOrientation()  // Votes template/image feature pairs into a rotation histogram and stores the bins reaching angleThresh_ in angles_.
     {
-        CV_Assert(levels > 0);
-        CV_Assert(templFeatures.size() == static_cast<size_t>(levels + 1));
-        CV_Assert(imageFeatures.size() == templFeatures.size());
-        CV_Assert(minAngle >= 0.0 && minAngle < maxAngle && maxAngle <= 360.0);
-        CV_Assert(angleStep > 0.0 && angleStep < 360.0);
-        CV_Assert(angleThresh > 0);
+        CV_Assert( levels_ > 0 );  // needed for the table-size check below
+        CV_Assert( templFeatures_.size() == static_cast<size_t>(levels_ + 1) );  // template table must have been built with the current levels_
+        CV_Assert( imageFeatures_.size() == templFeatures_.size() );  // both tables are indexed by the same level i
+        CV_Assert( minAngle_ >= 0.0 && minAngle_ < maxAngle_ && maxAngle_ <= 360.0 );  // search range must be a non-empty sub-range of [0, 360]
+        CV_Assert( angleStep_ > 0.0 && angleStep_ < 360.0 );  // also guards the division below
+        CV_Assert( angleThresh_ > 0 );  // a non-positive threshold is rejected
 
-        const double iAngleStep = 1.0 / angleStep;
-        const int angleRange = cvCeil((maxAngle - minAngle) * iAngleStep);
+        const double iAngleStep = 1.0 / angleStep_;  // reciprocal so binning multiplies instead of divides
+        const int angleRange = cvCeil((maxAngle_ - minAngle_) * iAngleStep);  // number of angleStep_-wide bins covering the search range
 
         std::vector<int> OHist(angleRange + 1, 0);
-        for (int i = 0; i <= levels; ++i)
+        for (int i = 0; i <= levels_; ++i)  // inclusive bound: the tables hold levels_ + 1 buckets
         {
-            const std::vector<Feature>& templRow = templFeatures[i];
-            const std::vector<Feature>& imageRow = imageFeatures[i];
+            const std::vector<Feature>& templRow = templFeatures_[i];  // template features of this level
+            const std::vector<Feature>& imageRow = imageFeatures_[i];  // image features of the same level
 
             for (size_t j = 0; j < templRow.size(); ++j)
             {
@@ -1052,45 +785,45 @@ namespace
                     Feature imF = imageRow[k];
 
                     const double angle = clampAngle(imF.p1.theta - templF.p1.theta);
-                    if (angle >= minAngle && angle <= maxAngle)
+                    if (angle >= minAngle_ && angle <= maxAngle_)  // only rotations inside the configured range vote
                     {
-                        const int n = cvRound((angle - minAngle) * iAngleStep);
+                        const int n = cvRound((angle - minAngle_) * iAngleStep);  // nearest bin; can equal angleRange, hence the histogram's extra slot
                         ++OHist[n];
                     }
                 }
             }
         }
 
-        angles.clear();
+        angles_.clear();  // drop candidates from any previous run
 
         for (int n = 0; n < angleRange; ++n)
         {
-            if (OHist[n] >= angleThresh)
+            if (OHist[n] >= angleThresh_)  // NOTE(review): the enclosing loop stops at angleRange - 1, so votes landing in bin angleRange are never reported — confirm intended
             {
-                const double angle = minAngle + n * angleStep;
-                angles.push_back(std::make_pair(angle, OHist[n]));
+                const double angle = minAngle_ + n * angleStep_;  // bin index converted back to degrees
+                angles_.push_back(std::make_pair(angle, OHist[n]));  // keep the angle together with its vote count
             }
         }
     }
 
-    void GHT_Guil_Full::calcScale(double angle)
+    void GeneralizedHoughGuilImpl::calcScale(double angle)  // Votes feature-pair distance ratios into a scale histogram and stores the bins reaching scaleThresh_ in scales_ (the use of `angle` is outside the visible hunks).
     {
-        CV_Assert(levels > 0);
-        CV_Assert(templFeatures.size() == static_cast<size_t>(levels + 1));
-        CV_Assert(imageFeatures.size() == templFeatures.size());
-        CV_Assert(minScale > 0.0 && minScale < maxScale);
-        CV_Assert(scaleStep > 0.0);
-        CV_Assert(scaleThresh > 0);
+        CV_Assert( levels_ > 0 );  // needed for the table-size check below
+        CV_Assert( templFeatures_.size() == static_cast<size_t>(levels_ + 1) );  // template table must have been built with the current levels_
+        CV_Assert( imageFeatures_.size() == templFeatures_.size() );  // both tables are indexed by the same level i
+        CV_Assert( minScale_ > 0.0 && minScale_ < maxScale_ );  // search range must be positive and non-empty
+        CV_Assert( scaleStep_ > 0.0 );  // also guards the division below
+        CV_Assert( scaleThresh_ > 0 );  // a non-positive threshold is rejected
 
-        const double iScaleStep = 1.0 / scaleStep;
-        const int scaleRange = cvCeil((maxScale - minScale) * iScaleStep);
+        const double iScaleStep = 1.0 / scaleStep_;  // reciprocal so binning multiplies instead of divides
+        const int scaleRange = cvCeil((maxScale_ - minScale_) * iScaleStep);  // number of scaleStep_-wide bins covering the search range
 
         std::vector<int> SHist(scaleRange + 1, 0);
 
-        for (int i = 0; i <= levels; ++i)
+        for (int i = 0; i <= levels_; ++i)  // inclusive bound: the tables hold levels_ + 1 buckets
         {
-            const std::vector<Feature>& templRow = templFeatures[i];
-            const std::vector<Feature>& imageRow = imageFeatures[i];
+            const std::vector<Feature>& templRow = templFeatures_[i];  // template features of this level
+            const std::vector<Feature>& imageRow = imageFeatures_[i];  // image features of the same level
 
             for (size_t j = 0; j < templRow.size(); ++j)
             {
@@ -1102,12 +835,12 @@ namespace
                 {
                     Feature imF = imageRow[k];
 
-                    if (angleEq(imF.p1.theta, templF.p1.theta, angleEpsilon))
+                    if (angleEq(imF.p1.theta, templF.p1.theta, angleEpsilon_))  // only pairs whose first-point directions agree within angleEpsilon_ vote
                     {
                         const double scale = imF.d12 / templF.d12;
-                        if (scale >= minScale && scale <= maxScale)
+                        if (scale >= minScale_ && scale <= maxScale_)  // ratios outside the configured range are ignored
                         {
-                            const int s = cvRound((scale - minScale) * iScaleStep);
+                            const int s = cvRound((scale - minScale_) * iScaleStep);  // nearest bin; can equal scaleRange, hence the histogram's extra slot
                             ++SHist[s];
                         }
                     }
@@ -1115,39 +848,39 @@ namespace
             }
         }
 
-        scales.clear();
+        scales_.clear();  // results of a previous angle are discarded
 
         for (int s = 0; s < scaleRange; ++s)
         {
-            if (SHist[s] >= scaleThresh)
+            if (SHist[s] >= scaleThresh_)  // NOTE(review): the enclosing loop stops at scaleRange - 1, so votes landing in bin scaleRange are never reported — confirm intended
             {
-                const double scale = minScale + s * scaleStep;
-                scales.push_back(std::make_pair(scale, SHist[s]));
+                const double scale = minScale_ + s * scaleStep_;  // bin index converted back to a scale factor
+                scales_.push_back(std::make_pair(scale, SHist[s]));  // keep the scale together with its vote count
             }
         }
     }
 
-    void GHT_Guil_Full::calcPosition(double angle, int angleVotes, double scale, int scaleVotes)
+    void GeneralizedHoughGuilImpl::calcPosition(double angle, int angleVotes, double scale, int scaleVotes)
     {
-        CV_Assert(levels > 0);
-        CV_Assert(templFeatures.size() == static_cast<size_t>(levels + 1));
-        CV_Assert(imageFeatures.size() == templFeatures.size());
-        CV_Assert(dp > 0.0);
-        CV_Assert(posThresh > 0);
+        CV_Assert( levels_ > 0 );
+        CV_Assert( templFeatures_.size() == static_cast<size_t>(levels_ + 1) );
+        CV_Assert( imageFeatures_.size() == templFeatures_.size() );
+        CV_Assert( dp_ > 0.0 );
+        CV_Assert( posThresh_ > 0 );
 
         const double sinVal = sin(toRad(angle));
         const double cosVal = cos(toRad(angle));
-        const double idp = 1.0 / dp;
+        const double idp = 1.0 / dp_;
 
-        const int histRows = cvCeil(imageSize.height * idp);
-        const int histCols = cvCeil(imageSize.width * idp);
+        const int histRows = cvCeil(imageSize_.height * idp);
+        const int histCols = cvCeil(imageSize_.width * idp);
 
         Mat DHist(histRows + 2, histCols + 2, CV_32SC1, Scalar::all(0));
 
-        for (int i = 0; i <= levels; ++i)
+        for (int i = 0; i <= levels_; ++i)
         {
-            const std::vector<Feature>& templRow = templFeatures[i];
-            const std::vector<Feature>& imageRow = imageFeatures[i];
+            const std::vector<Feature>& templRow = templFeatures_[i];
+            const std::vector<Feature>& imageRow = imageFeatures_[i];
 
             for (size_t j = 0; j < templRow.size(); ++j)
             {
@@ -1165,7 +898,7 @@ namespace
                 {
                     Feature imF = imageRow[k];
 
-                    if (angleEq(imF.p1.theta, templF.p1.theta, angleEpsilon))
+                    if (angleEq(imF.p1.theta, templF.p1.theta, angleEpsilon_))
                     {
                         Point2d c1, c2;
 
@@ -1195,101 +928,17 @@ namespace
             {
                 const int votes = curRow[x + 1];
 
-                if (votes > posThresh && votes > curRow[x] && votes >= curRow[x + 2] && votes > prevRow[x + 1] && votes >= nextRow[x + 1])
+                if (votes > posThresh_ && votes > curRow[x] && votes >= curRow[x + 2] && votes > prevRow[x + 1] && votes >= nextRow[x + 1])
                 {
-                    posOutBuf.push_back(Vec4f(static_cast<float>(x * dp), static_cast<float>(y * dp), static_cast<float>(scale), static_cast<float>(angle)));
-                    voteOutBuf.push_back(Vec3i(votes, scaleVotes, angleVotes));
+                    posOutBuf_.push_back(Vec4f(static_cast<float>(x * dp_), static_cast<float>(y * dp_), static_cast<float>(scale), static_cast<float>(angle)));
+                    voteOutBuf_.push_back(Vec3i(votes, scaleVotes, angleVotes));
                 }
             }
         }
     }
 }
 
-Ptr<GeneralizedHough> cv::GeneralizedHough::create(int method)
+Ptr<GeneralizedHoughGuil> cv::createGeneralizedHoughGuil()
 {
-    switch (method)
-    {
-    case GHT_POSITION:
-        CV_Assert( !GHT_Ballard_Pos_info_auto.name().empty() );
-        return new GHT_Ballard_Pos();
-
-    case (GHT_POSITION | GHT_SCALE):
-        CV_Assert( !GHT_Ballard_PosScale_info_auto.name().empty() );
-        return new GHT_Ballard_PosScale();
-
-    case (GHT_POSITION | GHT_ROTATION):
-        CV_Assert( !GHT_Ballard_PosRotation_info_auto.name().empty() );
-        return new GHT_Ballard_PosRotation();
-
-    case (GHT_POSITION | GHT_SCALE | GHT_ROTATION):
-        CV_Assert( !GHT_Guil_Full_info_auto.name().empty() );
-        return new GHT_Guil_Full();
-    }
-
-    CV_Error(CV_StsBadArg, "Unsupported method");
-    return Ptr<GeneralizedHough>();
-}
-
-cv::GeneralizedHough::~GeneralizedHough()
-{
-}
-
-void cv::GeneralizedHough::setTemplate(InputArray _templ, int cannyThreshold, Point templCenter)
-{
-    Mat templ = _templ.getMat();
-
-    CV_Assert(templ.type() == CV_8UC1);
-    CV_Assert(cannyThreshold > 0);
-
-    Canny(templ, edges_, cannyThreshold / 2, cannyThreshold);
-    Sobel(templ, dx_, CV_32F, 1, 0);
-    Sobel(templ, dy_, CV_32F, 0, 1);
-
-    if (templCenter == Point(-1, -1))
-        templCenter = Point(templ.cols / 2, templ.rows / 2);
-
-    setTemplateImpl(edges_, dx_, dy_, templCenter);
-}
-
-void cv::GeneralizedHough::setTemplate(InputArray _edges, InputArray _dx, InputArray _dy, Point templCenter)
-{
-    Mat edges = _edges.getMat();
-    Mat dx = _dx.getMat();
-    Mat dy = _dy.getMat();
-
-    if (templCenter == Point(-1, -1))
-        templCenter = Point(edges.cols / 2, edges.rows / 2);
-
-    setTemplateImpl(edges, dx, dy, templCenter);
-}
-
-void cv::GeneralizedHough::detect(InputArray _image, OutputArray positions, OutputArray votes, int cannyThreshold)
-{
-    Mat image = _image.getMat();
-
-    CV_Assert(image.type() == CV_8UC1);
-    CV_Assert(cannyThreshold > 0);
-
-    Canny(image, edges_, cannyThreshold / 2, cannyThreshold);
-    Sobel(image, dx_, CV_32F, 1, 0);
-    Sobel(image, dy_, CV_32F, 0, 1);
-
-    detectImpl(edges_, dx_, dy_, positions, votes);
-}
-
-void cv::GeneralizedHough::detect(InputArray _edges, InputArray _dx, InputArray _dy, OutputArray positions, OutputArray votes)
-{
-    cv::Mat edges = _edges.getMat();
-    cv::Mat dx = _dx.getMat();
-    cv::Mat dy = _dy.getMat();
-
-    detectImpl(edges, dx, dy, positions, votes);
-}
-
-void cv::GeneralizedHough::release()
-{
-    edges_.release();
-    dx_.release();
-    dy_.release();
-    releaseImpl();
+    return new GeneralizedHoughGuilImpl;
 }
diff --git a/modules/photo/doc/inpainting.rst b/modules/photo/doc/inpainting.rst
index 58db4dbd90..625f70a263 100644
--- a/modules/photo/doc/inpainting.rst
+++ b/modules/photo/doc/inpainting.rst
@@ -23,7 +23,7 @@ Restores the selected region in an image using the region neighborhood.
 
     :param flags: Inpainting method that could be one of the following:
 
-            * **INPAINT_NS**     Navier-Stokes based method.
+            * **INPAINT_NS**     Navier-Stokes based method [Navier01]_.
 
             * **INPAINT_TELEA**     Method by Alexandru Telea  [Telea04]_.
 
@@ -36,3 +36,8 @@ for more details.
    * An example using the inpainting technique can be found at opencv_source_code/samples/cpp/inpaint.cpp
 
    * (Python) An example using the inpainting technique can be found at opencv_source_code/samples/python2/inpaint.py
+
+
+.. [Telea04] Telea, Alexandru. "An image inpainting technique based on the fast marching method." Journal of graphics tools 9, no. 1 (2004): 23-34.
+
+.. [Navier01] Bertalmio, Marcelo, Andrea L. Bertozzi, and Guillermo Sapiro. "Navier-Stokes, fluid dynamics, and image and video inpainting." In Computer Vision and Pattern Recognition, 2001. CVPR 2001. Proceedings of the 2001 IEEE Computer Society Conference on, vol. 1, pp. I-355. IEEE, 2001.
diff --git a/modules/stitching/src/motion_estimators.cpp b/modules/stitching/src/motion_estimators.cpp
index abd43b11fb..7ceb49584f 100644
--- a/modules/stitching/src/motion_estimators.cpp
+++ b/modules/stitching/src/motion_estimators.cpp
@@ -42,13 +42,7 @@
 
 #include "precomp.hpp"
 #include "opencv2/calib3d/calib3d_c.h"
-
-#ifdef _MSC_VER
-  #include <float.h>
-  #define isnan(x) _isnan(x)
-#else
-  #include <math.h>
-#endif
+#include "opencv2/core/cvdef.h"
 
 using namespace cv;
 using namespace cv::detail;
@@ -259,7 +253,7 @@ bool BundleAdjusterBase::estimate(const std::vector<ImageFeatures> &features,
     bool ok = true;
     for (int i = 0; i < cam_params_.rows; ++i)
     {
-        if (isnan(cam_params_.at<double>(i,0)))
+        if (cvIsNaN(cam_params_.at<double>(i,0)))
         {
             ok = false;
             break;
diff --git a/modules/video/doc/motion_analysis_and_object_tracking.rst b/modules/video/doc/motion_analysis_and_object_tracking.rst
index d7a762ce79..b392c68544 100644
--- a/modules/video/doc/motion_analysis_and_object_tracking.rst
+++ b/modules/video/doc/motion_analysis_and_object_tracking.rst
@@ -170,6 +170,8 @@ Finds the geometric transform (warp) between two images in terms of the ECC crit
 
 .. ocv:function:: double findTransformECC( InputArray templateImage, InputArray inputImage, InputOutputArray warpMatrix, int motionType=MOTION_AFFINE, TermCriteria criteria=TermCriteria(TermCriteria::COUNT+TermCriteria::EPS, 50, 0.001))
 
+.. ocv:pyfunction:: cv2.findTransformECC(templateImage, inputImage, warpMatrix[, motionType[, criteria]]) -> retval, warpMatrix
+
     :param templateImage: single-channel template image; ``CV_8U`` or ``CV_32F`` array.
 
     :param inputImage: single-channel input image which should be warped with the final ``warpMatrix`` in order to provide an image similar to ``templateImage``, same type as ``temlateImage``.
diff --git a/samples/gpu/generalized_hough.cpp b/samples/gpu/generalized_hough.cpp
index dbd924f752..1863085256 100644
--- a/samples/gpu/generalized_hough.cpp
+++ b/samples/gpu/generalized_hough.cpp
@@ -5,13 +5,12 @@
 #include "opencv2/core.hpp"
 #include "opencv2/core/utility.hpp"
 #include "opencv2/imgproc.hpp"
-#include "opencv2/gpu.hpp"
+#include "opencv2/gpuimgproc.hpp"
 #include "opencv2/highgui.hpp"
 #include "opencv2/contrib.hpp"
 
 using namespace std;
 using namespace cv;
-using cv::gpu::GpuMat;
 
 static Mat loadImage(const string& name)
 {
@@ -29,8 +28,7 @@ int main(int argc, const char* argv[])
     CommandLineParser cmd(argc, argv,
         "{ image i        | pic1.png  | input image }"
         "{ template t     | templ.png | template image }"
-        "{ scale s        |           | estimate scale }"
-        "{ rotation r     |           | estimate rotation }"
+        "{ full           |           | estimate scale and rotation }"
         "{ gpu            |           | use gpu version }"
         "{ minDist        | 100       | minimum distance between the centers of the detected objects }"
         "{ levels         | 360       | R-Table levels }"
@@ -45,7 +43,7 @@ int main(int argc, const char* argv[])
         "{ minAngle       | 0         | minimal rotation angle to detect in degrees }"
         "{ maxAngle       | 360       | maximal rotation angle to detect in degrees }"
         "{ angleStep      | 1         | angle step in degrees }"
-        "{ maxSize        | 1000      | maximal size of inner buffers }"
+        "{ maxBufSize     | 1000      | maximal size of inner buffers }"
         "{ help h ?       |           | print help message }"
     );
 
@@ -59,8 +57,7 @@ int main(int argc, const char* argv[])
 
     const string templName = cmd.get<string>("template");
     const string imageName = cmd.get<string>("image");
-    const bool estimateScale = cmd.has("scale");
-    const bool estimateRotation = cmd.has("rotation");
+    const bool full = cmd.has("full");
     const bool useGpu = cmd.has("gpu");
     const double minDist = cmd.get<double>("minDist");
     const int levels = cmd.get<int>("levels");
@@ -75,7 +72,7 @@ int main(int argc, const char* argv[])
     const double minAngle = cmd.get<double>("minAngle");
     const double maxAngle = cmd.get<double>("maxAngle");
     const double angleStep = cmd.get<double>("angleStep");
-    const int maxSize = cmd.get<int>("maxSize");
+    const int maxBufSize = cmd.get<int>("maxBufSize");
 
     if (!cmd.check())
     {
@@ -86,93 +83,69 @@ int main(int argc, const char* argv[])
     Mat templ = loadImage(templName);
     Mat image = loadImage(imageName);
 
-    int method = cv::GeneralizedHough::GHT_POSITION;
-    if (estimateScale)
-        method += cv::GeneralizedHough::GHT_SCALE;
-    if (estimateRotation)
-        method += cv::GeneralizedHough::GHT_ROTATION;
+    Ptr<GeneralizedHough> alg;
+
+    if (!full)
+    {
+        Ptr<GeneralizedHoughBallard> ballard = useGpu ? gpu::createGeneralizedHoughBallard() : createGeneralizedHoughBallard();
+
+        ballard->setMinDist(minDist);
+        ballard->setLevels(levels);
+        ballard->setDp(dp);
+        ballard->setMaxBufferSize(maxBufSize);
+        ballard->setVotesThreshold(votesThreshold);
+
+        alg = ballard;
+    }
+    else
+    {
+        Ptr<GeneralizedHoughGuil> guil = useGpu ? gpu::createGeneralizedHoughGuil() : createGeneralizedHoughGuil();
+
+        guil->setMinDist(minDist);
+        guil->setLevels(levels);
+        guil->setDp(dp);
+        guil->setMaxBufferSize(maxBufSize);
+
+        guil->setMinAngle(minAngle);
+        guil->setMaxAngle(maxAngle);
+        guil->setAngleStep(angleStep);
+        guil->setAngleThresh(angleThresh);
+
+        guil->setMinScale(minScale);
+        guil->setMaxScale(maxScale);
+        guil->setScaleStep(scaleStep);
+        guil->setScaleThresh(scaleThresh);
+
+        guil->setPosThresh(posThresh);
+
+        alg = guil;
+    }
 
     vector<Vec4f> position;
-    cv::TickMeter tm;
+    TickMeter tm;
 
     if (useGpu)
     {
-        GpuMat d_templ(templ);
-        GpuMat d_image(image);
-        GpuMat d_position;
+        gpu::GpuMat d_templ(templ);
+        gpu::GpuMat d_image(image);
+        gpu::GpuMat d_position;
 
-        Ptr<gpu::GeneralizedHough> d_hough = gpu::GeneralizedHough::create(method);
-        d_hough->set("minDist", minDist);
-        d_hough->set("levels", levels);
-        d_hough->set("dp", dp);
-        d_hough->set("maxSize", maxSize);
-        if (estimateScale && estimateRotation)
-        {
-            d_hough->set("angleThresh", angleThresh);
-            d_hough->set("scaleThresh", scaleThresh);
-            d_hough->set("posThresh", posThresh);
-        }
-        else
-        {
-            d_hough->set("votesThreshold", votesThreshold);
-        }
-        if (estimateScale)
-        {
-            d_hough->set("minScale", minScale);
-            d_hough->set("maxScale", maxScale);
-            d_hough->set("scaleStep", scaleStep);
-        }
-        if (estimateRotation)
-        {
-            d_hough->set("minAngle", minAngle);
-            d_hough->set("maxAngle", maxAngle);
-            d_hough->set("angleStep", angleStep);
-        }
-
-        d_hough->setTemplate(d_templ);
+        alg->setTemplate(d_templ);
 
         tm.start();
 
-        d_hough->detect(d_image, d_position);
-        d_hough->downloadResults(d_position, position);
+        alg->detect(d_image, d_position);
+        d_position.download(position);
 
         tm.stop();
     }
     else
     {
-        Ptr<GeneralizedHough> hough = GeneralizedHough::create(method);
-        hough->set("minDist", minDist);
-        hough->set("levels", levels);
-        hough->set("dp", dp);
-        if (estimateScale && estimateRotation)
-        {
-            hough->set("angleThresh", angleThresh);
-            hough->set("scaleThresh", scaleThresh);
-            hough->set("posThresh", posThresh);
-            hough->set("maxSize", maxSize);
-        }
-        else
-        {
-            hough->set("votesThreshold", votesThreshold);
-        }
-        if (estimateScale)
-        {
-            hough->set("minScale", minScale);
-            hough->set("maxScale", maxScale);
-            hough->set("scaleStep", scaleStep);
-        }
-        if (estimateRotation)
-        {
-            hough->set("minAngle", minAngle);
-            hough->set("maxAngle", maxAngle);
-            hough->set("angleStep", angleStep);
-        }
-
-        hough->setTemplate(templ);
+        alg->setTemplate(templ);
 
         tm.start();
 
-        hough->detect(image, position);
+        alg->detect(image, position);
 
         tm.stop();
     }