diff --git a/doc/opencv.bib b/doc/opencv.bib
index ada690497d..0991d7b171 100644
--- a/doc/opencv.bib
+++ b/doc/opencv.bib
@@ -175,13 +175,6 @@
year = {1998},
publisher = {Citeseer}
}
-@book{Breiman84,
- title = {Classification and regression trees},
- author = {Breiman, Leo and Friedman, Jerome and Stone, Charles J and Olshen, Richard A},
- year = {1984},
- publisher = {CRC press},
- url = {https://projecteuclid.org/download/pdf_1/euclid.aos/1016218223}
-}
@incollection{Brox2004,
author = {Brox, Thomas and Bruhn, Andres and Papenberg, Nils and Weickert, Joachim},
title = {High accuracy optical flow estimation based on a theory for warping},
@@ -349,12 +342,6 @@
publisher = {ACM},
url = {https://www.researchgate.net/profile/Liyuan_Li/publication/221571587_Foreground_object_detection_from_videos_containing_complex_background/links/09e4150bdf566d110c000000/Foreground-object-detection-from-videos-containing-complex-background.pdf}
}
-@article{FHT98,
- author = {Friedman, Jerome and Hastie, Trevor and Tibshirani, Robert},
- title = {Additive Logistic Regression: a Statistical View of Boosting},
- year = {1998},
- url = {https://projecteuclid.org/download/pdf_1/euclid.aos/1016218223}
-}
@inproceedings{FL02,
author = {Fattal, Raanan and Lischinski, Dani and Werman, Michael},
title = {Gradient domain high dynamic range compression},
@@ -521,16 +508,6 @@
publisher = {IEEE},
url = {http://www.openrs.org/photogrammetry/2015/SGM%202008%20PAMI%20-%20Stereo%20Processing%20by%20Semiglobal%20Matching%20and%20Mutual%20Informtion.pdf}
}
-@article{HTF01,
- author = {Trevor, Hastie and Robert, Tibshirani and Jerome, Friedman},
- title = {The elements of statistical learning: data mining, inference and prediction},
- year = {2001},
- pages = {371--406},
- journal = {New York: Springer-Verlag},
- volume = {1},
- number = {8},
- url = {http://www.stat.auckland.ac.nz/~yee/784/files/ch09AdditiveModelsTrees.pdf}
-}
@article{Hartley99,
author = {Hartley, Richard I},
title = {Theory and practice of projective rectification},
@@ -602,17 +579,6 @@
number = {3},
publisher = {Elsevier}
}
-@article{Kirkpatrick83,
- author = {Kirkpatrick, S. and Gelatt, C. D. Jr and Vecchi, M. P.},
- title = {Optimization by Simulated Annealing},
- year = {1983},
- pages = {671--680},
- journal = {Science},
- volume = {220},
- number = {4598},
- publisher = {American Association for the Advancement of Science},
- url = {http://sci2s.ugr.es/sites/default/files/files/Teaching/GraduatesCourses/Metaheuristicas/Bibliography/1983-Science-Kirkpatrick-sim_anneal.pdf}
-}
@inproceedings{Kolmogorov03,
author = {Kim, Junhwan and Kolmogorov, Vladimir and Zabih, Ramin},
title = {Visual correspondence using energy minimization and mutual information},
@@ -657,16 +623,6 @@
volume = {5},
pages = {1530-1536}
}
-@article{LibSVM,
- author = {Chang, Chih-Chung and Lin, Chih-Jen},
- title = {LIBSVM: a library for support vector machines},
- year = {2011},
- pages = {27},
- journal = {ACM Transactions on Intelligent Systems and Technology (TIST)},
- volume = {2},
- number = {3},
- publisher = {ACM}
-}
@inproceedings{Lienhart02,
author = {Lienhart, Rainer and Maydt, Jochen},
title = {An extended set of haar-like features for rapid object detection},
@@ -905,14 +861,6 @@
number = {1},
publisher = {IEEE}
}
-@inproceedings{RPROP93,
- author = {Riedmiller, Martin and Braun, Heinrich},
- title = {A direct adaptive method for faster backpropagation learning: The RPROP algorithm},
- booktitle = {Neural Networks, 1993., IEEE International Conference on},
- year = {1993},
- pages = {586--591},
- publisher = {IEEE}
-}
@inproceedings{RRKB11,
author = {Rublee, Ethan and Rabaud, Vincent and Konolige, Kurt and Bradski, Gary},
title = {ORB: an efficient alternative to SIFT or SURF},
@@ -1235,14 +1183,6 @@
year = {2007},
publisher = {IEEE}
}
-@incollection{bottou2010large,
- title = {Large-scale machine learning with stochastic gradient descent},
- author = {Bottou, L{\'e}on},
- booktitle = {Proceedings of COMPSTAT'2010},
- pages = {177--186},
- year = {2010},
- publisher = {Springer}
-}
@inproceedings{Ke17,
author = {Ke, Tong and Roumeliotis, Stergios},
title = {An Efficient Algebraic Solution to the Perspective-Three-Point Problem},
diff --git a/doc/py_tutorials/py_ml/images/knnicon.png b/doc/py_tutorials/py_ml/images/knnicon.png
deleted file mode 100644
index 61e4dc040b..0000000000
Binary files a/doc/py_tutorials/py_ml/images/knnicon.png and /dev/null differ
diff --git a/doc/py_tutorials/py_ml/images/svmicon.png b/doc/py_tutorials/py_ml/images/svmicon.png
deleted file mode 100644
index 32608ee5c8..0000000000
Binary files a/doc/py_tutorials/py_ml/images/svmicon.png and /dev/null differ
diff --git a/doc/py_tutorials/py_ml/py_knn/images/knn_icon1.jpg b/doc/py_tutorials/py_ml/py_knn/images/knn_icon1.jpg
deleted file mode 100644
index 81feba514a..0000000000
Binary files a/doc/py_tutorials/py_ml/py_knn/images/knn_icon1.jpg and /dev/null differ
diff --git a/doc/py_tutorials/py_ml/py_knn/images/knn_icon2.jpg b/doc/py_tutorials/py_ml/py_knn/images/knn_icon2.jpg
deleted file mode 100644
index 13d3c77f69..0000000000
Binary files a/doc/py_tutorials/py_ml/py_knn/images/knn_icon2.jpg and /dev/null differ
diff --git a/doc/py_tutorials/py_ml/py_knn/py_knn_index.markdown b/doc/py_tutorials/py_ml/py_knn/py_knn_index.markdown
deleted file mode 100644
index 4f2c3d60b4..0000000000
--- a/doc/py_tutorials/py_ml/py_knn/py_knn_index.markdown
+++ /dev/null
@@ -1,10 +0,0 @@
-K-Nearest Neighbour {#tutorial_py_knn_index}
-===================
-
-- @subpage tutorial_py_knn_understanding
-
- Get a basic understanding of what kNN is
-
-- @subpage tutorial_py_knn_opencv
-
- Now let's use kNN in OpenCV for digit recognition OCR
diff --git a/doc/py_tutorials/py_ml/py_knn/py_knn_opencv/py_knn_opencv.markdown b/doc/py_tutorials/py_ml/py_knn/py_knn_opencv/py_knn_opencv.markdown
deleted file mode 100644
index e876ddf3e6..0000000000
--- a/doc/py_tutorials/py_ml/py_knn/py_knn_opencv/py_knn_opencv.markdown
+++ /dev/null
@@ -1,123 +0,0 @@
-OCR of Hand-written Data using kNN {#tutorial_py_knn_opencv}
-==================================
-
-Goal
-----
-
-In this chapter:
-    - We will use our knowledge of kNN to build a basic OCR (Optical Character Recognition) application.
-    - We will try our application on the digits and alphabets data that come with OpenCV.
-
-OCR of Hand-written Digits
---------------------------
-
-Our goal is to build an application which can read handwritten digits. For this we need some
-training data and some test data. OpenCV comes with an image digits.png (in the folder
-opencv/samples/data/) which has 5000 handwritten digits (500 for each digit). Each digit is
-a 20x20 image. So our first step is to split this image into 5000 different digit images. Then for each digit (20x20 image),
-we flatten it into a single row with 400 pixels. That is our feature set, i.e. intensity values of all
-pixels. It is the simplest feature set we can create. We use the first 250 samples of each digit as
-training data, and the other 250 samples as test data. So let's prepare them first.
-@code{.py}
-import numpy as np
-import cv2 as cv
-
-img = cv.imread('digits.png')
-gray = cv.cvtColor(img,cv.COLOR_BGR2GRAY)
-
-# Now we split the image to 5000 cells, each 20x20 size
-cells = [np.hsplit(row,100) for row in np.vsplit(gray,50)]
-
-# Make it into a Numpy array: its size will be (50,100,20,20)
-x = np.array(cells)
-
-# Now we prepare the training data and test data
-train = x[:,:50].reshape(-1,400).astype(np.float32) # Size = (2500,400)
-test = x[:,50:100].reshape(-1,400).astype(np.float32) # Size = (2500,400)
-
-# Create labels for train and test data
-k = np.arange(10)
-train_labels = np.repeat(k,250)[:,np.newaxis]
-test_labels = train_labels.copy()
-
-# Initiate kNN, train it on the training data, then test it with the test data with k=5
-knn = cv.ml.KNearest_create()
-knn.train(train, cv.ml.ROW_SAMPLE, train_labels)
-ret,result,neighbours,dist = knn.findNearest(test,k=5)
-
-# Now we check the accuracy of classification
-# For that, compare the result with test_labels and check which are wrong
-matches = result==test_labels
-correct = np.count_nonzero(matches)
-accuracy = correct*100.0/result.size
-print( accuracy )
-@endcode
-So our basic OCR app is ready. This particular example gave me an accuracy of 91%. One option to
-improve accuracy is to add more data for training, especially for the digits where we had more errors.
-
-Instead of preparing
-this training data every time the application starts, it is better to save it, so that next time we can directly
-read this data from a file and start classifying. This can be done with the help of some NumPy
-functions like np.savetxt, np.savez, np.load, etc. Please check the NumPy docs for more details.
-@code{.py}
-# Save the data
-np.savez('knn_data.npz',train=train, train_labels=train_labels)
-
-# Now load the data
-with np.load('knn_data.npz') as data:
- print( data.files )
- train = data['train']
- train_labels = data['train_labels']
-@endcode
-On my system, the saved file takes around 4.4 MB. Since we are using intensity values (uint8 data) as
-features, it is better to convert the data to np.uint8 first and then save it; the file takes only
-about 1.1 MB in this case. Then, while loading, you can convert back to float32.
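-
-For example, a minimal sketch (reusing the `train` and `train_labels` arrays from above) of saving the data as uint8 and restoring float32 after loading could look like this:
-@code{.py}
-# Save the data as uint8 to reduce file size (the intensities and labels fit in 8 bits)
-np.savez('knn_data.npz', train=train.astype(np.uint8), train_labels=train_labels.astype(np.uint8))
-
-# Load the data and convert back to float32 before training
-with np.load('knn_data.npz') as data:
-    train = data['train'].astype(np.float32)
-    train_labels = data['train_labels'].astype(np.float32)
-@endcode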
-
-OCR of the English Alphabet
-------------------------
-
-Next we will do the same for the English alphabet, but there is a slight change in data and feature
-set. Here, instead of images, OpenCV comes with a data file, letter-recognition.data in
-opencv/samples/cpp/ folder. If you open it, you will see 20000 lines which may, on first sight, look
-like garbage. Actually, in each row, the first column is a letter which is our label. The next 16 numbers
-following it are the different features. These features are obtained from the [UCI Machine Learning
-Repository](http://archive.ics.uci.edu/ml/). You can find the details of these features in [this
-page](http://archive.ics.uci.edu/ml/datasets/Letter+Recognition).
-
-There are 20000 samples available, so we take the first 10000 as training samples and the remaining
-10000 as test samples. We need to convert the letters to numeric values (here, each letter's offset
-from 'A') because we can't work with the letters directly.
-@code{.py}
-import cv2 as cv
-import numpy as np
-
-# Load the data and convert the letters to numbers
-data= np.loadtxt('letter-recognition.data', dtype= 'float32', delimiter = ',',
- converters= {0: lambda ch: ord(ch)-ord('A')})
-
-# Split the dataset in two, with 10000 samples each for training and test sets
-train, test = np.vsplit(data,2)
-
-# Split trainData and testData into features and responses
-responses, trainData = np.hsplit(train,[1])
-labels, testData = np.hsplit(test,[1])
-
-# Initiate the kNN, classify, measure accuracy
-knn = cv.ml.KNearest_create()
-knn.train(trainData, cv.ml.ROW_SAMPLE, responses)
-ret, result, neighbours, dist = knn.findNearest(testData, k=5)
-
-correct = np.count_nonzero(result == labels)
-accuracy = correct*100.0/10000
-print( accuracy )
-@endcode
-It gives me an accuracy of 93.22%. Again, if you want to increase accuracy, you can iteratively add
-more data.
-
-Additional Resources
---------------------
-1. [Wikipedia article on Optical character recognition](https://en.wikipedia.org/wiki/Optical_character_recognition)
-
-Exercises
----------
-1. Here we used k=5. What happens if you try other values of k? Can you find a value that maximizes accuracy (minimizes the number of errors)?
\ No newline at end of file
diff --git a/doc/py_tutorials/py_ml/py_knn/py_knn_understanding/images/knn_simple.png b/doc/py_tutorials/py_ml/py_knn/py_knn_understanding/images/knn_simple.png
deleted file mode 100644
index cb3744e517..0000000000
Binary files a/doc/py_tutorials/py_ml/py_knn/py_knn_understanding/images/knn_simple.png and /dev/null differ
diff --git a/doc/py_tutorials/py_ml/py_knn/py_knn_understanding/images/knn_theory.png b/doc/py_tutorials/py_ml/py_knn/py_knn_understanding/images/knn_theory.png
deleted file mode 100644
index 9d1abdded4..0000000000
Binary files a/doc/py_tutorials/py_ml/py_knn/py_knn_understanding/images/knn_theory.png and /dev/null differ
diff --git a/doc/py_tutorials/py_ml/py_knn/py_knn_understanding/py_knn_understanding.markdown b/doc/py_tutorials/py_ml/py_knn/py_knn_understanding/py_knn_understanding.markdown
deleted file mode 100644
index 5985cdd559..0000000000
--- a/doc/py_tutorials/py_ml/py_knn/py_knn_understanding/py_knn_understanding.markdown
+++ /dev/null
@@ -1,150 +0,0 @@
-Understanding k-Nearest Neighbour {#tutorial_py_knn_understanding}
-=================================
-
-Goal
-----
-
-In this chapter, we will understand the concepts of the k-Nearest Neighbour (kNN) algorithm.
-
-Theory
-------
-
-kNN is one of the simplest classification algorithms available for supervised learning. The idea
-is to search for the closest match(es) of the test data in the feature space. We will look into it with the
-image below.
-
-
-
-In the image, there are two families: Blue Squares and Red Triangles. We refer to each family as
-a **Class**. Their houses are shown in their town map which we call the **Feature Space**. You can consider
-a feature space as a space where all data are projected. For example, consider a 2D coordinate
-space. Each datum has two features, an x coordinate and a y coordinate. You can represent this datum in your 2D
-coordinate space, right? Now imagine that there are three features: you will need a 3D space. Now consider N
-features: you need N-dimensional space, right? This N-dimensional space is its feature space.
-In our image, you can consider it as a 2D case with two features.
-
-Now consider what happens if a new member comes into the town and creates a new home, which is shown as the green circle. He
-should be added to one of these Blue or Red families (or *classes*). We call that process, **Classification**. How exactly should this new member be classified? Since we are dealing with kNN, let us apply the algorithm.
-
-One simple method is to check who is his nearest neighbour. From the image, it is clear that it is a member of the Red
-Triangle family. So he is classified as a Red Triangle. This method is called simply **Nearest Neighbour** classification, because classification depends only on the *nearest neighbour*.
-
-But there is a problem with this approach! Red Triangle may be the nearest neighbour, but what if there are also a lot of Blue
-Squares nearby? Then Blue Squares have more strength in that locality than Red Triangles, so
-just checking the nearest one is not sufficient. Instead we may want to check some **k** nearest families. Then the new member is assigned to whichever family forms the majority among them. In our image, let's take k=3, i.e. consider the 3 nearest
-neighbours. The new member has two Red neighbours and one Blue neighbour (there are two Blues equidistant, but since k=3, we can take only
-one of them), so again he should be added to the Red family. But what if we take k=7? Then he has 5 Blue
-neighbours and 2 Red neighbours and should be added to the Blue family. The result will vary with the selected
-value of k. Note that if k is not an odd number, we can get a tie, as would happen in the above case with k=4. We would see that our new member has 2 Red and 2 Blue neighbours as his four nearest neighbours and we would need to choose a method for breaking the tie to perform classification. So to reiterate, this method is called **k-Nearest Neighbour** since
-classification depends on the *k nearest neighbours*.
-
-Again, in kNN, it is true we are considering k neighbours, but we are giving equal importance to
-all, right? Is this justified? For example, take the tied case of k=4. As we can see, the 2
-Red neighbours are actually closer to the new member than the other 2 Blue neighbours, so he is more eligible to be
-added to the Red family. How do we mathematically explain that? We give some weights to each neighbour
-depending on their distance to the new-comer: those who are nearer to him get higher weights, while
-those that are farther away get lower weights. Then we add the total weights of each family separately and classify the new-comer as part of whichever family
-receives the higher total weight. This is called **modified kNN** or **weighted kNN**.
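-
-As an illustrative sketch (this is not how OpenCV implements it; it is just NumPy, assuming `trainData` is an (N,2) float32 array, `labels` a 1-D array of class ids, and `x` a query point), distance-weighted voting could look like this:
-@code{.py}
-import numpy as np
-
-def weighted_knn(x, trainData, labels, k=4):
-    # Euclidean distances from the query point to every training sample
-    dists = np.linalg.norm(trainData - x, axis=1)
-    nearest = np.argsort(dists)[:k]
-    # Closer neighbours get larger weights (small epsilon avoids division by zero)
-    weights = 1.0 / (dists[nearest] + 1e-8)
-    # Sum the weights per class and return the class with the largest total
-    classes = np.unique(labels[nearest])
-    totals = [weights[labels[nearest] == c].sum() for c in classes]
-    return classes[int(np.argmax(totals))]
-@endcode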
-
-So what are some important things you see here?
-
-- Because we have to check
-  the distance from the new-comer to all the existing houses to find the nearest neighbour(s), we need to have information about all of the houses in town, right? If there are plenty of houses and families, this takes a lot of memory and also more time for computation.
-- There is almost zero time for any kind of "training" or preparation. Our "learning" involves only memorizing (storing) the data, before testing and classifying.
-
-Now let's see this algorithm at work in OpenCV.
-
-kNN in OpenCV
--------------
-
-We will do a simple example here, with two families (classes), just like above. Then in the next
-chapter, we will do an even better example.
-
-So here, we label the Red family as **Class-0** (so denoted by 0) and Blue family as **Class-1**
-(denoted by 1). We create 25 training samples (our neighbours) and label each of them as belonging to either Class-0 or Class-1.
-We can do this with the help of a Random Number Generator from NumPy.
-
-Then we can plot it with the help of Matplotlib. Red neighbours are shown as Red Triangles and Blue
-neighbours are shown as Blue Squares.
-@code{.py}
-import cv2 as cv
-import numpy as np
-import matplotlib.pyplot as plt
-
-# Feature set containing (x,y) values of 25 known/training data
-trainData = np.random.randint(0,100,(25,2)).astype(np.float32)
-
-# Label each one either Red or Blue with numbers 0 and 1
-responses = np.random.randint(0,2,(25,1)).astype(np.float32)
-
-# Take Red neighbours and plot them
-red = trainData[responses.ravel()==0]
-plt.scatter(red[:,0],red[:,1],80,'r','^')
-
-# Take Blue neighbours and plot them
-blue = trainData[responses.ravel()==1]
-plt.scatter(blue[:,0],blue[:,1],80,'b','s')
-
-plt.show()
-@endcode
-You will get something similar to our first image. Since you are using a random number generator, you
-will get different data each time you run the code.
-
-Next, initiate the kNN algorithm and pass the trainData and responses to train the kNN. (Under the hood, it constructs
-a search tree; see the Additional Resources section below for more information on this.)
-
-Then we will bring one new-comer and classify him as belonging to a family with the help of kNN in OpenCV. Before
-running kNN, we need to know something about our test data (data of new comers). Our data should be a
-floating point array with size \f$number \; of \; testdata \times number \; of \; features\f$. Then we
-find the nearest neighbours of the new-comer. We can specify *k*: how many neighbours we want. (Here we used 3.) It returns:
-
-1. The label given to the new-comer depending upon the kNN theory we saw earlier. If you want the *Nearest
- Neighbour* algorithm, just specify k=1.
-2. The labels of the k-Nearest Neighbours.
-3. The corresponding distances from the new-comer to each nearest neighbour.
-
-So let's see how it works. The new-comer is marked in green.
-@code{.py}
-newcomer = np.random.randint(0,100,(1,2)).astype(np.float32)
-plt.scatter(newcomer[:,0],newcomer[:,1],80,'g','o')
-
-knn = cv.ml.KNearest_create()
-knn.train(trainData, cv.ml.ROW_SAMPLE, responses)
-ret, results, neighbours ,dist = knn.findNearest(newcomer, 3)
-
-print( "result: {}\n".format(results) )
-print( "neighbours: {}\n".format(neighbours) )
-print( "distance: {}\n".format(dist) )
-
-plt.show()
-@endcode
-I got the following results:
-@code{.py}
-result: [[ 1.]]
-neighbours: [[ 1. 1. 1.]]
-distance: [[ 53. 58. 61.]]
-@endcode
-It says that our new-comer's 3 nearest neighbours are all from the Blue family. Therefore, he is labelled as part of the Blue
-family. It is obvious from the plot below:
-
-
-
-If you have multiple new-comers (test data), you can just pass them as an array. Corresponding results are also
-obtained as arrays.
-@code{.py}
-# 10 new-comers
-newcomers = np.random.randint(0,100,(10,2)).astype(np.float32)
-ret, results, neighbours, dist = knn.findNearest(newcomers, 3)
-# The results array will now contain 10 labels.
-@endcode
-Additional Resources
---------------------
-
-1. [NPTEL notes on Pattern Recognition, Chapter
- 11](https://nptel.ac.in/courses/106108057)
-2. [Wikipedia article on Nearest neighbor search](https://en.wikipedia.org/wiki/Nearest_neighbor_search)
-3. [Wikipedia article on k-d tree](https://en.wikipedia.org/wiki/K-d_tree)
-
-Exercises
----------
-1. Try repeating the above with more classes and different choices of k. Does choosing k become harder with more classes in the same 2D feature space?
\ No newline at end of file
diff --git a/doc/py_tutorials/py_ml/py_svm/images/svm_icon1.jpg b/doc/py_tutorials/py_ml/py_svm/images/svm_icon1.jpg
deleted file mode 100644
index 9bb1238087..0000000000
Binary files a/doc/py_tutorials/py_ml/py_svm/images/svm_icon1.jpg and /dev/null differ
diff --git a/doc/py_tutorials/py_ml/py_svm/images/svm_icon2.jpg b/doc/py_tutorials/py_ml/py_svm/images/svm_icon2.jpg
deleted file mode 100644
index dd13e9d7f3..0000000000
Binary files a/doc/py_tutorials/py_ml/py_svm/images/svm_icon2.jpg and /dev/null differ
diff --git a/doc/py_tutorials/py_ml/py_svm/py_svm_basics/images/svm_basics1.png b/doc/py_tutorials/py_ml/py_svm/py_svm_basics/images/svm_basics1.png
deleted file mode 100644
index 93ae457c0c..0000000000
Binary files a/doc/py_tutorials/py_ml/py_svm/py_svm_basics/images/svm_basics1.png and /dev/null differ
diff --git a/doc/py_tutorials/py_ml/py_svm/py_svm_basics/images/svm_basics2.png b/doc/py_tutorials/py_ml/py_svm/py_svm_basics/images/svm_basics2.png
deleted file mode 100644
index d4522f0479..0000000000
Binary files a/doc/py_tutorials/py_ml/py_svm/py_svm_basics/images/svm_basics2.png and /dev/null differ
diff --git a/doc/py_tutorials/py_ml/py_svm/py_svm_basics/images/svm_basics3.png b/doc/py_tutorials/py_ml/py_svm/py_svm_basics/images/svm_basics3.png
deleted file mode 100644
index 1379a56d52..0000000000
Binary files a/doc/py_tutorials/py_ml/py_svm/py_svm_basics/images/svm_basics3.png and /dev/null differ
diff --git a/doc/py_tutorials/py_ml/py_svm/py_svm_basics/py_svm_basics.markdown b/doc/py_tutorials/py_ml/py_svm/py_svm_basics/py_svm_basics.markdown
deleted file mode 100644
index 55f74237e9..0000000000
--- a/doc/py_tutorials/py_ml/py_svm/py_svm_basics/py_svm_basics.markdown
+++ /dev/null
@@ -1,134 +0,0 @@
-Understanding SVM {#tutorial_py_svm_basics}
-=================
-
-Goal
-----
-
-In this chapter
-    - We will get an intuitive understanding of SVM
-
-Theory
-------
-
-### Linearly Separable Data
-
-Consider the image below which has two types of data, red and blue. In kNN, for a test sample we
-measured its distance to all the training samples and took the one with the minimum distance. It takes
-plenty of time to measure all the distances and plenty of memory to store all the training samples.
-But considering the data shown in the image, do we really need all that?
-
-
-
-Consider another idea. We find a line, \f$f(x)=ax_1+bx_2+c\f$, which divides the data into two
-regions. When we get a new test sample \f$X\f$, we just substitute it into \f$f(x)\f$. If \f$f(X) > 0\f$, it belongs
-to the blue group, else it belongs to the red group. We call this line the **Decision Boundary**. It is
-very simple and memory-efficient. Data which can be divided into two groups with a straight line (or a
-hyperplane in higher dimensions) is called **Linearly Separable**.
-
-In the above image you can see that plenty of such lines are possible. Which one should we take? Very
-intuitively, we can say that the line should pass as far as possible from all the points. Why?
-Because there can be noise in the incoming data, and this noise should not affect the classification
-accuracy. So choosing the line that is farthest away provides more immunity against noise. What SVM does is
-find a straight line (or hyperplane) with the largest minimum distance to the training samples. See the
-bold line in the image below passing through the center.
-
-
-
-So to find this decision boundary, you need training data. Do you need all of it? No. The samples which
-are close to the opposite group are sufficient. In our image, they are the one blue filled circle
-and the two red filled squares. We call them **Support Vectors**, and the lines passing through them
-are called **Support Planes**. They are adequate for finding our decision boundary, so we need not
-worry about all the data. This helps in data reduction.
-
-What happens is that first two hyperplanes are found which best represent the data. For example, blue data
-is represented by \f$w^Tx+b_0 > 1\f$ while red data is represented by \f$w^Tx+b_0 < -1\f$, where \f$w\f$ is the
-**weight vector** ( \f$w=[w_1, w_2,..., w_n]\f$) and \f$x\f$ is the feature vector
-(\f$x = [x_1,x_2,..., x_n]\f$). \f$b_0\f$ is the **bias**. The weight vector decides the orientation of the decision
-boundary, while the bias decides its location. The decision boundary is defined to be midway
-between these hyperplanes, and is expressed as \f$w^Tx+b_0 = 0\f$. The minimum distance from a support vector
-to the decision boundary is given by \f$distance_{support \, vectors}=\frac{1}{||w||}\f$. The margin is
-twice this distance, and we need to maximize this margin, i.e. we need to minimize a new function
-\f$L(w, b_0)\f$ with some constraints, which can be expressed as:
-
-\f[\min_{w, b_0} L(w, b_0) = \frac{1}{2}||w||^2 \; \text{subject to} \; t_i(w^Tx+b_0) \geq 1 \; \forall i\f]
-
-where \f$t_i\f$ is the class label of each sample, \f$t_i \in \{-1,1\}\f$.
-
-### Non-Linearly Separable Data
-
-Consider some data which can't be divided into two groups with a straight line. For example, consider
-one-dimensional data where 'X' is at -3 & +3 and 'O' is at -1 & +1. Clearly it is not linearly
-separable. But there are methods to solve these kinds of problems. If we map this data set with
-the function \f$f(x) = x^2\f$, we get 'X' at 9 and 'O' at 1, which are linearly separable.
-
-Alternatively, we can convert this one-dimensional data to two-dimensional data. We can use the
-function \f$f(x)=(x,x^2)\f$ to map this data. Then 'X' becomes (-3,9) and (3,9) while 'O' becomes (-1,1) and (1,1).
-This is also linearly separable. In short, data that is not linearly separable in a
-lower-dimensional space has a better chance of becoming linearly separable in a higher-dimensional space.
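-
-A tiny numeric check of this mapping (illustrative only):
-@code{.py}
-import numpy as np
-
-X = np.array([-3.0, 3.0])   # samples of class 'X'
-O = np.array([-1.0, 1.0])   # samples of class 'O'
-
-# After mapping with f(x) = x^2 the classes can be separated by a single threshold, e.g. 5
-print(X**2)   # [9. 9.]
-print(O**2)   # [1. 1.]
-@endcode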
-
-In general, it is possible to map points in a d-dimensional space to some D-dimensional space
-\f$(D>d)\f$ to check the possibility of linear separability. There is an idea which helps us compute the
-dot product in the high-dimensional (kernel) space by performing computations in the low-dimensional
-input (feature) space. We can illustrate this with the following example.
-
-Consider two points in two-dimensional space, \f$p=(p_1,p_2)\f$ and \f$q=(q_1,q_2)\f$. Let \f$\phi\f$ be a
-mapping function which maps a two-dimensional point to three-dimensional space as follows:
-
-\f[\phi (p) = (p_{1}^2,p_{2}^2,\sqrt{2} p_1 p_2), \quad \phi (q) = (q_{1}^2,q_{2}^2,\sqrt{2} q_1 q_2)\f]
-
-Let us define a kernel function \f$K(p,q)\f$ which does a dot product between two points, shown below:
-
-\f[
-\begin{aligned}
-K(p,q) = \phi(p).\phi(q) &= \phi(p)^T \phi(q) \\
- &= (p_{1}^2,p_{2}^2,\sqrt{2} p_1 p_2).(q_{1}^2,q_{2}^2,\sqrt{2} q_1 q_2) \\
- &= p_{1}^2 q_{1}^2 + p_{2}^2 q_{2}^2 + 2 p_1 q_1 p_2 q_2 \\
- &= (p_1 q_1 + p_2 q_2)^2 \\
- \phi(p).\phi(q) &= (p.q)^2
-\end{aligned}
-\f]
-
-This means that a dot product in the three-dimensional space can be computed as the squared dot product in
-the two-dimensional space. The same trick applies to even higher-dimensional spaces, so we can compute
-dot products in a higher-dimensional feature space while working only with the lower-dimensional
-inputs.
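-
-A minimal NumPy check of this identity (illustrative only, with arbitrary example points):
-@code{.py}
-import numpy as np
-
-p = np.array([1.0, 2.0])
-q = np.array([3.0, 4.0])
-
-def phi(v):
-    # map a 2D point to 3D as defined above
-    return np.array([v[0]**2, v[1]**2, np.sqrt(2) * v[0] * v[1]])
-
-# Dot product in the mapped 3D space equals the squared dot product in 2D
-print(np.dot(phi(p), phi(q)))   # 121.0
-print(np.dot(p, q)**2)          # 121.0
-@endcode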
-
-In addition to all these concepts, there is the problem of misclassification. Just finding the
-decision boundary with maximum margin is not sufficient; we also need to consider
-misclassification errors. Sometimes it may be possible to find a decision boundary with a smaller
-margin, but with fewer misclassifications. We therefore need to modify our model so that it
-finds the decision boundary with maximum margin, but with fewer misclassifications. The minimization
-criterion is modified as:
-
-\f[min \; ||w||^2 + C(distance \; of \; misclassified \; samples \; to \; their \; correct \; regions)\f]
-
-The image below shows this concept. For each sample of the training data a new parameter \f$\xi_i\f$ is
-defined. It is the distance from the corresponding training sample to its correct decision region.
-Samples that are not misclassified fall on their corresponding support planes, so their
-distance is zero.
-
-
-
-So the new optimization problem is :
-
-\f[\min_{w, b_{0}} L(w,b_0) = ||w||^{2} + C \sum_{i} {\xi_{i}} \text{ subject to } y_{i}(w^{T} x_{i} + b_{0}) \geq 1 - \xi_{i} \text{ and } \xi_{i} \geq 0 \text{ } \forall i\f]
-
-How should the parameter C be chosen? It is obvious that the answer to this question depends on how
-the training data is distributed. Although there is no general answer, it is useful to take into
-account these rules (see the short sketch after this list for where C appears in the OpenCV API):
-
-- Large values of C give solutions with fewer misclassification errors but a smaller margin.
-  Consider that in this case it is expensive to make misclassification errors. Since the aim of
-  the optimization is to minimize the argument, few misclassification errors are allowed.
-- Small values of C give solutions with a bigger margin and more classification errors. In this
-  case the minimization does not weight the sum term as heavily, so it focuses more on
-  finding a hyperplane with a big margin.
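-
-As a brief sketch of where this parameter appears in practice (illustrative values only; the actual OpenCV usage is covered in the next chapter):
-@code{.py}
-import cv2 as cv
-
-svm = cv.ml.SVM_create()
-svm.setType(cv.ml.SVM_C_SVC)      # C-Support Vector Classification
-svm.setKernel(cv.ml.SVM_LINEAR)   # or cv.ml.SVM_RBF, etc.
-svm.setC(1.0)                     # the trade-off parameter C discussed above
-# svm.train(trainData, cv.ml.ROW_SAMPLE, responses)
-@endcode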
-
-Additional Resources
---------------------
-
--# [NPTEL notes on Statistical Pattern Recognition, Chapters
-   25-29](https://nptel.ac.in/courses/117108048)
-
-Exercises
----------
diff --git a/doc/py_tutorials/py_ml/py_svm/py_svm_index.markdown b/doc/py_tutorials/py_ml/py_svm/py_svm_index.markdown
deleted file mode 100644
index dc737e97a0..0000000000
--- a/doc/py_tutorials/py_ml/py_svm/py_svm_index.markdown
+++ /dev/null
@@ -1,10 +0,0 @@
-Support Vector Machines (SVM) {#tutorial_py_svm_index}
-=============================
-
-- @subpage tutorial_py_svm_basics
-
- Get a basic understanding of what SVM is
-
-- @subpage tutorial_py_svm_opencv
-
- Let's use SVM functionalities in OpenCV
diff --git a/doc/py_tutorials/py_ml/py_svm/py_svm_opencv/images/deskew.jpg b/doc/py_tutorials/py_ml/py_svm/py_svm_opencv/images/deskew.jpg
deleted file mode 100644
index 32c22b7afe..0000000000
Binary files a/doc/py_tutorials/py_ml/py_svm/py_svm_opencv/images/deskew.jpg and /dev/null differ
diff --git a/doc/py_tutorials/py_ml/py_svm/py_svm_opencv/py_svm_opencv.markdown b/doc/py_tutorials/py_ml/py_svm/py_svm_opencv/py_svm_opencv.markdown
deleted file mode 100644
index 8ec36dfc2f..0000000000
--- a/doc/py_tutorials/py_ml/py_svm/py_svm_opencv/py_svm_opencv.markdown
+++ /dev/null
@@ -1,56 +0,0 @@
-OCR of Hand-written Data using SVM {#tutorial_py_svm_opencv}
-==================================
-
-Goal
-----
-
-In this chapter
-
-- We will revisit the hand-written data OCR, but, with SVM instead of kNN.
-
-OCR of Hand-written Digits
---------------------------
-
-In kNN, we directly used pixel intensity as the feature vector. This time we will use [Histogram of
-Oriented Gradients](http://en.wikipedia.org/wiki/Histogram_of_oriented_gradients) (HOG) as feature
-vectors.
-
-Here, before finding the HOG, we deskew the image using its second order moments. So we first define
-a function **deskew()** which takes a digit image and deskews it. Below is the deskew() function:
-
-@snippet samples/python/tutorial_code/ml/py_svm_opencv/hogsvm.py deskew
-
-The image below shows the above deskew function applied to an image of a zero. The left image is the
-original and the right image is the deskewed version.
-
-
-
-Next we have to find the HOG descriptor of each cell. For that, we find the Sobel derivatives of each
-cell in the X and Y directions, then find the gradient magnitude and direction at each pixel. The
-gradient direction is quantized to 16 integer values. Next, divide the image into four sub-squares. For each
-sub-square, calculate the histogram of directions (16 bins) weighted by the gradient magnitude. So each
-sub-square gives you a vector containing 16 values. Four such vectors (one per sub-square) together
-give us a feature vector containing 64 values. This is the feature vector we use to train our data.
-
-@snippet samples/python/tutorial_code/ml/py_svm_opencv/hogsvm.py hog
-
-Finally, as in the previous case, we start by splitting our big dataset into individual cells. For
-every digit, 250 cells are reserved for training and the remaining 250 are reserved for
-testing. The full code is given below; you can also download it from [here](https://github.com/opencv/opencv/tree/5.x/samples/python/tutorial_code/ml/py_svm_opencv/hogsvm.py):
-
-@include samples/python/tutorial_code/ml/py_svm_opencv/hogsvm.py
-
-This particular technique gave me nearly 94% accuracy. You can try different values for various
-parameters of SVM to check if higher accuracy is possible. Or you can read technical papers on this
-area and try to implement them.
-
-Additional Resources
---------------------
-
--# [Histograms of Oriented Gradients Video](https://www.youtube.com/watch?v=0Zib1YEE4LU)
-
-Exercises
----------
-
--# OpenCV samples contain digits.py which applies a slight improvement of the above method to get
-   an improved result. It also contains the reference. Check it and try to understand it.
diff --git a/doc/py_tutorials/py_ml/py_table_of_contents_ml.markdown b/doc/py_tutorials/py_ml/py_table_of_contents_ml.markdown
index 67c988ae23..d9b9efd26b 100644
--- a/doc/py_tutorials/py_ml/py_table_of_contents_ml.markdown
+++ b/doc/py_tutorials/py_ml/py_table_of_contents_ml.markdown
@@ -1,15 +1,6 @@
Machine Learning {#tutorial_py_table_of_contents_ml}
================
-- @subpage tutorial_py_knn_index
-
- Learn to use kNN for classification
- Plus learn about handwritten digit recognition using kNN
-
-- @subpage tutorial_py_svm_index
-
- Understand concepts of SVM
-
- @subpage tutorial_py_kmeans_index
Learn to use K-Means Clustering to group data to a number of clusters.
diff --git a/doc/tutorials/others/_old/table_of_content_ml.markdown b/doc/tutorials/others/_old/table_of_content_ml.markdown
deleted file mode 100644
index 5999b0208a..0000000000
--- a/doc/tutorials/others/_old/table_of_content_ml.markdown
+++ /dev/null
@@ -1,4 +0,0 @@
-Machine Learning (ml module) {#tutorial_table_of_content_ml}
-============================
-
-Content has been moved to this page: @ref tutorial_table_of_content_other
diff --git a/doc/tutorials/others/barcode_detect_and_decode.markdown b/doc/tutorials/others/barcode_detect_and_decode.markdown
index edfe9b8c10..f1ea1b51de 100644
--- a/doc/tutorials/others/barcode_detect_and_decode.markdown
+++ b/doc/tutorials/others/barcode_detect_and_decode.markdown
@@ -4,7 +4,7 @@ Barcode Recognition {#tutorial_barcode_detect_and_decode}
@tableofcontents
@prev_tutorial{tutorial_traincascade}
-@next_tutorial{tutorial_introduction_to_svm}
+@next_tutorial{tutorial_introduction_to_pca}
| | |
| -: | :- |
diff --git a/doc/tutorials/others/images/optimal-hyperplane.png b/doc/tutorials/others/images/optimal-hyperplane.png
deleted file mode 100644
index d4522f0479..0000000000
Binary files a/doc/tutorials/others/images/optimal-hyperplane.png and /dev/null differ
diff --git a/doc/tutorials/others/images/sample-errors-dist.png b/doc/tutorials/others/images/sample-errors-dist.png
deleted file mode 100644
index 1379a56d52..0000000000
Binary files a/doc/tutorials/others/images/sample-errors-dist.png and /dev/null differ
diff --git a/doc/tutorials/others/images/separating-lines.png b/doc/tutorials/others/images/separating-lines.png
deleted file mode 100644
index 93ae457c0c..0000000000
Binary files a/doc/tutorials/others/images/separating-lines.png and /dev/null differ
diff --git a/doc/tutorials/others/images/svm_intro_result.png b/doc/tutorials/others/images/svm_intro_result.png
deleted file mode 100644
index 5f477a4f69..0000000000
Binary files a/doc/tutorials/others/images/svm_intro_result.png and /dev/null differ
diff --git a/doc/tutorials/others/images/svm_non_linear_result.png b/doc/tutorials/others/images/svm_non_linear_result.png
deleted file mode 100644
index bfecae9a1b..0000000000
Binary files a/doc/tutorials/others/images/svm_non_linear_result.png and /dev/null differ
diff --git a/doc/tutorials/others/introduction_to_pca.markdown b/doc/tutorials/others/introduction_to_pca.markdown
index 77ae0522a3..4b8745fbc8 100644
--- a/doc/tutorials/others/introduction_to_pca.markdown
+++ b/doc/tutorials/others/introduction_to_pca.markdown
@@ -3,7 +3,7 @@ Introduction to Principal Component Analysis (PCA) {#tutorial_introduction_to_pc
@tableofcontents
-@prev_tutorial{tutorial_non_linear_svms}
+@prev_tutorial{tutorial_barcode_detect_and_decode}
| | |
| -: | :- |
diff --git a/doc/tutorials/others/introduction_to_svm.markdown b/doc/tutorials/others/introduction_to_svm.markdown
deleted file mode 100644
index 11c9fbaf78..0000000000
--- a/doc/tutorials/others/introduction_to_svm.markdown
+++ /dev/null
@@ -1,273 +0,0 @@
-Introduction to Support Vector Machines {#tutorial_introduction_to_svm}
-=======================================
-
-@tableofcontents
-
-@prev_tutorial{tutorial_barcode_detect_and_decode}
-@next_tutorial{tutorial_non_linear_svms}
-
-| | |
-| -: | :- |
-| Original author | Fernando Iglesias García |
-| Compatibility | OpenCV >= 3.0 |
-
-Goal
-----
-
-In this tutorial you will learn how to:
-
-- Use the OpenCV functions @ref cv::ml::SVM::train to build a classifier based on SVMs and @ref
- cv::ml::SVM::predict to test its performance.
-
-What is a SVM?
---------------
-
-A Support Vector Machine (SVM) is a discriminative classifier formally defined by a separating
-hyperplane. In other words, given labeled training data (*supervised learning*), the algorithm
-outputs an optimal hyperplane which categorizes new examples.
-
-In which sense is the hyperplane obtained optimal? Let's consider the following simple problem:
-
-For a linearly separable set of 2D-points which belong to one of two classes, find a separating
-straight line.
-
-
-
-@note In this example we deal with lines and points in the Cartesian plane instead of hyperplanes
-and vectors in a high dimensional space. This is a simplification of the problem. It is important to
-understand that this is done only because our intuition is better built from examples that are easy
-to imagine. However, the same concepts apply to tasks where the examples to classify lie in a space
-whose dimension is higher than two.
-
-In the above picture you can see that there exist multiple lines that offer a solution to the
-problem. Is any of them better than the others? We can intuitively define a criterion to estimate
-the worth of the lines: A line is bad if it passes too close to the points because it will be
-noise sensitive and it will not generalize correctly. Therefore, our goal should be to find
-the line passing as far as possible from all points.
-
-Then, the operation of the SVM algorithm is based on finding the hyperplane that gives the largest
-minimum distance to the training examples. Twice this distance is known as the
-**margin** in SVM theory. Therefore, the optimal separating hyperplane *maximizes* the margin
-of the training data.
-
-
-
-How is the optimal hyperplane computed?
----------------------------------------
-
-Let's introduce the notation used to define formally a hyperplane:
-
-\f[f(x) = \beta_{0} + \beta^{T} x,\f]
-
-where \f$\beta\f$ is known as the *weight vector* and \f$\beta_{0}\f$ as the *bias*.
-
-@note A more in-depth description of this and of hyperplanes can be found in section 4.5 (*Separating
-Hyperplanes*) of the book: *Elements of Statistical Learning* by T. Hastie, R. Tibshirani and J. H.
-Friedman (@cite HTF01).
-
-The optimal hyperplane can be represented in an infinite number of different ways by
-scaling of \f$\beta\f$ and \f$\beta_{0}\f$. As a matter of convention, among all the possible
-representations of the hyperplane, the one chosen is
-
-\f[|\beta_{0} + \beta^{T} x| = 1\f]
-
-where \f$x\f$ symbolizes the training examples closest to the hyperplane. In general, the training
-examples that are closest to the hyperplane are called **support vectors**. This representation is
-known as the **canonical hyperplane**.
-
-Now, we use the result of geometry that gives the distance between a point \f$x\f$ and a hyperplane
-\f$(\beta, \beta_{0})\f$:
-
-\f[\mathrm{distance} = \frac{|\beta_{0} + \beta^{T} x|}{||\beta||}.\f]
-
-In particular, for the canonical hyperplane, the numerator is equal to one and the distance to the
-support vectors is
-
-\f[\mathrm{distance}_{\text{ support vectors}} = \frac{|\beta_{0} + \beta^{T} x|}{||\beta||} = \frac{1}{||\beta||}.\f]
-
-Recall that the margin introduced in the previous section, here denoted as \f$M\f$, is twice the
-distance to the closest examples:
-
-\f[M = \frac{2}{||\beta||}\f]
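-
-For a concrete 2D illustration, take \f$\beta = (2, 0)^{T}\f$ and \f$\beta_{0} = 0\f$ (values chosen only for
-this example). The canonical condition \f$|\beta_{0} + \beta^{T} x| = 1\f$ is satisfied by points with
-\f$x_{1} = \pm 1/2\f$, which lie at a distance of \f$1/||\beta|| = 1/2\f$ from the hyperplane \f$x_{1} = 0\f$,
-so the margin is \f$M = 2/||\beta|| = 1\f$.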
-
-Finally, the problem of maximizing \f$M\f$ is equivalent to the problem of minimizing a function
-\f$L(\beta)\f$ subject to some constraints. The constraints model the requirement for the hyperplane to
-classify correctly all the training examples \f$x_{i}\f$. Formally,
-
-\f[\min_{\beta, \beta_{0}} L(\beta) = \frac{1}{2}||\beta||^{2} \text{ subject to } y_{i}(\beta^{T} x_{i} + \beta_{0}) \geq 1 \text{ } \forall i,\f]
-
-where \f$y_{i}\f$ represents each of the labels of the training examples.
-
-This is a problem of Lagrangian optimization that can be solved using Lagrange multipliers to obtain
-the weight vector \f$\beta\f$ and the bias \f$\beta_{0}\f$ of the optimal hyperplane.
-
-Source Code
------------
-
-@add_toggle_cpp
-- **Downloadable code**: Click
- [here](https://github.com/opencv/opencv/tree/5.x/samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp)
-
-- **Code at glance:**
- @include samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp
-@end_toggle
-
-@add_toggle_java
-- **Downloadable code**: Click
- [here](https://github.com/opencv/opencv/tree/5.x/samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java)
-
-- **Code at glance:**
- @include samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java
-@end_toggle
-
-@add_toggle_python
-- **Downloadable code**: Click
- [here](https://github.com/opencv/opencv/tree/5.x/samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py)
-
-- **Code at glance:**
- @include samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py
-@end_toggle
-
-Explanation
------------
-
-- **Set up the training data**
-
-The training data of this exercise is formed by a set of labeled 2D-points that belong to one of
-two different classes; one of the classes consists of one point and the other of three points.
-
-@add_toggle_cpp
-@snippet samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp setup1
-@end_toggle
-
-@add_toggle_java
-@snippet samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java setup1
-@end_toggle
-
-@add_toggle_python
-@snippet samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py setup1
-@end_toggle
-
-The function @ref cv::ml::SVM::train that will be used afterwards requires the training data to be
-stored as @ref cv::Mat objects of floats. Therefore, we create these objects from the arrays
-defined above:
-
-@add_toggle_cpp
-@snippet samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp setup2
-@end_toggle
-
-@add_toggle_java
-@snippet samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java setup2
-@end_toggle
-
-@add_toggle_python
-@snippet samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py setup1
-@end_toggle
-
-- **Set up SVM's parameters**
-
- In this tutorial we have introduced the theory of SVMs in the most simple case, when the
- training examples are spread into two classes that are linearly separable. However, SVMs can be
- used in a wide variety of problems (e.g. problems with non-linearly separable data, a SVM using
- a kernel function to raise the dimensionality of the examples, etc). As a consequence of this,
- we have to define some parameters before training the SVM. These parameters are stored in an
- object of the class @ref cv::ml::SVM.
-
-@add_toggle_cpp
-@snippet samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp init
-@end_toggle
-
-@add_toggle_java
-@snippet samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java init
-@end_toggle
-
-@add_toggle_python
-@snippet samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py init
-@end_toggle
-
-Here:
-- *Type of SVM*. We choose here the type @ref cv::ml::SVM::C_SVC "C_SVC" that can be used for
- n-class classification (n \f$\geq\f$ 2). The important feature of this type is that it deals
- with imperfect separation of classes (i.e. when the training data is non-linearly separable).
- This feature is not important here since the data is linearly separable and we chose this SVM
- type only for being the most commonly used.
-
-- *Type of SVM kernel*. We have not talked about kernel functions since they are not
- interesting for the training data we are dealing with. Nevertheless, let's explain briefly now
- the main idea behind a kernel function. It is a mapping done to the training data to improve
- its resemblance to a linearly separable set of data. This mapping consists of increasing the
- dimensionality of the data and is done efficiently using a kernel function. We choose here the
- type @ref cv::ml::SVM::LINEAR "LINEAR" which means that no mapping is done. This parameter is
- defined using cv::ml::SVM::setKernel.
-
-- *Termination criteria of the algorithm*. The SVM training procedure is implemented solving a
- constrained quadratic optimization problem in an **iterative** fashion. Here we specify a
-  maximum number of iterations and a tolerance error, so we allow the algorithm to finish in
-  fewer steps even if the optimal hyperplane has not been computed yet. This
- parameter is defined in a structure @ref cv::TermCriteria .
-
-- **Train the SVM**
- We call the method @ref cv::ml::SVM::train to build the SVM model.
-
-@add_toggle_cpp
-@snippet samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp train
-@end_toggle
-
-@add_toggle_java
-@snippet samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java train
-@end_toggle
-
-@add_toggle_python
-@snippet samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py train
-@end_toggle
-
-- **Regions classified by the SVM**
-
- The method @ref cv::ml::SVM::predict is used to classify an input sample using a trained SVM. In
- this example we have used this method in order to color the space depending on the prediction done
- by the SVM. In other words, an image is traversed interpreting its pixels as points of the
- Cartesian plane. Each of the points is colored depending on the class predicted by the SVM; in
- green if it is the class with label 1 and in blue if it is the class with label -1.
-
-@add_toggle_cpp
-@snippet samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp show
-@end_toggle
-
-@add_toggle_java
-@snippet samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java show
-@end_toggle
-
-@add_toggle_python
-@snippet samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py show
-@end_toggle
-
-- **Support vectors**
-
- We use here a couple of methods to obtain information about the support vectors.
-  The method @ref cv::ml::SVM::getSupportVectors obtains all of the support
-  vectors. We have used this method here to find the training examples that are
- support vectors and highlight them.
-
-@add_toggle_cpp
-@snippet samples/cpp/tutorial_code/ml/introduction_to_svm/introduction_to_svm.cpp show_vectors
-@end_toggle
-
-@add_toggle_java
-@snippet samples/java/tutorial_code/ml/introduction_to_svm/IntroductionToSVMDemo.java show_vectors
-@end_toggle
-
-@add_toggle_python
-@snippet samples/python/tutorial_code/ml/introduction_to_svm/introduction_to_svm.py show_vectors
-@end_toggle
-
-Results
--------
-
-- The code opens an image and shows the training examples of both classes. The points of one class
- are represented with white circles and black ones are used for the other class.
-- The SVM is trained and used to classify all the pixels of the image. This results in a division
- of the image in a blue region and a green region. The boundary between both regions is the
- optimal separating hyperplane.
-- Finally the support vectors are shown using gray rings around the training examples.
-
-
diff --git a/doc/tutorials/others/non_linear_svms.markdown b/doc/tutorials/others/non_linear_svms.markdown
deleted file mode 100644
index 5bfd8aae4c..0000000000
--- a/doc/tutorials/others/non_linear_svms.markdown
+++ /dev/null
@@ -1,288 +0,0 @@
-Support Vector Machines for Non-Linearly Separable Data {#tutorial_non_linear_svms}
-=======================================================
-
-@tableofcontents
-
-@prev_tutorial{tutorial_introduction_to_svm}
-@next_tutorial{tutorial_introduction_to_pca}
-
-| | |
-| -: | :- |
-| Original author | Fernando Iglesias García |
-| Compatibility | OpenCV >= 3.0 |
-
-Goal
-----
-
-In this tutorial you will learn how to:
-
-- Define the optimization problem for SVMs when it is not possible to separate linearly the
- training data.
-- How to configure the parameters to adapt your SVM for this class of problems.
-
-Motivation
-----------
-
-Why is it interesting to extend the SVM optimization problem in order to handle non-linearly separable
-training data? Most of the applications in which SVMs are used in computer vision require a more
-powerful tool than a simple linear classifier. This stems from the fact that in these tasks __the
-training data can rarely be separated using a hyperplane__.
-
-Consider one of these tasks, for example, face detection. The training data in this case is composed
-of a set of images that are faces and another set of images that are non-faces (_everything
-in the world except faces_). This training data is too complex to find a representation
-of each sample (_feature vector_) that could make the whole set of faces linearly separable from the
-whole set of non-faces.
-
-Extension of the Optimization Problem
--------------------------------------
-
-Remember that using SVMs we obtain a separating hyperplane. Therefore, since the training data is
-now non-linearly separable, we must admit that the hyperplane found will misclassify some of the
-samples. This _misclassification_ is a new variable in the optimization that must be taken into
-account. The new model has to include both the old requirement of finding the hyperplane that gives
-the biggest margin and the new one of generalizing the training data correctly by not allowing too
-many classification errors.
-
-We start here from the formulation of the optimization problem of finding the hyperplane which
-maximizes the __margin__ (this is explained in the previous tutorial, @ref tutorial_introduction_to_svm):
-
-\f[\min_{\beta, \beta_{0}} L(\beta) = \frac{1}{2}||\beta||^{2} \text{ subject to } y_{i}(\beta^{T} x_{i} + \beta_{0}) \geq 1 \text{ } \forall i\f]
-
-There are multiple ways in which this model can be modified so it takes into account the
-misclassification errors. For example, one could think of minimizing the same quantity plus a
-constant times the number of misclassification errors in the training data, i.e.:
-
-\f[\min ||\beta||^{2} + C \text{(misclassification errors)}\f]
-
-However, this one is not a very good solution since, among some other reasons, we do not distinguish
-between samples that are misclassified with a small distance to their appropriate decision region and
-samples that are not. Therefore, a better solution will take into account the _distance of the
-misclassified samples to their correct decision regions_, i.e.:
-
-\f[\min ||\beta||^{2} + C \text{(distance of misclassified samples to their correct regions)}\f]
-
-For each sample of the training data a new parameter \f$\xi_{i}\f$ is defined. Each one of these
-parameters contains the distance from its corresponding training sample to its correct decision
-region. The following picture shows non-linearly separable training data from two classes, a
-separating hyperplane and the distances to their correct regions of the samples that are
-misclassified.
-
-
-
-@note Only the distances of the samples that are misclassified are shown in the picture. The
-distances of the rest of the samples are zero since they already lie in their correct decision
-region.
-
-The red and blue lines that appear on the picture are the margins to each one of the
-decision regions. It is very __important__ to realize that each of the \f$\xi_{i}\f$ goes from a
-misclassified training sample to the margin of its appropriate region.
-
-Finally, the new formulation for the optimization problem is:
-
-\f[\min_{\beta, \beta_{0}} L(\beta) = ||\beta||^{2} + C \sum_{i} {\xi_{i}} \text{ subject to } y_{i}(\beta^{T} x_{i} + \beta_{0}) \geq 1 - \xi_{i} \text{ and } \xi_{i} \geq 0 \text{ } \forall i\f]
-
-How should the parameter C be chosen? It is obvious that the answer to this question depends on how
-the training data is distributed. Although there is no general answer, it is useful to take into
-account these rules:
-
-- Large values of C give solutions with _fewer misclassification errors_ but a _smaller margin_.
-  Consider that in this case it is expensive to make misclassification errors. Since the aim of
-  the optimization is to minimize the argument, few misclassification errors are allowed.
-- Small values of C give solutions with a _bigger margin_ and _more classification errors_. In this
-  case the minimization does not weight the sum term as heavily, so it focuses more on
-  finding a hyperplane with a big margin.
-
-Source Code
------------
-
-You may also find the source code in `samples/cpp/tutorial_code/ml/non_linear_svms` folder of the OpenCV source library or
-[download it from here](https://github.com/opencv/opencv/tree/5.x/samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp).
-
-@add_toggle_cpp
-- **Downloadable code**: Click
- [here](https://github.com/opencv/opencv/tree/5.x/samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp)
-
-- **Code at glance:**
- @include samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp
-@end_toggle
-
-@add_toggle_java
-- **Downloadable code**: Click
- [here](https://github.com/opencv/opencv/tree/5.x/samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java)
-
-- **Code at glance:**
- @include samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java
-@end_toggle
-
-@add_toggle_python
-- **Downloadable code**: Click
- [here](https://github.com/opencv/opencv/tree/5.x/samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py)
-
-- **Code at glance:**
- @include samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py
-@end_toggle
-
-Explanation
------------
-
-- __Set up the training data__
-
-The training data of this exercise is formed by a set of labeled 2D-points that belong to one of
-two different classes. To make the exercise more appealing, the training data is generated
-randomly using uniform probability density functions (PDFs).
-
-We have divided the generation of the training data into two main parts.
-
-In the first part we generate data for both classes that is linearly separable.
-
-@add_toggle_cpp
-@snippet samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp setup1
-@end_toggle
-
-@add_toggle_java
-@snippet samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java setup1
-@end_toggle
-
-@add_toggle_python
-@snippet samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py setup1
-@end_toggle
-
-In the second part we create data for both classes that is non-linearly separable, data that
-overlaps.
-
-@add_toggle_cpp
-@snippet samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp setup2
-@end_toggle
-
-@add_toggle_java
-@snippet samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java setup2
-@end_toggle
-
-@add_toggle_python
-@snippet samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py setup2
-@end_toggle
-
-- __Set up SVM's parameters__
-
-@note In the previous tutorial @ref tutorial_introduction_to_svm there is an explanation of the
-attributes of the class @ref cv::ml::SVM that we configure here before training the SVM.
-
-@add_toggle_cpp
-@snippet samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp init
-@end_toggle
-
-@add_toggle_java
-@snippet samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java init
-@end_toggle
-
-@add_toggle_python
-@snippet samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py init
-@end_toggle
-
-There are just two differences between the configuration used here and the one used in the
-previous tutorial (@ref tutorial_introduction_to_svm), which we take as reference.
-
-- _C_. We chose a small value of this parameter here so that misclassification errors are not
-  penalized too heavily in the optimization. We do this because we want to obtain a solution close
-  to the intuitively expected one. However, we recommend getting a better insight into the problem
-  by adjusting this parameter yourself.
-
-  @note In this case there are only very few points in the overlapping region between the classes.
-  By giving a smaller value to __FRAC_LINEAR_SEP__ the density of points can be increased and the
-  impact of the parameter _C_ explored more deeply.
-
-- _Termination Criteria of the algorithm_. The maximum number of iterations has to be increased
-  considerably in order to correctly solve a problem with non-linearly separable training data. In
-  particular, we have increased this value by five orders of magnitude.
-
-- __Train the SVM__
-
-We call the method @ref cv::ml::SVM::train to build the SVM model. Be aware that the training
-process may take quite a long time. Have patience when you run the program.
-
-@add_toggle_cpp
-@snippet samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp train
-@end_toggle
-
-@add_toggle_java
-@snippet samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java train
-@end_toggle
-
-@add_toggle_python
-@snippet samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py train
-@end_toggle
-
-- __Show the Decision Regions__
-
-The method @ref cv::ml::SVM::predict is used to classify an input sample using a trained SVM. In
-this example we use this method to color the space depending on the prediction made by the SVM. In
-other words, the image is traversed, interpreting its pixels as points of the Cartesian plane.
-Each point is colored depending on the class predicted by the SVM: dark green for the class with
-label 1 and dark blue for the class with label 2.
-
-@add_toggle_cpp
-@snippet samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp show
-@end_toggle
-
-@add_toggle_java
-@snippet samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java show
-@end_toggle
-
-@add_toggle_python
-@snippet samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py show
-@end_toggle
-
-- __Show the training data__
-
-The method @ref cv::circle is used to show the samples that compose the training data. The samples
-of the class labeled 1 are shown in light green, and the samples of the class labeled 2 are shown
-in light blue.
-
-@add_toggle_cpp
-@snippet samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp show_data
-@end_toggle
-
-@add_toggle_java
-@snippet samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java show_data
-@end_toggle
-
-@add_toggle_python
-@snippet samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py show_data
-@end_toggle
-
-- __Support vectors__
-
-The method @ref cv::ml::SVM::getSupportVectors obtains all the support vectors. We use it here to
-find the training examples that are support vectors and highlight them.
-
-@add_toggle_cpp
-@snippet samples/cpp/tutorial_code/ml/non_linear_svms/non_linear_svms.cpp show_vectors
-@end_toggle
-
-@add_toggle_java
-@snippet samples/java/tutorial_code/ml/non_linear_svms/NonLinearSVMsDemo.java show_vectors
-@end_toggle
-
-@add_toggle_python
-@snippet samples/python/tutorial_code/ml/non_linear_svms/non_linear_svms.py show_vectors
-@end_toggle
-
-Results
--------
-
-- The code opens an image and shows the training examples of both classes. The points of one class
-  are represented in light green and those of the other class in light blue.
-- The SVM is trained and used to classify all the pixels of the image. This results in a division
-  of the image into a blue region and a green region. The boundary between both regions is the
-  separating hyperplane. Since the training data is non-linearly separable, some examples of both
-  classes are misclassified; some green points lie in the blue region and some blue points lie in
-  the green one.
-- Finally, the support vectors are shown using gray rings around the training examples.
-
-
-
-You may observe a runtime instance of this [on YouTube here](https://www.youtube.com/watch?v=vFv2yPcSo-Q).
-
-@youtube{vFv2yPcSo-Q}
diff --git a/doc/tutorials/others/table_of_content_other.markdown b/doc/tutorials/others/table_of_content_other.markdown
index b4bbf62777..f6fe601fa2 100644
--- a/doc/tutorials/others/table_of_content_other.markdown
+++ b/doc/tutorials/others/table_of_content_other.markdown
@@ -1,4 +1,4 @@
-Other tutorials (ml, objdetect, photo, stitching, video) {#tutorial_table_of_content_other}
+Other tutorials (objdetect, photo, stitching, video) {#tutorial_table_of_content_other}
========================================================
- photo. @subpage tutorial_hdr_imaging
@@ -9,6 +9,4 @@ Other tutorials (ml, objdetect, photo, stitching, video) {#tutorial_table_of_con
- objdetect. @subpage tutorial_cascade_classifier
- objdetect. @subpage tutorial_traincascade
- objdetect. @subpage tutorial_barcode_detect_and_decode
-- ml. @subpage tutorial_introduction_to_svm
-- ml. @subpage tutorial_non_linear_svms
- ml. @subpage tutorial_introduction_to_pca
diff --git a/doc/tutorials/tutorials.markdown b/doc/tutorials/tutorials.markdown
index c8aae6ab56..75b0f8fa43 100644
--- a/doc/tutorials/tutorials.markdown
+++ b/doc/tutorials/tutorials.markdown
@@ -10,7 +10,7 @@ OpenCV Tutorials {#tutorial_root}
- @subpage tutorial_table_of_content_features2d - feature detectors, descriptors and matching framework
- @subpage tutorial_table_of_content_dnn - infer neural networks using built-in _dnn_ module
- @subpage tutorial_table_of_content_gapi - graph-based approach to computer vision algorithms building
-- @subpage tutorial_table_of_content_other - other modules (ml, objdetect, stitching, video, photo)
+- @subpage tutorial_table_of_content_other - other modules (objdetect, stitching, video, photo)
- @subpage tutorial_table_of_content_ios - running OpenCV on an iDevice
- @subpage tutorial_table_of_content_3d - 3d objects processing and visualisation
@cond CUDA_MODULES
diff --git a/modules/CMakeLists.txt b/modules/CMakeLists.txt
index 10b72f8880..b61cf9deb1 100644
--- a/modules/CMakeLists.txt
+++ b/modules/CMakeLists.txt
@@ -20,7 +20,7 @@ foreach(mod ${OPENCV_MODULES_BUILD} ${OPENCV_MODULES_DISABLED_USER} ${OPENCV_MOD
endforeach()
ocv_list_sort(OPENCV_MODULES_MAIN)
ocv_list_sort(OPENCV_MODULES_EXTRA)
-set(FIXED_ORDER_MODULES core imgproc imgcodecs videoio highgui video 3d stereo features2d calib objdetect dnn ml flann photo stitching)
+set(FIXED_ORDER_MODULES core imgproc imgcodecs videoio highgui video 3d stereo features2d calib objdetect dnn flann photo stitching)
list(REMOVE_ITEM OPENCV_MODULES_MAIN ${FIXED_ORDER_MODULES})
set(OPENCV_MODULES_MAIN ${FIXED_ORDER_MODULES} ${OPENCV_MODULES_MAIN})
diff --git a/modules/ml/CMakeLists.txt b/modules/ml/CMakeLists.txt
deleted file mode 100644
index e1d5f3100b..0000000000
--- a/modules/ml/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-set(the_description "Machine Learning")
-ocv_define_module(ml opencv_core WRAP java objc python)
diff --git a/modules/ml/doc/ml_intro.markdown b/modules/ml/doc/ml_intro.markdown
deleted file mode 100644
index f49e378e79..0000000000
--- a/modules/ml/doc/ml_intro.markdown
+++ /dev/null
@@ -1,481 +0,0 @@
-Machine Learning Overview {#ml_intro}
-=========================
-
-[TOC]
-
-Training Data {#ml_intro_data}
-=============
-
-In machine learning algorithms there is notion of training data. Training data includes several
-components:
-
-- A set of training samples. Each training sample is a vector of values (in Computer Vision it is
-  sometimes referred to as a feature vector). Usually all the vectors have the same number of
-  components (features); the OpenCV ml module assumes that. Each feature can be ordered (i.e. its
-  values are floating-point numbers that can be compared with each other and strictly ordered,
-  i.e. sorted) or categorical (i.e. its value belongs to a fixed set of values that can be
-  integers, strings, etc.).
-- An optional set of responses corresponding to the samples. Training data with no responses is
-  used in unsupervised learning algorithms that learn the structure of the supplied data based on
-  distances between different samples. Training data with responses is used in supervised learning
-  algorithms, which learn the function mapping samples to responses. Usually the responses are
-  scalar values, either ordered (in a regression problem) or categorical (in a classification
-  problem; in this case the responses are often called "labels"). Some algorithms, most notably
-  neural networks, can handle not only scalar but also multi-dimensional or vector responses.
-- Another optional component is the mask of missing measurements. Most algorithms require all the
-  components in all the training samples to be valid, but some others, such as decision trees, can
-  handle cases of missing measurements.
-- In the case of a classification problem the user may want to give different weights to different
-  classes. This is useful, for example, when:
-  - the user wants to shift prediction accuracy towards a lower false-alarm rate or a higher
-    hit-rate.
-  - the user wants to compensate for significantly different amounts of training samples from
-    different classes.
-- In addition to that, each training sample may be given a weight, if the user wants the algorithm
-  to pay special attention to certain training samples and adjust the training model accordingly.
-- Also, the user may wish not to use the whole training data at once, but rather use parts of it,
-  e.g. to do parameter optimization via a cross-validation procedure.
-
-As you can see, training data can have a rather complex structure; besides, it may be very big
-and/or not entirely available, so there is a need for an abstraction of this concept. In OpenCV ml
-the cv::ml::TrainData class serves this purpose.
-
-@sa cv::ml::TrainData
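-
-As an illustration only (not taken from the OpenCV samples; the helper name `makeToyData` and the
-data values are made up), a TrainData object could be assembled from in-memory matrices roughly as
-follows:
-
-@code{.cpp}
-#include <opencv2/core.hpp>
-#include <opencv2/ml.hpp>
-using namespace cv;
-
-static Ptr<ml::TrainData> makeToyData()
-{
-    // Four 2D samples (one per row, CV_32F) and their class labels (CV_32S).
-    Mat samples = (Mat_<float>(4, 2) << 1, 1,  1, 2,  5, 5,  6, 5);
-    Mat labels  = (Mat_<int>(4, 1)   << 0, 0, 1, 1);
-
-    // ROW_SAMPLE means each row of 'samples' is one training sample.
-    Ptr<ml::TrainData> data = ml::TrainData::create(samples, ml::ROW_SAMPLE, labels);
-
-    // The same object can later be split, e.g. 75% for training and the rest for testing.
-    data->setTrainTestSplitRatio(0.75, /*shuffle=*/true);
-    return data;
-}
-@endcode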
-
-Normal Bayes Classifier {#ml_intro_bayes}
-=======================
-
-This simple classification model assumes that feature vectors from each class are normally
-distributed (though not necessarily independently distributed). So, the whole data distribution
-function is assumed to be a Gaussian mixture, one component per class. Using the training data the
-algorithm estimates mean vectors and covariance matrices for every class, and then it uses them for
-prediction.
-
-@sa cv::ml::NormalBayesClassifier
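-
-A minimal, illustrative sketch (the helper name and matrices are hypothetical) of training the
-classifier and predicting the class of one query sample:
-
-@code{.cpp}
-#include <opencv2/core.hpp>
-#include <opencv2/ml.hpp>
-using namespace cv;
-
-// Train a normal Bayes classifier on row samples (CV_32F) with integer class labels (CV_32S),
-// then return the predicted label of 'query' (a single CV_32F row).
-static float classifyWithBayes(const Mat& trainSamples, const Mat& trainLabels, const Mat& query)
-{
-    Ptr<ml::NormalBayesClassifier> bayes = ml::NormalBayesClassifier::create();
-    bayes->train(trainSamples, ml::ROW_SAMPLE, trainLabels);
-    return bayes->predict(query);   // predict() returns the label of the first sample as a float
-}
-@endcode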
-
-K-Nearest Neighbors {#ml_intro_knn}
-===================
-
-The algorithm caches all training samples and predicts the response for a new sample by analyzing
-a certain number (__K__) of the nearest neighbors of the sample, using voting, calculating a
-weighted sum, and so on. The method is sometimes referred to as "learning by example" because for
-prediction it looks for the feature vector with a known response that is closest to the given vector.
-
-@sa cv::ml::KNearest
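-
-For illustration (the helper name and K value are arbitrary), classifying query rows by majority
-vote among the K nearest training samples could look like this:
-
-@code{.cpp}
-#include <opencv2/core.hpp>
-#include <opencv2/ml.hpp>
-using namespace cv;
-
-// All matrices are CV_32F with one row per sample; 'trainResponses' holds the class labels.
-static Mat classifyWithKnn(const Mat& trainSamples, const Mat& trainResponses,
-                           const Mat& query, int K = 3)
-{
-    Ptr<ml::KNearest> knn = ml::KNearest::create();
-    knn->setIsClassifier(true);
-    knn->train(trainSamples, ml::ROW_SAMPLE, trainResponses);
-
-    Mat results;                      // predicted label for each query row
-    knn->findNearest(query, K, results);
-    return results;
-}
-@endcode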
-
-Support Vector Machines {#ml_intro_svm}
-=======================
-
-Originally, support vector machines (SVM) were a technique for building an optimal binary (2-class)
-classifier. Later the technique was extended to regression and clustering problems. SVM is a
-particular case of kernel-based methods. It maps feature vectors into a higher-dimensional space
-using a kernel function and builds an optimal linear discriminating function in this space, or an
-optimal hyperplane that fits the training data. In the case of SVM, the kernel is not defined
-explicitly. Instead, a distance between any 2 points in the hyper-space needs to be defined.
-
-The solution is optimal, which means that the margin between the separating hyper-plane and the
-nearest feature vectors from both classes (in case of 2-class classifier) is maximal. The feature
-vectors that are the closest to the hyper-plane are called _support vectors_, which means that the
-position of other vectors does not affect the hyper-plane (the decision function).
-
-The SVM implementation in OpenCV is based on @cite LibSVM .
-
-@sa cv::ml::SVM
-
-Prediction with SVM {#ml_intro_svm_predict}
--------------------
-
-StatModel::predict(samples, results, flags) should be used. Pass flags=StatModel::RAW_OUTPUT to get
-the raw response from the SVM (in the case of a regression, 1-class or 2-class classification problem).
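-
-For instance (an illustrative fragment, assuming an already-trained model and a hypothetical
-helper name):
-
-@code{.cpp}
-#include <opencv2/core.hpp>
-#include <opencv2/ml.hpp>
-using namespace cv;
-
-// Get both the predicted labels and the raw SVM responses for 'samples'
-// (a CV_32F matrix with one sample per row), given a trained cv::ml::SVM.
-static Mat predictRaw(const Ptr<ml::SVM>& svm, const Mat& samples)
-{
-    Mat labels, rawResponses;
-    svm->predict(samples, labels);                                    // class labels
-    svm->predict(samples, rawResponses, ml::StatModel::RAW_OUTPUT);   // raw decision values
-    return rawResponses;
-}
-@endcode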
-
-Decision Trees {#ml_intro_trees}
-==============
-
-The ML classes discussed in this section implement Classification and Regression Tree algorithms
-described in @cite Breiman84 .
-
-The class cv::ml::DTrees represents a single decision tree or a collection of decision trees. It's
-also a base class for RTrees and Boost.
-
-A decision tree is a binary tree (a tree where each non-leaf node has two child nodes). It can be
-used either for classification or for regression. For classification, each tree leaf is marked
-with a class label; multiple leaves may have the same label. For regression, a constant is also
-assigned to each tree leaf, so the approximation function is piecewise constant.
-
-@sa cv::ml::DTrees
-
-Predicting with Decision Trees {#ml_intro_trees_predict}
-------------------------------
-
-To reach a leaf node and to obtain a response for the input feature vector, the prediction procedure
-starts with the root node. From each non-leaf node the procedure goes to the left (selects the left
-child node as the next observed node) or to the right based on the value of a certain variable whose
-index is stored in the observed node. The variable can be of one of the following types:
-
-- __Ordered variables.__ The variable value is compared with a threshold that is also stored in
- the node. If the value is less than the threshold, the procedure goes to the left. Otherwise, it
- goes to the right. For example, if the weight is less than 1 kilogram, the procedure goes to the
- left, else to the right.
-
-- __Categorical variables.__ A discrete variable value is tested to see whether it belongs to a
- certain subset of values (also stored in the node) from a limited set of values the variable
- could take. If it does, the procedure goes to the left. Otherwise, it goes to the right. For
- example, if the color is green or red, go to the left, else to the right.
-
-So, in each node, a pair of entities (variable_index, `decision_rule (threshold/subset)`) is used.
-This pair is called a _split_ (a split on the variable variable_index). Once a leaf node is
-reached, the value assigned to this node is used as the output of the prediction procedure.
-
-Sometimes, certain features of the input vector are missing (for example, in the dark it is
-difficult to determine the object color), and the prediction procedure may get stuck at a certain
-node (in the mentioned example, if the node is split by color). To avoid such situations, decision
-trees use so-called _surrogate splits_. That is, in addition to the best "primary" split, every
-tree node may also be split on one or more other variables with nearly the same results.
-
-Training Decision Trees {#ml_intro_trees_train}
------------------------
-
-The tree is built recursively, starting from the root node. All training data (feature vectors and
-responses) is used to split the root node. In each node the optimum decision rule (the best
-"primary" split) is found based on some criterion: for classification the Gini "purity" criterion
-is used, and for regression the sum of squared errors is used. Then, if necessary, the surrogate
-splits are found. They resemble the results of the primary split on the training data. All the
-data is divided between the left and the right child node using the primary and the surrogate
-splits (as is done in the prediction procedure). Then, the procedure recursively splits both the
-left and right nodes. At each node the recursive procedure may stop (that is, stop splitting the
-node further) in one of the following cases:
-
-- The depth of the constructed tree branch has reached the specified maximum value.
-- The number of training samples in the node is less than the specified threshold, in which case
-  it is not statistically representative to split the node further.
-- All the samples in the node belong to the same class or, in the case of regression, the
-  variation is too small.
-- The best found split does not give any noticeable improvement compared to a random choice.
-
-When the tree is built, it may be pruned using a cross-validation procedure, if necessary. That is,
-some branches of the tree that may lead to the model overfitting are cut off. Normally, this
-procedure is only applied to standalone decision trees. Usually tree ensembles build trees that are
-small enough and use their own protection schemes against overfitting.
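-
-The following fragment is illustrative only (the parameter values and helper name are arbitrary)
-and shows how a single decision tree could be configured and trained on a TrainData object:
-
-@code{.cpp}
-#include <opencv2/core.hpp>
-#include <opencv2/ml.hpp>
-using namespace cv;
-
-// Configure and train a single decision tree on labeled samples wrapped in a TrainData object.
-static Ptr<ml::DTrees> trainTree(const Ptr<ml::TrainData>& data)
-{
-    Ptr<ml::DTrees> tree = ml::DTrees::create();
-    tree->setMaxDepth(10);          // stop splitting branches at this depth
-    tree->setMinSampleCount(5);     // do not split nodes with fewer samples
-    tree->setCVFolds(0);            // 0 disables the built-in cross-validation pruning
-    tree->setUseSurrogates(false);  // no surrogate splits in this minimal setup
-    tree->train(data);
-    return tree;
-}
-@endcode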
-
-Variable Importance {#ml_intro_trees_var}
--------------------
-
-Besides prediction, which is an obvious use of decision trees, a tree can also be used for various
-data analyses. One of the key properties of the constructed decision tree algorithms is the
-ability to compute the importance (relative decisive power) of each variable. For example, in a
-spam filter that uses the set of words occurring in a message as a feature vector, the variable
-importance rating can be used to determine the most "spam-indicating" words and thus help keep the
-dictionary size reasonable.
-
-The importance of each variable is computed over all the splits on this variable in the tree, both
-primary and surrogate ones. Thus, to compute variable importance correctly, the surrogate splits
-must be enabled in the training parameters, even if there is no missing data.
-
-Boosting {#ml_intro_boost}
-========
-
-A common machine learning task is supervised learning. In supervised learning, the goal is to learn
-the functional relationship \f$F: y = F(x)\f$ between the input \f$x\f$ and the output \f$y\f$ .
-Predicting the qualitative output is called _classification_, while predicting the quantitative
-output is called _regression_.
-
-Boosting is a powerful learning concept that provides a solution to the supervised classification
-learning task. It combines the performance of many "weak" classifiers to produce a powerful
-committee @cite HTF01 . A weak classifier is only required to be better than chance, and thus can
-be very simple and computationally inexpensive. However, many of them smartly combined yield a
-strong classifier that often outperforms most "monolithic" strong classifiers such as SVMs and
-neural networks.
-
-Decision trees are the most popular weak classifiers used in boosting schemes. Often the simplest
-decision trees with only a single split node per tree (called _stumps_) are sufficient.
-
-The boosted model is based on \f$N\f$ training examples \f$\{(x_i,y_i)\}_{1}^{N}\f$ with \f$x_i \in \mathbb{R}^K\f$
-and \f$y_i \in \{-1, +1\}\f$ . \f$x_i\f$ is a \f$K\f$-component vector. Each component encodes a
-feature relevant to the learning task at hand. The desired two-class output is encoded as -1 and +1.
-
-Different variants of boosting are known as Discrete AdaBoost, Real AdaBoost, LogitBoost, and
-Gentle AdaBoost @cite FHT98 . All of them are very similar in their overall structure. Therefore,
-this chapter focuses only on the standard two-class Discrete AdaBoost algorithm, outlined below.
-Initially the same weight is assigned to each sample (step 2). Then, a weak classifier
-\f$f_m(x)\f$ is trained on the weighted training data (step 3a). Its weighted training error and
-scaling factor \f$c_m\f$ are computed (step 3b). The weights are increased for training samples
-that have been misclassified (step 3c). All weights are then normalized, and the process of
-finding the next weak classifier continues for another \f$M-1\f$ times. The final classifier
-\f$F(x)\f$ is the sign of the weighted sum over the individual weak classifiers (step 4).
-
-__Two-class Discrete AdaBoost Algorithm__
-
-- Set \f$N\f$ examples \f$\{(x_i,y_i)\}_{1}^{N}\f$ with \f$x_i \in \mathbb{R}^K, y_i \in \{-1, +1\}\f$ .
-
-- Assign weights as \f$w_i = 1/N, i = 1,...,N\f$ .
-
-- Repeat for \f$m = 1,2,...,M\f$ :
-
-    - Fit the classifier \f$f_m(x) \in \{-1,1\}\f$, using weights \f$w_i\f$ on the training data.
-
-    - Compute \f$err_m = E_w [1_{(y \neq f_m(x))}], \quad c_m = \log((1 - err_m)/err_m)\f$ .
-
-    - Set \f$w_i \Leftarrow w_i \exp[c_m 1_{(y_i \neq f_m(x_i))}], i = 1,2,...,N,\f$ and
-        renormalize so that \f$\sum_i w_i = 1\f$ .
-
-- Classify new samples _x_ using the formula: \f$\textrm{sign} \left( \sum_{m=1}^{M} c_m f_m(x) \right)\f$ .
-
-@note Similar to the classical boosting methods, the current implementation supports two-class
-classifiers only. For M \> 2 classes, there is the __AdaBoost.MH__ algorithm (described in
-@cite FHT98) that reduces the problem to the two-class problem, yet with a much larger training set.
-
-To reduce computation time for boosted models without substantially losing accuracy, the influence
-trimming technique can be employed. As the training algorithm proceeds and the number of trees in
-the ensemble is increased, a larger number of the training samples are classified correctly and
-with increasing confidence, and thereby those samples receive smaller weights on the subsequent
-iterations. Examples with a very low relative weight have a small impact on the weak classifier
-training. Thus, such examples may be excluded during the weak classifier training without having
-much effect on the induced classifier. This process is controlled with the weight_trim_rate
-parameter. Only the examples whose cumulative weight makes up the fraction weight_trim_rate of the
-total weight mass are used in the weak classifier training. Note that the weights for __all__
-training examples are recomputed at each training iteration. Examples deleted at a particular
-iteration may be used again when learning some of the further weak classifiers @cite FHT98 .
-
-@sa cv::ml::Boost
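-
-As an illustration only (the parameter values and helper name are arbitrary, not taken from the
-OpenCV samples), a boosted ensemble of stumps could be configured like this:
-
-@code{.cpp}
-#include <opencv2/core.hpp>
-#include <opencv2/ml.hpp>
-using namespace cv;
-
-// Configure and train a Discrete AdaBoost ensemble of decision stumps.
-static Ptr<ml::Boost> trainBoost(const Ptr<ml::TrainData>& data)
-{
-    Ptr<ml::Boost> boost = ml::Boost::create();
-    boost->setBoostType(ml::Boost::DISCRETE);  // two-class Discrete AdaBoost
-    boost->setWeakCount(100);                  // number of weak classifiers (trees)
-    boost->setMaxDepth(1);                     // depth 1 -> decision stumps
-    boost->setWeightTrimRate(0.95);            // influence trimming, as described above
-    boost->train(data);
-    return boost;
-}
-@endcode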
-
-Prediction with Boost {#ml_intro_boost_predict}
----------------------
-StatModel::predict(samples, results, flags) should be used. Pass flags=StatModel::RAW_OUTPUT to get
-the raw sum from Boost classifier.
-
-Random Trees {#ml_intro_rtrees}
-============
-
-Random trees have been introduced by Leo Breiman and Adele Cutler. The algorithm can deal with
-both classification and regression problems. Random trees is a collection (ensemble) of tree
-predictors that is called a _forest_ further in this section (the term has also been introduced by
-L. Breiman). The classification works as follows: the random trees classifier takes the input
-feature vector, classifies it with every tree in the forest, and outputs the class label that
-received the majority of "votes". In the case of a regression, the classifier response is the
-average of the responses over all the trees in the forest.
-
-All the trees are trained with the same parameters but on different training sets. These sets are
-generated from the original training set using the bootstrap procedure: for each training set, you
-randomly select the same number of vectors as in the original set (= N). The vectors are chosen
-with replacement. That is, some vectors will occur more than once and some will be absent. At each
-node of each trained tree, not all the variables are used to find the best split, but a random
-subset of them. A new subset is generated for each node. However, its size is fixed for all the
-nodes and all the trees. It is a training parameter set to \f$\sqrt{number\_of\_variables}\f$ by
-default. None of the built trees are pruned.
-
-In random trees there is no need for any accuracy estimation procedures, such as cross-validation or
-bootstrap, or a separate test set to get an estimate of the training error. The error is estimated
-internally during the training. When the training set for the current tree is drawn by sampling with
-replacement, some vectors are left out (so-called _oob (out-of-bag) data_ ). The size of oob data is
-about N/3 . The classification error is estimated by using this oob-data as follows:
-
-- Get a prediction for each vector that is oob relative to the i-th tree, using that very i-th
-  tree.
-
-- After all the trees have been trained, for each vector that has ever been oob, find the
- class-winner for it (the class that has got the majority of votes in the trees where
- the vector was oob) and compare it to the ground-truth response.
-
-- Compute the classification error estimate as the ratio of the number of misclassified oob
-  vectors to all the vectors in the original data. In the case of regression, the oob-error is
-  computed as the sum of squared differences between the oob predictions and the true responses,
-  divided by the total number of vectors.
-
-For a random trees usage example, please see the letter_recog.cpp sample in the OpenCV distribution.
-
-@sa cv::ml::RTrees
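-
-A minimal illustrative configuration (the parameter values and helper name are arbitrary) that
-also requests variable importance estimation could look as follows:
-
-@code{.cpp}
-#include <opencv2/core.hpp>
-#include <opencv2/ml.hpp>
-using namespace cv;
-
-// Configure and train a random forest on a TrainData object.
-static Ptr<ml::RTrees> trainForest(const Ptr<ml::TrainData>& data)
-{
-    Ptr<ml::RTrees> forest = ml::RTrees::create();
-    forest->setMaxDepth(10);
-    forest->setActiveVarCount(0);                 // 0 -> use sqrt(number_of_variables) per node
-    forest->setCalculateVarImportance(true);
-    // Grow up to 100 trees, or stop earlier once the OOB error change is small enough.
-    forest->setTermCriteria(TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 100, 0.01));
-    forest->train(data);
-
-    Mat importance = forest->getVarImportance();  // one value per input variable
-    return forest;
-}
-@endcode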
-
-__References:__
-
-- _Machine Learning_, Wald I, July 2002.
-
-- _Looking Inside the Black Box_, Wald II, July 2002.
-
-- _Software for the Masses_, Wald III, July 2002.
-
-- And other articles from Leo Breiman's Random Forests web site.
-
-
-Expectation Maximization {#ml_intro_em}
-========================
-
-The Expectation Maximization (EM) algorithm estimates the parameters of a multivariate probability
-density function in the form of a Gaussian mixture distribution with a specified number of mixtures.
-
-Consider a set of N feature vectors { \f$x_1, x_2,...,x_{N}\f$ } from a d-dimensional Euclidean
-space drawn from a Gaussian mixture:
-
-\f[p(x;a_k,S_k, \pi _k) = \sum _{k=1}^{m} \pi _kp_k(x), \quad \pi _k \geq 0, \quad \sum _{k=1}^{m} \pi _k=1,\f]
-
-\f[p_k(x)= \varphi (x;a_k,S_k)= \frac{1}{(2\pi)^{d/2}\mid{S_k}\mid^{1/2}} exp \left \{ - \frac{1}{2} (x-a_k)^TS_k^{-1}(x-a_k) \right \} ,\f]
-
-where \f$m\f$ is the number of mixtures, \f$p_k\f$ is the normal distribution density with the
-mean \f$a_k\f$ and covariance matrix \f$S_k\f$, and \f$\pi_k\f$ is the weight of the k-th mixture.
-Given the number of mixtures \f$m\f$ and the samples \f$x_i\f$, \f$i=1..N\f$, the algorithm finds
-the maximum-likelihood estimates (MLE) of all the mixture parameters, that is, \f$a_k\f$,
-\f$S_k\f$ and \f$\pi_k\f$ :
-
-\f[L(x, \theta )= \log p(x, \theta )= \sum _{i=1}^{N} \log \left ( \sum _{k=1}^{m} \pi _k p_k(x_i) \right ) \to \max _{ \theta \in \Theta },\f]
-
-\f[\Theta = \left \{ (a_k,S_k, \pi _k): a_k \in \mathbb{R} ^d,S_k=S_k^T>0,S_k \in \mathbb{R} ^{d \times d}, \pi _k \geq 0, \sum _{k=1}^{m} \pi _k=1 \right \} .\f]
-
-The EM algorithm is an iterative procedure. Each iteration includes two steps. At the first step
-(Expectation step or E-step), you find the probability \f$p_{i,k}\f$ (denoted \f$\alpha_{ki}\f$ in
-the formula below) that sample i belongs to mixture k, using the currently available mixture
-parameter estimates:
-
-\f[\alpha _{ki} = \frac{\pi_k\varphi(x;a_k,S_k)}{\sum\limits_{j=1}^{m}\pi_j\varphi(x;a_j,S_j)} .\f]
-
-At the second step (Maximization step or M-step), the mixture parameter estimates are refined using
-the computed probabilities:
-
-\f[\pi _k= \frac{1}{N} \sum _{i=1}^{N} \alpha _{ki}, \quad a_k= \frac{\sum\limits_{i=1}^{N}\alpha_{ki}x_i}{\sum\limits_{i=1}^{N}\alpha_{ki}} , \quad S_k= \frac{\sum\limits_{i=1}^{N}\alpha_{ki}(x_i-a_k)(x_i-a_k)^T}{\sum\limits_{i=1}^{N}\alpha_{ki}}\f]
-
-Alternatively, the algorithm may start with the M-step when the initial values for \f$p_{i,k}\f$
-can be provided. Another alternative, when \f$p_{i,k}\f$ are unknown, is to use a simpler
-clustering algorithm to pre-cluster the input samples and thus obtain initial \f$p_{i,k}\f$ .
-Often (including in the ml module) the k-means algorithm is used for that purpose.
-
-One of the main problems of the EM algorithm is the large number of parameters to estimate. The
-majority of the parameters reside in the covariance matrices, which contain \f$d \times d\f$
-elements each, where \f$d\f$ is the feature space dimensionality. However, in many practical
-problems, the covariance matrices are close to diagonal or even to \f$\mu_k*I\f$ , where \f$I\f$
-is an identity matrix and \f$\mu_k\f$ is a mixture-dependent "scale" parameter. So, a robust
-computation scheme could start with stronger constraints on the covariance matrices and then use
-the estimated parameters as an input for a less constrained optimization problem (often a diagonal
-covariance matrix is already a good enough approximation).
-
-@sa cv::ml::EM
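-
-A minimal, illustrative sketch (the cluster count, helper name and matrices are arbitrary) of
-fitting a Gaussian mixture with EM:
-
-@code{.cpp}
-#include <opencv2/core.hpp>
-#include <opencv2/ml.hpp>
-using namespace cv;
-
-// Fit a Gaussian mixture to 'samples' (floating-point, one d-dimensional sample per row).
-static Ptr<ml::EM> fitGaussianMixture(const Mat& samples, int nClusters)
-{
-    Ptr<ml::EM> em = ml::EM::create();
-    em->setClustersNumber(nClusters);
-    // Start with a constrained (diagonal) covariance model, as suggested above.
-    em->setCovarianceMatrixType(ml::EM::COV_MAT_DIAGONAL);
-    em->setTermCriteria(TermCriteria(TermCriteria::COUNT + TermCriteria::EPS, 300, 1e-6));
-
-    Mat logLikelihoods, labels, probs;   // per-sample outputs of the final E-step
-    em->trainEM(samples, logLikelihoods, labels, probs);
-    return em;
-}
-@endcode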
-
-References:
-- Bilmes98 J. A. Bilmes. _A Gentle Tutorial of the EM Algorithm and its Application to Parameter
-Estimation for Gaussian Mixture and Hidden Markov Models_. Technical Report TR-97-021,
-International Computer Science Institute and Computer Science Division, University of California
-at Berkeley, April 1998.
-
-Neural Networks {#ml_intro_ann}
-===============
-
-ML implements feed-forward artificial neural networks or, more particularly, multi-layer perceptrons
-(MLP), the most commonly used type of neural networks. MLP consists of the input layer, output
-layer, and one or more hidden layers. Each layer of MLP includes one or more neurons directionally
-linked with the neurons from the previous and the next layer. The example below represents a 3-layer
-perceptron with three inputs, two outputs, and the hidden layer including five neurons:
-
-
-
-All the neurons in MLP are similar. Each of them has several input links (it takes the output values
-from several neurons in the previous layer as input) and several output links (it passes the
-response to several neurons in the next layer). The values retrieved from the previous layer are
-summed up with certain weights, individual for each neuron, plus a bias term. The sum is
-transformed using the activation function \f$f\f$, which may also be different for different neurons.
-
-
-
-In other words, given the outputs \f$x_j\f$ of the layer \f$n\f$ , the outputs \f$y_i\f$ of the
-layer \f$n+1\f$ are computed as:
-
-\f[u_i = \sum _j (w^{n+1}_{i,j}*x_j) + w^{n+1}_{i,bias}\f]
-
-\f[y_i = f(u_i)\f]
-
-Different activation functions may be used. ML implements three standard functions:
-
-- Identity function ( cv::ml::ANN_MLP::IDENTITY ): \f$f(x)=x\f$
-
-- Symmetrical sigmoid ( cv::ml::ANN_MLP::SIGMOID_SYM ): \f$f(x)=\beta*(1-e^{-\alpha
-  x})/(1+e^{-\alpha x})\f$, which is the default choice for MLP. The standard sigmoid with
-  \f$\beta =1, \alpha =1\f$ is shown below:
-
- 
-
-- Gaussian function ( cv::ml::ANN_MLP::GAUSSIAN ): \f$f(x)=\beta e^{-\alpha x*x}\f$ , which is not
- completely supported at the moment.
-
-In ML, all the neurons have the same activation functions, with the same free parameters
-( \f$\alpha, \beta\f$ ) that are specified by the user and are not altered by the training algorithms.
-
-So, the whole trained network works as follows:
-
-1. Take the feature vector as input. The vector size is equal to the size of the input layer.
-2. Pass values as input to the first hidden layer.
-3. Compute outputs of the hidden layer using the weights and the activation functions.
-4. Pass outputs further downstream until you compute the output layer.
-
-So, to compute the network, you need to know all the weights \f$w^{n+1}_{i,j}\f$ . The weights are
-computed by the training algorithm. The algorithm takes a training set, multiple input vectors with
-the corresponding output vectors, and iteratively adjusts the weights to enable the network to give
-the desired response to the provided input vectors.
-
-The larger the network size (the number of hidden layers and their sizes), the greater the
-potential network flexibility. The error on the training set could be made arbitrarily small. But
-at the same time the learned network also "learns" the noise present in the training set, so the
-error on the test set usually starts increasing after the network size reaches a limit. Besides,
-larger networks are trained much longer than smaller ones, so it is reasonable to pre-process the
-data, using cv::PCA or a similar technique, and train a smaller network on only the essential
-features.
-
-Another MLP feature is an inability to handle categorical data as is. However, there is a
-workaround. If a certain feature in the input or output (in the case of an n-class classifier for
-\f$n>2\f$) layer is categorical and can take \f$M>2\f$ different values, it makes sense to
-represent it as a binary tuple of M elements, where the i-th element is 1 if and only if the
-feature is equal to the i-th value out of the M possible ones. It increases the size of the
-input/output layer but speeds up the training algorithm convergence and at the same time enables
-"fuzzy" values of such variables, that is, a tuple of probabilities instead of a fixed value.
-
-ML implements two algorithms for training MLPs. The first algorithm is a classical random
-sequential back-propagation algorithm. The second (default) one is a batch RPROP algorithm.
-
-@sa cv::ml::ANN_MLP
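-
-For illustration (the layer sizes, parameters and helper name are arbitrary), an MLP with one
-hidden layer could be set up and trained as follows:
-
-@code{.cpp}
-#include <opencv2/core.hpp>
-#include <opencv2/ml.hpp>
-using namespace cv;
-
-// Build and train a 2-5-1 perceptron with the symmetric sigmoid activation.
-// 'samples' is CV_32F (one sample per row); 'responses' is CV_32F with one row per sample.
-static Ptr<ml::ANN_MLP> trainMlp(const Mat& samples, const Mat& responses)
-{
-    Ptr<ml::ANN_MLP> mlp = ml::ANN_MLP::create();
-    Mat layers = (Mat_<int>(1, 3) << 2, 5, 1);                        // input, hidden, output sizes
-    mlp->setLayerSizes(layers);
-    mlp->setActivationFunction(ml::ANN_MLP::SIGMOID_SYM, 1.0, 1.0);   // alpha = beta = 1
-    mlp->setTrainMethod(ml::ANN_MLP::RPROP);                          // batch RPROP (the default)
-    mlp->setTermCriteria(TermCriteria(TermCriteria::MAX_ITER + TermCriteria::EPS, 1000, 1e-6));
-    mlp->train(samples, ml::ROW_SAMPLE, responses);
-    return mlp;
-}
-@endcode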
-
-Logistic Regression {#ml_intro_lr}
-===================
-
-ML implements logistic regression, which is a probabilistic classification technique. Logistic
-Regression is a binary classification algorithm which is closely related to Support Vector
-Machines (SVM). Like SVM, Logistic Regression can be extended to work on multi-class
-classification problems like digit recognition (i.e. recognizing digits like 0, 1, 2, 3, ... from
-the given images). This version of Logistic Regression supports both binary and multi-class
-classification (for multi-class it creates multiple 2-class classifiers). In order to train the
-logistic regression classifier, Batch Gradient Descent and Mini-Batch Gradient Descent algorithms
-are used. Logistic Regression is a discriminative classifier. It is implemented as a C++ class in
-cv::ml::LogisticRegression.
-
-In Logistic Regression, we try to optimize the training parameter \f$\theta\f$ such that the
-hypothesis \f$0 \leq h_\theta(x) \leq 1\f$ is achieved. We have \f$h_\theta(x) = g(\theta^T x)\f$,
-where \f$g(z) = \frac{1}{1+e^{-z}}\f$ is the logistic or sigmoid function. The term "Logistic" in
-Logistic Regression refers to this function. For given data of a binary classification problem of
-classes 0 and 1, one can determine that a given data instance belongs to class 1 if \f$h_\theta(x)
-\geq 0.5\f$ or to class 0 if \f$h_\theta(x) < 0.5\f$ .
-
-In Logistic Regression, choosing the right parameters is of utmost importance for reducing the
-training error and ensuring high training accuracy:
-
-- The learning rate can be set with the @ref cv::ml::LogisticRegression::setLearningRate "setLearningRate"
-  method. It determines how fast we approach the solution. It is a positive real number.
-
-- Optimization algorithms like Batch Gradient Descent and Mini-Batch Gradient Descent are supported
-  in LogisticRegression. The number of iterations these optimization algorithms have to run must
-  also be specified; it can be set with @ref cv::ml::LogisticRegression::setIterations "setIterations".
-  This parameter can be thought of as the number of steps taken, while the learning rate specifies
-  whether each step is long or short. This and the previous parameter define how fast we arrive at
-  a possible solution.
-
-- To compensate for overfitting, regularization is performed, which can be enabled with
-  @ref cv::ml::LogisticRegression::setRegularization "setRegularization". One can specify what
-  kind of regularization has to be performed by passing one of the @ref
-  cv::ml::LogisticRegression::RegKinds "regularization kinds" to this method.
-
-- The logistic regression implementation provides a choice of two training methods: Batch Gradient
-  Descent or Mini-Batch Gradient Descent. To specify which one is used, call @ref
-  cv::ml::LogisticRegression::setTrainMethod "setTrainMethod" with either @ref
-  cv::ml::LogisticRegression::BATCH "LogisticRegression::BATCH" or @ref
-  cv::ml::LogisticRegression::MINI_BATCH "LogisticRegression::MINI_BATCH". If the training method
-  is set to @ref cv::ml::LogisticRegression::MINI_BATCH "MINI_BATCH", the size of the mini-batch
-  has to be set to a positive integer with @ref cv::ml::LogisticRegression::setMiniBatchSize
-  "setMiniBatchSize".
-
-A sample set of training parameters for the Logistic Regression classifier can be initialized as follows:
-@snippet samples/cpp/logistic_regression.cpp init
-
-@sa cv::ml::LogisticRegression
diff --git a/modules/ml/doc/pics/SVM_Comparison.png b/modules/ml/doc/pics/SVM_Comparison.png
deleted file mode 100644
index 4bb3dababc..0000000000
Binary files a/modules/ml/doc/pics/SVM_Comparison.png and /dev/null differ
diff --git a/modules/ml/doc/pics/mlp.png b/modules/ml/doc/pics/mlp.png
deleted file mode 100644
index ce3392c454..0000000000
Binary files a/modules/ml/doc/pics/mlp.png and /dev/null differ
diff --git a/modules/ml/doc/pics/neuron_model.png b/modules/ml/doc/pics/neuron_model.png
deleted file mode 100644
index 635a531804..0000000000
Binary files a/modules/ml/doc/pics/neuron_model.png and /dev/null differ
diff --git a/modules/ml/doc/pics/sigmoid_bipolar.png b/modules/ml/doc/pics/sigmoid_bipolar.png
deleted file mode 100644
index d94a85031d..0000000000
Binary files a/modules/ml/doc/pics/sigmoid_bipolar.png and /dev/null differ
diff --git a/modules/ml/include/opencv2/ml.hpp b/modules/ml/include/opencv2/ml.hpp
deleted file mode 100644
index d537ab7759..0000000000
--- a/modules/ml/include/opencv2/ml.hpp
+++ /dev/null
@@ -1,1956 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2000, Intel Corporation, all rights reserved.
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Copyright (C) 2014, Itseez Inc, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef OPENCV_ML_HPP
-#define OPENCV_ML_HPP
-
-#ifdef __cplusplus
-# include "opencv2/core.hpp"
-#endif
-
-#ifdef __cplusplus
-
-#include
-#include