diff --git a/modules/ml/doc/mldata.rst b/modules/ml/doc/mldata.rst index 8c415b1e20..b09303701c 100644 --- a/modules/ml/doc/mldata.rst +++ b/modules/ml/doc/mldata.rst @@ -3,7 +3,7 @@ MLData .. highlight:: cpp -For the machine learning algorithms, the data set is often stored in a file of the ``.csv``-like format. The file contains a table of predictor and response values where each row of the table corresponds to a sample. Missing values are supported. The UC Irvine Machine Learning Repository (http://archive.ics.uci.edu/ml/) provides many data sets stored in such a format to the machine learning community. The class ``MLData`` is implemented to easily load the data for training one of the OpenCV machine learning algorithms. For float values, only the ``'.'`` separator is supported. +For the machine learning algorithms, the data set is often stored in a file of the ``.csv``-like format. The file contains a table of predictor and response values where each row of the table corresponds to a sample. Missing values are supported. The UC Irvine Machine Learning Repository (http://archive.ics.uci.edu/ml/) provides many data sets stored in such a format to the machine learning community. The class ``MLData`` is implemented to easily load the data for training one of the OpenCV machine learning algorithms. For float values, only the ``'.'`` separator is supported. The table can have a header; in that case, the user has to set the number of header lines so that they are skipped while the file is read. CvMLData -------- @@ -182,6 +182,20 @@ Sets the variables types in the loaded data. In the string, a variable type is followed by a list of variables indices. For example: ``"ord[0-17],cat[18]"``, ``"ord[0,2,4,10-12], cat[1,3,5-9,13,14]"``, ``"cat"`` (all variables are categorical), ``"ord"`` (all variables are ordered). +CvMLData::get_header_lines_number +--------------------------------- +Returns the number of header lines in the table. + +.. 
ocv:function:: int CvMLData::get_header_lines_number() const + +CvMLData::set_header_lines_number +--------------------------------- +Sets the number of header lines in the table. + +.. ocv:function:: void CvMLData::set_header_lines_number( int n ) + +By default, the table is assumed to have no header, i.e. it contains only data. + CvMLData::get_var_type ---------------------- Returns type of the specified variable diff --git a/modules/ml/include/opencv2/ml/ml.hpp b/modules/ml/include/opencv2/ml/ml.hpp index 6358fa7be5..24e4a4bc9f 100644 --- a/modules/ml/include/opencv2/ml/ml.hpp +++ b/modules/ml/include/opencv2/ml/ml.hpp @@ -2040,6 +2040,9 @@ public: const CvMat* get_responses(); const CvMat* get_missing() const; + void set_header_lines_number( int n ); + int get_header_lines_number() const; + void set_response_idx( int idx ); // old response become predictors, new response_idx = idx // if idx < 0 there will be no response int get_response_idx() const; @@ -2091,6 +2094,8 @@ protected: CvMat* var_idx_out; // mat CvMat* var_types_out; // mat + int header_lines_number; + int response_idx; int train_sample_count; diff --git a/modules/ml/src/data.cpp b/modules/ml/src/data.cpp index 2b44d384be..9bf4674c4a 100644 --- a/modules/ml/src/data.cpp +++ b/modules/ml/src/data.cpp @@ -71,6 +71,7 @@ CvMLData::CvMLData() { values = missing = var_types = var_idx_mask = response_out = var_idx_out = var_types_out = 0; train_sample_idx = test_sample_idx = 0; + header_lines_number = 0; sample_idx = 0; response_idx = -1; @@ -117,6 +118,19 @@ void CvMLData::clear() train_sample_count = -1; } + +// Stores the number of header lines that read_csv() skips; negative values are clamped to 0. +void CvMLData::set_header_lines_number( int n ) +{ + header_lines_number = std::max(0, n); +} + +// Returns the number of header lines that read_csv() skips. +int CvMLData::get_header_lines_number() const +{ + return header_lines_number; +} + static char *fgets_chomp(char *str, int n, FILE *stream) { char *head = fgets(str, n, stream); @@ -153,9 +167,18 @@ int CvMLData::read_csv(const char* filename) if( !file ) return -1; - // read the first 
line and determine the number of variables - std::vector<char> _buf(M); + std::vector<char> _buf(M); char* buf = &_buf[0]; + + // skip header lines; close the file before bailing out if it ends inside the header + for( int i = 0; i < header_lines_number; i++ ) + if( fgets( buf, M, file ) == 0 ) + { + fclose(file); + return -1; + } + + // read the first data line and determine the number of variables if( !fgets_chomp( buf, M, file )) { fclose(file);