tesseract/src/training/commontraining.cpp

853 lines
29 KiB
C++
Raw Normal View History

// Copyright 2008 Google Inc. All Rights Reserved.
// Author: scharron@google.com (Samuel Charron)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#define _USE_MATH_DEFINES // for M_PI
#include "commontraining.h"
#include <algorithm>
#include <cmath> // for M_PI
#ifdef DISABLED_LEGACY_ENGINE
#include "params.h"
#include "tessopt.h"
#include "tprintf.h"
// Command-line flags of the training tools for the DISABLED_LEGACY_ENGINE
// build. Each macro invocation takes (flag name, default value, help text).
// NOTE(review): the ParseArguments() in this branch only forwards the
// command line to tesseract::ParseCommandLineFlags and does not read any of
// these flags itself; presumably they are kept so the same options are still
// accepted - confirm against the callers.
INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging");
INT_PARAM_FLAG(load_images, 0, "Load images with tr files");
STRING_PARAM_FLAG(configfile, "", "File to load more configs from");
STRING_PARAM_FLAG(D, "", "Directory to write output files to");
STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
STRING_PARAM_FLAG(X, "", "File listing font xheights");
STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
STRING_PARAM_FLAG(O, "", "File to write unicharset to");
STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
/**
 * Parses the command line of a training tool in the DISABLED_LEGACY_ENGINE
 * build.
 *
 * A usage string of the form
 * "<prog> -v | --version | <prog> [.tr files ...]" is assembled (only
 * " [.tr files ...]" when *argc is zero) and handed, together with the
 * arguments, to tesseract::ParseCommandLineFlags with its last argument set
 * to true. No other state is touched by this variant.
 *
 * @param argc pointer to the number of command line arguments to parse
 * @param argv pointer to the command line argument vector
 * @note The original documentation states that illegal options terminate
 *       the program; any such handling lives in ParseCommandLineFlags.
 */
void ParseArguments(int* argc, char ***argv) {
  STRING usage;
  if (*argc != 0) {
    // argv[0] is the program name; it appears twice in the usage text.
    const char* program_name = (*argv)[0];
    usage += program_name;
    usage += " -v | --version | ";
    usage += program_name;
  }
  usage += " [.tr files ...]";
  tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
}
#else
#include "allheaders.h"
#include "ccutil.h"
#include "classify.h"
#include "cluster.h"
#include "clusttool.h"
#include "emalloc.h"
#include "featdefs.h"
#include "fontinfo.h"
#include "intfeaturespace.h"
#include "mastertrainer.h"
#include "mf.h"
#include "oldlist.h"
#include "params.h"
#include "shapetable.h"
#include "tessdatamanager.h"
#include "tessopt.h"
#include "tprintf.h"
#include "unicity_table.h"
using tesseract::CCUtil;
using tesseract::IntFeatureSpace;
using tesseract::ParamUtils;
using tesseract::ShapeTable;
// Global Variables.
// global variable to hold configuration parameters to control clustering
// -M 0.625 -B 0.05 -I 1.0 -C 1e-6.
// Its MinSamples, MaxIllegal, Independence and Confidence members supply the
// defaults of the clusterconfig_* flags below and are overwritten (clamped
// to [0, 1]) by ParseArguments().
// NOTE(review): the initializer order is presumably (prototype style,
// MinSamples, MaxIllegal, Independence, Confidence, <one more field>) -
// confirm against the CLUSTERCONFIG declaration.
CLUSTERCONFIG Config = { elliptical, 0.625, 0.05, 1.0, 1e-6, 0 };
// Feature definitions shared by the training tools; initialized by
// InitFeatureDefs() at the start of LoadTrainingData().
FEATURE_DEFS_STRUCT feature_defs;
// File-local CCUtil instance; ParseArguments() loads the --configfile
// parameters into ccutil.params().
static CCUtil ccutil;
// Command-line flags. Each macro invocation takes (flag name, default value,
// help text); the code in this file reads them as FLAGS_<name>.
// Passed to the MasterTrainer constructor in LoadTrainingData().
INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging");
// When non-zero, LoadTrainingData() also loads the .tif image of each .tr file.
static INT_PARAM_FLAG(load_images, 0, "Load images with tr files");
// Optional params file read by ParseArguments().
static STRING_PARAM_FLAG(configfile, "", "File to load more configs from");
// Output directory; becomes the file prefix computed in LoadTrainingData().
STRING_PARAM_FLAG(D, "", "Directory to write output files to");
STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
STRING_PARAM_FLAG(X, "", "File listing font xheights");
STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
STRING_PARAM_FLAG(O, "", "File to write unicharset to");
STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
// Clustering parameters; their defaults are taken from Config above and the
// parsed values are written back into Config by ParseArguments().
static DOUBLE_PARAM_FLAG(clusterconfig_min_samples_fraction, Config.MinSamples,
"Min number of samples per proto as % of total");
static DOUBLE_PARAM_FLAG(clusterconfig_max_illegal, Config.MaxIllegal,
"Max percentage of samples in a cluster which have more"
" than 1 feature in that cluster");
static DOUBLE_PARAM_FLAG(clusterconfig_independence, Config.Independence,
"Desired independence between dimensions");
static DOUBLE_PARAM_FLAG(clusterconfig_confidence, Config.Confidence,
"Desired confidence in prototypes created");
Doxygen Squashed commit from https://github.com/tesseract-ocr/tesseract/tree/more-doxygen closes #14 Commits: 6317305 doxygen 9f42f69 doxygen 0fc4d52 doxygen 37b4b55 fix typo bded8f1 some more doxy 020eb00 slight tweak 524666d doxygenify 2a36a3e doxygenify 229d218 doxygenify 7fd28ae doxygenify a8c64bc doxygenify f5d21b6 fix 5d8ede8 doxygenify a58a4e0 language_model.cpp fa85709 lm_pain_points.cpp lm_state.cpp 6418da3 merge 06190ba Merge branch 'old_doxygen_merge' into more-doxygen 84acf08 Merge branch 'master' into more-doxygen 50fe1ff pagewalk.cpp cube_reco_context.cpp 2982583 change to relative 192a24a applybox.cpp, take one 8eeb053 delete docs for obsolete params 52e4c77 modernise classify/ocrfeatures.cpp 2a1cba6 modernise cutil/emalloc.cpp 773e006 silence doxygen warning aeb1731 silence doxygen warning f18387f silence doxygen; new params are unused? 15ad6bd doxygenify cutil/efio.cpp c8b5dad doxygenify cutil/danerror.cpp 784450f the globals and exceptions parts are obsolete; remove 8bca324 doxygen classify/normfeat.cpp 9bcbe16 doxygen classify/normmatch.cpp aa9a971 doxygen ccmain/cube_control.cpp c083ff2 doxygen ccmain/cube_reco_context.cpp f842850 params changed 5c94f12 doxygen ccmain/cubeclassifier.cpp 15ba750 case sensitive f5c71d4 case sensitive f85655b doxygen classify/intproto.cpp 4bbc7aa partial doxygen classify/mfx.cpp dbb6041 partial doxygen classify/intproto.cpp 2aa72db finish doxygen classify/intproto.cpp 0b8de99 doxygen training/mftraining.cpp 0b5b35c partial doxygen ccstruct/coutln.cpp b81c766 partial doxygen ccstruct/coutln.cpp 40fc415 finished? 
doxygen ccstruct/coutln.cpp 6e4165c doxygen classify/clusttool.cpp 0267dec doxygen classify/cutoffs.cpp 7f0c70c doxygen classify/fpoint.cpp 512f3bd ignore ~ files 5668a52 doxygen classify/intmatcher.cpp 84788d4 doxygen classify/kdtree.cpp 29f36ca doxygen classify/mfoutline.cpp 40b94b1 silence doxygen warnings 6c511b9 doxygen classify/mfx.cpp f9b4080 doxygen classify/outfeat.cpp aa1df05 doxygen classify/picofeat.cpp cc5f466 doxygen training/cntraining.cpp cce044f doxygen training/commontraining.cpp 167e216 missing param 9498383 renamed params 37eeac2 renamed param d87b5dd case c8ee174 renamed params b858db8 typo 4c2a838 h2 context? 81a2c0c fix some param names; add some missing params, no docs bcf8a4c add some missing params, no docs af77f86 add some missing params, no docs; fix some param names 01df24e fix some params 6161056 fix some params 68508b6 fix some params 285aeb6 doxygen complains here no matter what 529bcfa rm some missing params, typos cd21226 rm some missing params, add some new ones 48a4bc2 fix params c844628 missing param 312ce37 missing param; rename one ec2fdec missing param 05e15e0 missing params d515858 change "<" to &lt; to make doxygen happy b476a28 wrong place
2014-09-13 04:41:19 +08:00
/**
 * Parses the command line of a training tool and copies the results into
 * the training-related globals.
 *
 * Globals written:
 * - tessoptind: set to 1, the index of the first non-flag argument, because
 *   the flags are removed from argv while parsing.
 * - Config: MinSamples, MaxIllegal, Independence and Confidence are taken
 *   from the clusterconfig_* flags, each limited to the range [0, 1].
 * - ccutil.params(): receives the parameters read from the file named by
 *   --configfile, when that flag is not empty.
 *
 * @param argc pointer to the number of command line arguments to parse
 * @param argv pointer to the command line argument vector
 */
void ParseArguments(int* argc, char ***argv) {
  STRING usage;
  if (*argc != 0) {
    // argv[0] is the program name; it appears twice in the usage text.
    const char* program_name = (*argv)[0];
    usage += program_name;
    usage += " -v | --version | ";
    usage += program_name;
  }
  usage += " [.tr files ...]";
  tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);

  // remove_flags was true above, so the first non-flag argument is now at
  // index 1.
  tessoptind = 1;

  // Limits a flag value to the closed interval [0, 1].
  const auto clamp_to_unit = [](double value) {
    return std::max(0.0, std::min(1.0, value));
  };

  // Transfer the clustering flags into the global clustering configuration.
  Config.MinSamples = clamp_to_unit(
      static_cast<double>(FLAGS_clusterconfig_min_samples_fraction));
  Config.MaxIllegal =
      clamp_to_unit(static_cast<double>(FLAGS_clusterconfig_max_illegal));
  Config.Independence =
      clamp_to_unit(static_cast<double>(FLAGS_clusterconfig_independence));
  Config.Confidence =
      clamp_to_unit(static_cast<double>(FLAGS_clusterconfig_confidence));

  // Nothing more to do unless an extra config file was requested.
  if (FLAGS_configfile.empty()) {
    return;
  }
  tesseract::ParamUtils::ReadParamsFile(
      FLAGS_configfile.c_str(),
      tesseract::SET_PARAM_CONSTRAINT_NON_INIT_ONLY,
      ccutil.params());
}
namespace tesseract {
// Reads the shape table stored in the file named file_prefix followed by
// kShapeTableFileSuffix.
// Returns a newly allocated ShapeTable that the caller must delete, or
// nullptr if the file cannot be opened (a warning is printed) or cannot be
// deserialized (an error is printed).
ShapeTable* LoadShapeTable(const STRING& file_prefix) {
  STRING table_path = file_prefix;
  table_path += kShapeTableFileSuffix;

  TFile table_file;
  if (!table_file.Open(table_path.string(), nullptr)) {
    tprintf("Warning: No shape table file present: %s\n",
            table_path.string());
    return nullptr;
  }

  ShapeTable* loaded_table = new ShapeTable;
  if (!loaded_table->DeSerialize(&table_file)) {
    delete loaded_table;
    tprintf("Error: Failed to read shape table %s\n",
            table_path.string());
    return nullptr;
  }

  tprintf("Read shape table %s of %d shapes\n",
          table_path.string(), loaded_table->NumShapes());
  return loaded_table;
}
// Helper to write the shape_table.
void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table) {
STRING shape_table_file = file_prefix;
shape_table_file += kShapeTableFileSuffix;
FILE* fp = fopen(shape_table_file.string(), "wb");
if (fp != nullptr) {
if (!shape_table.Serialize(fp)) {
fprintf(stderr, "Error writing shape table: %s\n",
shape_table_file.string());
}
fclose(fp);
} else {
fprintf(stderr, "Error creating shape table: %s\n",
shape_table_file.string());
}
}
Doxygen Squashed commit from https://github.com/tesseract-ocr/tesseract/tree/more-doxygen closes #14 Commits: 6317305 doxygen 9f42f69 doxygen 0fc4d52 doxygen 37b4b55 fix typo bded8f1 some more doxy 020eb00 slight tweak 524666d doxygenify 2a36a3e doxygenify 229d218 doxygenify 7fd28ae doxygenify a8c64bc doxygenify f5d21b6 fix 5d8ede8 doxygenify a58a4e0 language_model.cpp fa85709 lm_pain_points.cpp lm_state.cpp 6418da3 merge 06190ba Merge branch 'old_doxygen_merge' into more-doxygen 84acf08 Merge branch 'master' into more-doxygen 50fe1ff pagewalk.cpp cube_reco_context.cpp 2982583 change to relative 192a24a applybox.cpp, take one 8eeb053 delete docs for obsolete params 52e4c77 modernise classify/ocrfeatures.cpp 2a1cba6 modernise cutil/emalloc.cpp 773e006 silence doxygen warning aeb1731 silence doxygen warning f18387f silence doxygen; new params are unused? 15ad6bd doxygenify cutil/efio.cpp c8b5dad doxygenify cutil/danerror.cpp 784450f the globals and exceptions parts are obsolete; remove 8bca324 doxygen classify/normfeat.cpp 9bcbe16 doxygen classify/normmatch.cpp aa9a971 doxygen ccmain/cube_control.cpp c083ff2 doxygen ccmain/cube_reco_context.cpp f842850 params changed 5c94f12 doxygen ccmain/cubeclassifier.cpp 15ba750 case sensitive f5c71d4 case sensitive f85655b doxygen classify/intproto.cpp 4bbc7aa partial doxygen classify/mfx.cpp dbb6041 partial doxygen classify/intproto.cpp 2aa72db finish doxygen classify/intproto.cpp 0b8de99 doxygen training/mftraining.cpp 0b5b35c partial doxygen ccstruct/coutln.cpp b81c766 partial doxygen ccstruct/coutln.cpp 40fc415 finished? 
doxygen ccstruct/coutln.cpp 6e4165c doxygen classify/clusttool.cpp 0267dec doxygen classify/cutoffs.cpp 7f0c70c doxygen classify/fpoint.cpp 512f3bd ignore ~ files 5668a52 doxygen classify/intmatcher.cpp 84788d4 doxygen classify/kdtree.cpp 29f36ca doxygen classify/mfoutline.cpp 40b94b1 silence doxygen warnings 6c511b9 doxygen classify/mfx.cpp f9b4080 doxygen classify/outfeat.cpp aa1df05 doxygen classify/picofeat.cpp cc5f466 doxygen training/cntraining.cpp cce044f doxygen training/commontraining.cpp 167e216 missing param 9498383 renamed params 37eeac2 renamed param d87b5dd case c8ee174 renamed params b858db8 typo 4c2a838 h2 context? 81a2c0c fix some param names; add some missing params, no docs bcf8a4c add some missing params, no docs af77f86 add some missing params, no docs; fix some param names 01df24e fix some params 6161056 fix some params 68508b6 fix some params 285aeb6 doxygen complains here no matter what 529bcfa rm some missing params, typos cd21226 rm some missing params, add some new ones 48a4bc2 fix params c844628 missing param 312ce37 missing param; rename one ec2fdec missing param 05e15e0 missing params d515858 change "<" to &lt; to make doxygen happy b476a28 wrong place
2014-09-13 04:41:19 +08:00
/**
 * Creates a MasterTrainer and loads the training data into it:
 * - Initializes feature_defs and IntegerFX.
 * - Sets *file_prefix to the -D directory followed by "/", or to "" when -D
 *   is empty.
 * - Loads the shape_table (from *file_prefix) if shape_table != nullptr.
 * - Loads initial unicharset from -U command-line option.
 * - Loads font info from -F option (when not empty).
 * - Loads xheights from -X option (when not empty).
 * - Loads samples from .tr files in remaining command-line args. For each
 *   file the trailing two characters ("tr") of its name are replaced by
 *   "fontinfo" to find the font spacing file and, if --load_images is set,
 *   by "tif" to find the page image.
 * - Deletes outliers and computes canonical samples.
 * - If FLAGS_output_trainer is set, saves the trainer for future use.
 *   TODO: Who uses that? There is currently no code which reads it.
 * - Computes canonical and cloud features.
 * - If -O is not empty, saves the unicharset to that file.
 * - If shape_table is not nullptr, but failed to load, make a fake flat one,
 *   as shape clustering was not run.
 *
 * @param argc number of command line arguments, forwarded to GetNextFilename
 * @param argv command line arguments, forwarded to GetNextFilename
 * @param replication forwarded to the MasterTrainer constructor
 * @param shape_table if not nullptr, receives the loaded (or flat fallback)
 *        shape table; nullptr means that shape clustering is being done
 * @param file_prefix receives the output file prefix derived from -D
 * @return the new MasterTrainer (to be deleted by the caller), or nullptr if
 *         the font info, the xheights or the output unicharset could not be
 *         processed. On these failure paths a shape table already stored in
 *         *shape_table is not released here.
 */
MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
                                bool replication,
                                ShapeTable** shape_table,
                                STRING* file_prefix) {
  InitFeatureDefs(&feature_defs);
  InitIntegerFX();
  *file_prefix = "";
  if (!FLAGS_D.empty()) {
    *file_prefix += FLAGS_D.c_str();
    *file_prefix += "/";
  }
  // If we are shape clustering (nullptr shape_table) or we successfully load
  // a shape_table written by a previous shape clustering, then
  // shape_analysis will be true, meaning that the MasterTrainer will replace
  // some members of the unicharset with their fragments.
  bool shape_analysis = false;
  if (shape_table != nullptr) {
    *shape_table = LoadShapeTable(*file_prefix);
    if (*shape_table != nullptr) shape_analysis = true;
  } else {
    shape_analysis = true;
  }
  MasterTrainer* trainer = new MasterTrainer(NM_CHAR_ANISOTROPIC,
                                             shape_analysis,
                                             replication,
                                             FLAGS_debug_level);
  IntFeatureSpace fs;
  fs.Init(kBoostXYBuckets, kBoostXYBuckets, kBoostDirBuckets);
  trainer->LoadUnicharset(FLAGS_U.c_str());
  // Get basic font information from font_properties.
  if (!FLAGS_F.empty()) {
    if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
      delete trainer;
      return nullptr;
    }
  }
  if (!FLAGS_X.empty()) {
    if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
      delete trainer;
      return nullptr;
    }
  }
  trainer->SetFeatureSpace(fs);
  const char* page_name;
  // Load training data from .tr files on the command line.
  while ((page_name = GetNextFilename(argc, argv)) != nullptr) {
    tprintf("Reading %s ...\n", page_name);
    trainer->ReadTrainingSamples(page_name, feature_defs, false);

    // Strip the trailing "tr" once; the stem is shared by the derived
    // fontinfo and image file names. The length is clamped at zero so that
    // a name shorter than two characters cannot yield a negative length.
    STRING page_stem = page_name;
    int stem_length = page_stem.length() - 2;
    if (stem_length < 0) stem_length = 0;
    page_stem.truncate_at(stem_length);

    // If there is a file with [lang].[fontname].exp[num].fontinfo present,
    // read font spacing information in to fontinfo_table.
    STRING fontinfo_file_name = page_stem;
    fontinfo_file_name += "fontinfo";
    trainer->AddSpacingInfo(fontinfo_file_name.string());

    // Load the images into memory if required by the classifier.
    if (FLAGS_load_images) {
      // Replace the tr with tif. Extension must be tif!
      STRING image_name = page_stem;
      image_name += "tif";
      trainer->LoadPageImages(image_name.string());
    }
  }
  trainer->PostLoadCleanup();
  // Write the master trainer if required.
  if (!FLAGS_output_trainer.empty()) {
    FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
    if (fp == nullptr) {
      tprintf("Can't create saved trainer data!\n");
    } else {
      trainer->Serialize(fp);
      fclose(fp);
    }
  }
  trainer->PreTrainingSetup();
  if (!FLAGS_O.empty() &&
      !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
    fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
    delete trainer;
    return nullptr;
  }
  if (shape_table != nullptr) {
    // If we previously failed to load a shapetable, then shape clustering
    // wasn't run so make a flat one now.
    if (*shape_table == nullptr) {
      *shape_table = new ShapeTable;
      trainer->SetupFlatShapeTable(*shape_table);
      tprintf("Flat shape table summary: %s\n",
              (*shape_table)->SummaryStr().string());
    }
    (*shape_table)->set_unicharset(trainer->unicharset());
  }
  return trainer;
}
} // namespace tesseract.
/*---------------------------------------------------------------------------*/
Doxygen Squashed commit from https://github.com/tesseract-ocr/tesseract/tree/more-doxygen closes #14 Commits: 6317305 doxygen 9f42f69 doxygen 0fc4d52 doxygen 37b4b55 fix typo bded8f1 some more doxy 020eb00 slight tweak 524666d doxygenify 2a36a3e doxygenify 229d218 doxygenify 7fd28ae doxygenify a8c64bc doxygenify f5d21b6 fix 5d8ede8 doxygenify a58a4e0 language_model.cpp fa85709 lm_pain_points.cpp lm_state.cpp 6418da3 merge 06190ba Merge branch 'old_doxygen_merge' into more-doxygen 84acf08 Merge branch 'master' into more-doxygen 50fe1ff pagewalk.cpp cube_reco_context.cpp 2982583 change to relative 192a24a applybox.cpp, take one 8eeb053 delete docs for obsolete params 52e4c77 modernise classify/ocrfeatures.cpp 2a1cba6 modernise cutil/emalloc.cpp 773e006 silence doxygen warning aeb1731 silence doxygen warning f18387f silence doxygen; new params are unused? 15ad6bd doxygenify cutil/efio.cpp c8b5dad doxygenify cutil/danerror.cpp 784450f the globals and exceptions parts are obsolete; remove 8bca324 doxygen classify/normfeat.cpp 9bcbe16 doxygen classify/normmatch.cpp aa9a971 doxygen ccmain/cube_control.cpp c083ff2 doxygen ccmain/cube_reco_context.cpp f842850 params changed 5c94f12 doxygen ccmain/cubeclassifier.cpp 15ba750 case sensitive f5c71d4 case sensitive f85655b doxygen classify/intproto.cpp 4bbc7aa partial doxygen classify/mfx.cpp dbb6041 partial doxygen classify/intproto.cpp 2aa72db finish doxygen classify/intproto.cpp 0b8de99 doxygen training/mftraining.cpp 0b5b35c partial doxygen ccstruct/coutln.cpp b81c766 partial doxygen ccstruct/coutln.cpp 40fc415 finished? 
doxygen ccstruct/coutln.cpp 6e4165c doxygen classify/clusttool.cpp 0267dec doxygen classify/cutoffs.cpp 7f0c70c doxygen classify/fpoint.cpp 512f3bd ignore ~ files 5668a52 doxygen classify/intmatcher.cpp 84788d4 doxygen classify/kdtree.cpp 29f36ca doxygen classify/mfoutline.cpp 40b94b1 silence doxygen warnings 6c511b9 doxygen classify/mfx.cpp f9b4080 doxygen classify/outfeat.cpp aa1df05 doxygen classify/picofeat.cpp cc5f466 doxygen training/cntraining.cpp cce044f doxygen training/commontraining.cpp 167e216 missing param 9498383 renamed params 37eeac2 renamed param d87b5dd case c8ee174 renamed params b858db8 typo 4c2a838 h2 context? 81a2c0c fix some param names; add some missing params, no docs bcf8a4c add some missing params, no docs af77f86 add some missing params, no docs; fix some param names 01df24e fix some params 6161056 fix some params 68508b6 fix some params 285aeb6 doxygen complains here no matter what 529bcfa rm some missing params, typos cd21226 rm some missing params, add some new ones 48a4bc2 fix params c844628 missing param 312ce37 missing param; rename one ec2fdec missing param 05e15e0 missing params d515858 change "<" to &lt; to make doxygen happy b476a28 wrong place
2014-09-13 04:41:19 +08:00
/**
 * Hands out the command line arguments one at a time, in order.
 *
 * Each call returns the argument at the current value of the global
 * tessoptind and then advances tessoptind by one. Once tessoptind has
 * reached argc, nullptr is returned and tessoptind is left unchanged.
 * Intended to be called only after the option arguments have been parsed
 * and removed with ParseArguments.
 *
 * Globals:
 * - tessoptind index of the next argument to return; read and incremented
 * @param argc number of entries in argv
 * @param argv command line arguments
 * @return Next command line argument or nullptr.
 */
const char *GetNextFilename(int argc, const char* const * argv) {
  // The post-increment only happens on the branch that returns an argument.
  return (tessoptind < argc) ? argv[tessoptind++] : nullptr;
} /* GetNextFilename */
/*---------------------------------------------------------------------------*/
Doxygen Squashed commit from https://github.com/tesseract-ocr/tesseract/tree/more-doxygen closes #14 Commits: 6317305 doxygen 9f42f69 doxygen 0fc4d52 doxygen 37b4b55 fix typo bded8f1 some more doxy 020eb00 slight tweak 524666d doxygenify 2a36a3e doxygenify 229d218 doxygenify 7fd28ae doxygenify a8c64bc doxygenify f5d21b6 fix 5d8ede8 doxygenify a58a4e0 language_model.cpp fa85709 lm_pain_points.cpp lm_state.cpp 6418da3 merge 06190ba Merge branch 'old_doxygen_merge' into more-doxygen 84acf08 Merge branch 'master' into more-doxygen 50fe1ff pagewalk.cpp cube_reco_context.cpp 2982583 change to relative 192a24a applybox.cpp, take one 8eeb053 delete docs for obsolete params 52e4c77 modernise classify/ocrfeatures.cpp 2a1cba6 modernise cutil/emalloc.cpp 773e006 silence doxygen warning aeb1731 silence doxygen warning f18387f silence doxygen; new params are unused? 15ad6bd doxygenify cutil/efio.cpp c8b5dad doxygenify cutil/danerror.cpp 784450f the globals and exceptions parts are obsolete; remove 8bca324 doxygen classify/normfeat.cpp 9bcbe16 doxygen classify/normmatch.cpp aa9a971 doxygen ccmain/cube_control.cpp c083ff2 doxygen ccmain/cube_reco_context.cpp f842850 params changed 5c94f12 doxygen ccmain/cubeclassifier.cpp 15ba750 case sensitive f5c71d4 case sensitive f85655b doxygen classify/intproto.cpp 4bbc7aa partial doxygen classify/mfx.cpp dbb6041 partial doxygen classify/intproto.cpp 2aa72db finish doxygen classify/intproto.cpp 0b8de99 doxygen training/mftraining.cpp 0b5b35c partial doxygen ccstruct/coutln.cpp b81c766 partial doxygen ccstruct/coutln.cpp 40fc415 finished? 
doxygen ccstruct/coutln.cpp 6e4165c doxygen classify/clusttool.cpp 0267dec doxygen classify/cutoffs.cpp 7f0c70c doxygen classify/fpoint.cpp 512f3bd ignore ~ files 5668a52 doxygen classify/intmatcher.cpp 84788d4 doxygen classify/kdtree.cpp 29f36ca doxygen classify/mfoutline.cpp 40b94b1 silence doxygen warnings 6c511b9 doxygen classify/mfx.cpp f9b4080 doxygen classify/outfeat.cpp aa1df05 doxygen classify/picofeat.cpp cc5f466 doxygen training/cntraining.cpp cce044f doxygen training/commontraining.cpp 167e216 missing param 9498383 renamed params 37eeac2 renamed param d87b5dd case c8ee174 renamed params b858db8 typo 4c2a838 h2 context? 81a2c0c fix some param names; add some missing params, no docs bcf8a4c add some missing params, no docs af77f86 add some missing params, no docs; fix some param names 01df24e fix some params 6161056 fix some params 68508b6 fix some params 285aeb6 doxygen complains here no matter what 529bcfa rm some missing params, typos cd21226 rm some missing params, add some new ones 48a4bc2 fix params c844628 missing param 312ce37 missing param; rename one ec2fdec missing param 05e15e0 missing params d515858 change "<" to &lt; to make doxygen happy b476a28 wrong place
2014-09-13 04:41:19 +08:00
/**
* This routine searches through a list of labeled lists to find
Doxygen Squashed commit from https://github.com/tesseract-ocr/tesseract/tree/more-doxygen closes #14 Commits: 6317305 doxygen 9f42f69 doxygen 0fc4d52 doxygen 37b4b55 fix typo bded8f1 some more doxy 020eb00 slight tweak 524666d doxygenify 2a36a3e doxygenify 229d218 doxygenify 7fd28ae doxygenify a8c64bc doxygenify f5d21b6 fix 5d8ede8 doxygenify a58a4e0 language_model.cpp fa85709 lm_pain_points.cpp lm_state.cpp 6418da3 merge 06190ba Merge branch 'old_doxygen_merge' into more-doxygen 84acf08 Merge branch 'master' into more-doxygen 50fe1ff pagewalk.cpp cube_reco_context.cpp 2982583 change to relative 192a24a applybox.cpp, take one 8eeb053 delete docs for obsolete params 52e4c77 modernise classify/ocrfeatures.cpp 2a1cba6 modernise cutil/emalloc.cpp 773e006 silence doxygen warning aeb1731 silence doxygen warning f18387f silence doxygen; new params are unused? 15ad6bd doxygenify cutil/efio.cpp c8b5dad doxygenify cutil/danerror.cpp 784450f the globals and exceptions parts are obsolete; remove 8bca324 doxygen classify/normfeat.cpp 9bcbe16 doxygen classify/normmatch.cpp aa9a971 doxygen ccmain/cube_control.cpp c083ff2 doxygen ccmain/cube_reco_context.cpp f842850 params changed 5c94f12 doxygen ccmain/cubeclassifier.cpp 15ba750 case sensitive f5c71d4 case sensitive f85655b doxygen classify/intproto.cpp 4bbc7aa partial doxygen classify/mfx.cpp dbb6041 partial doxygen classify/intproto.cpp 2aa72db finish doxygen classify/intproto.cpp 0b8de99 doxygen training/mftraining.cpp 0b5b35c partial doxygen ccstruct/coutln.cpp b81c766 partial doxygen ccstruct/coutln.cpp 40fc415 finished? 
doxygen ccstruct/coutln.cpp 6e4165c doxygen classify/clusttool.cpp 0267dec doxygen classify/cutoffs.cpp 7f0c70c doxygen classify/fpoint.cpp 512f3bd ignore ~ files 5668a52 doxygen classify/intmatcher.cpp 84788d4 doxygen classify/kdtree.cpp 29f36ca doxygen classify/mfoutline.cpp 40b94b1 silence doxygen warnings 6c511b9 doxygen classify/mfx.cpp f9b4080 doxygen classify/outfeat.cpp aa1df05 doxygen classify/picofeat.cpp cc5f466 doxygen training/cntraining.cpp cce044f doxygen training/commontraining.cpp 167e216 missing param 9498383 renamed params 37eeac2 renamed param d87b5dd case c8ee174 renamed params b858db8 typo 4c2a838 h2 context? 81a2c0c fix some param names; add some missing params, no docs bcf8a4c add some missing params, no docs af77f86 add some missing params, no docs; fix some param names 01df24e fix some params 6161056 fix some params 68508b6 fix some params 285aeb6 doxygen complains here no matter what 529bcfa rm some missing params, typos cd21226 rm some missing params, add some new ones 48a4bc2 fix params c844628 missing param 312ce37 missing param; rename one ec2fdec missing param 05e15e0 missing params d515858 change "<" to &lt; to make doxygen happy b476a28 wrong place
2014-09-13 04:41:19 +08:00
* a list with the specified label. If a matching labeled list
* cannot be found, nullptr is returned.
Doxygen Squashed commit from https://github.com/tesseract-ocr/tesseract/tree/more-doxygen closes #14 Commits: 6317305 doxygen 9f42f69 doxygen 0fc4d52 doxygen 37b4b55 fix typo bded8f1 some more doxy 020eb00 slight tweak 524666d doxygenify 2a36a3e doxygenify 229d218 doxygenify 7fd28ae doxygenify a8c64bc doxygenify f5d21b6 fix 5d8ede8 doxygenify a58a4e0 language_model.cpp fa85709 lm_pain_points.cpp lm_state.cpp 6418da3 merge 06190ba Merge branch 'old_doxygen_merge' into more-doxygen 84acf08 Merge branch 'master' into more-doxygen 50fe1ff pagewalk.cpp cube_reco_context.cpp 2982583 change to relative 192a24a applybox.cpp, take one 8eeb053 delete docs for obsolete params 52e4c77 modernise classify/ocrfeatures.cpp 2a1cba6 modernise cutil/emalloc.cpp 773e006 silence doxygen warning aeb1731 silence doxygen warning f18387f silence doxygen; new params are unused? 15ad6bd doxygenify cutil/efio.cpp c8b5dad doxygenify cutil/danerror.cpp 784450f the globals and exceptions parts are obsolete; remove 8bca324 doxygen classify/normfeat.cpp 9bcbe16 doxygen classify/normmatch.cpp aa9a971 doxygen ccmain/cube_control.cpp c083ff2 doxygen ccmain/cube_reco_context.cpp f842850 params changed 5c94f12 doxygen ccmain/cubeclassifier.cpp 15ba750 case sensitive f5c71d4 case sensitive f85655b doxygen classify/intproto.cpp 4bbc7aa partial doxygen classify/mfx.cpp dbb6041 partial doxygen classify/intproto.cpp 2aa72db finish doxygen classify/intproto.cpp 0b8de99 doxygen training/mftraining.cpp 0b5b35c partial doxygen ccstruct/coutln.cpp b81c766 partial doxygen ccstruct/coutln.cpp 40fc415 finished? 
doxygen ccstruct/coutln.cpp 6e4165c doxygen classify/clusttool.cpp 0267dec doxygen classify/cutoffs.cpp 7f0c70c doxygen classify/fpoint.cpp 512f3bd ignore ~ files 5668a52 doxygen classify/intmatcher.cpp 84788d4 doxygen classify/kdtree.cpp 29f36ca doxygen classify/mfoutline.cpp 40b94b1 silence doxygen warnings 6c511b9 doxygen classify/mfx.cpp f9b4080 doxygen classify/outfeat.cpp aa1df05 doxygen classify/picofeat.cpp cc5f466 doxygen training/cntraining.cpp cce044f doxygen training/commontraining.cpp 167e216 missing param 9498383 renamed params 37eeac2 renamed param d87b5dd case c8ee174 renamed params b858db8 typo 4c2a838 h2 context? 81a2c0c fix some param names; add some missing params, no docs bcf8a4c add some missing params, no docs af77f86 add some missing params, no docs; fix some param names 01df24e fix some params 6161056 fix some params 68508b6 fix some params 285aeb6 doxygen complains here no matter what 529bcfa rm some missing params, typos cd21226 rm some missing params, add some new ones 48a4bc2 fix params c844628 missing param 312ce37 missing param; rename one ec2fdec missing param 05e15e0 missing params d515858 change "<" to &lt; to make doxygen happy b476a28 wrong place
2014-09-13 04:41:19 +08:00
* @param List list to search
* @param Label label to search for
* @return Labeled list with the specified label or nullptr.
Doxygen Squashed commit from https://github.com/tesseract-ocr/tesseract/tree/more-doxygen closes #14 Commits: 6317305 doxygen 9f42f69 doxygen 0fc4d52 doxygen 37b4b55 fix typo bded8f1 some more doxy 020eb00 slight tweak 524666d doxygenify 2a36a3e doxygenify 229d218 doxygenify 7fd28ae doxygenify a8c64bc doxygenify f5d21b6 fix 5d8ede8 doxygenify a58a4e0 language_model.cpp fa85709 lm_pain_points.cpp lm_state.cpp 6418da3 merge 06190ba Merge branch 'old_doxygen_merge' into more-doxygen 84acf08 Merge branch 'master' into more-doxygen 50fe1ff pagewalk.cpp cube_reco_context.cpp 2982583 change to relative 192a24a applybox.cpp, take one 8eeb053 delete docs for obsolete params 52e4c77 modernise classify/ocrfeatures.cpp 2a1cba6 modernise cutil/emalloc.cpp 773e006 silence doxygen warning aeb1731 silence doxygen warning f18387f silence doxygen; new params are unused? 15ad6bd doxygenify cutil/efio.cpp c8b5dad doxygenify cutil/danerror.cpp 784450f the globals and exceptions parts are obsolete; remove 8bca324 doxygen classify/normfeat.cpp 9bcbe16 doxygen classify/normmatch.cpp aa9a971 doxygen ccmain/cube_control.cpp c083ff2 doxygen ccmain/cube_reco_context.cpp f842850 params changed 5c94f12 doxygen ccmain/cubeclassifier.cpp 15ba750 case sensitive f5c71d4 case sensitive f85655b doxygen classify/intproto.cpp 4bbc7aa partial doxygen classify/mfx.cpp dbb6041 partial doxygen classify/intproto.cpp 2aa72db finish doxygen classify/intproto.cpp 0b8de99 doxygen training/mftraining.cpp 0b5b35c partial doxygen ccstruct/coutln.cpp b81c766 partial doxygen ccstruct/coutln.cpp 40fc415 finished? 
doxygen ccstruct/coutln.cpp 6e4165c doxygen classify/clusttool.cpp 0267dec doxygen classify/cutoffs.cpp 7f0c70c doxygen classify/fpoint.cpp 512f3bd ignore ~ files 5668a52 doxygen classify/intmatcher.cpp 84788d4 doxygen classify/kdtree.cpp 29f36ca doxygen classify/mfoutline.cpp 40b94b1 silence doxygen warnings 6c511b9 doxygen classify/mfx.cpp f9b4080 doxygen classify/outfeat.cpp aa1df05 doxygen classify/picofeat.cpp cc5f466 doxygen training/cntraining.cpp cce044f doxygen training/commontraining.cpp 167e216 missing param 9498383 renamed params 37eeac2 renamed param d87b5dd case c8ee174 renamed params b858db8 typo 4c2a838 h2 context? 81a2c0c fix some param names; add some missing params, no docs bcf8a4c add some missing params, no docs af77f86 add some missing params, no docs; fix some param names 01df24e fix some params 6161056 fix some params 68508b6 fix some params 285aeb6 doxygen complains here no matter what 529bcfa rm some missing params, typos cd21226 rm some missing params, add some new ones 48a4bc2 fix params c844628 missing param 312ce37 missing param; rename one ec2fdec missing param 05e15e0 missing params d515858 change "<" to &lt; to make doxygen happy b476a28 wrong place
2014-09-13 04:41:19 +08:00
* @note Globals: none
*/
2016-11-08 02:46:33 +08:00
LABELEDLIST FindList(LIST List, char* Label) {
LABELEDLIST LabeledList;
iterate (List)
{
LabeledList = reinterpret_cast<LABELEDLIST>first_node (List);
if (strcmp (LabeledList->Label, Label) == 0)
return (LabeledList);
}
return (nullptr);
2016-11-08 02:46:33 +08:00
} /* FindList */
/*---------------------------------------------------------------------------*/
Doxygen Squashed commit from https://github.com/tesseract-ocr/tesseract/tree/more-doxygen closes #14 Commits: 6317305 doxygen 9f42f69 doxygen 0fc4d52 doxygen 37b4b55 fix typo bded8f1 some more doxy 020eb00 slight tweak 524666d doxygenify 2a36a3e doxygenify 229d218 doxygenify 7fd28ae doxygenify a8c64bc doxygenify f5d21b6 fix 5d8ede8 doxygenify a58a4e0 language_model.cpp fa85709 lm_pain_points.cpp lm_state.cpp 6418da3 merge 06190ba Merge branch 'old_doxygen_merge' into more-doxygen 84acf08 Merge branch 'master' into more-doxygen 50fe1ff pagewalk.cpp cube_reco_context.cpp 2982583 change to relative 192a24a applybox.cpp, take one 8eeb053 delete docs for obsolete params 52e4c77 modernise classify/ocrfeatures.cpp 2a1cba6 modernise cutil/emalloc.cpp 773e006 silence doxygen warning aeb1731 silence doxygen warning f18387f silence doxygen; new params are unused? 15ad6bd doxygenify cutil/efio.cpp c8b5dad doxygenify cutil/danerror.cpp 784450f the globals and exceptions parts are obsolete; remove 8bca324 doxygen classify/normfeat.cpp 9bcbe16 doxygen classify/normmatch.cpp aa9a971 doxygen ccmain/cube_control.cpp c083ff2 doxygen ccmain/cube_reco_context.cpp f842850 params changed 5c94f12 doxygen ccmain/cubeclassifier.cpp 15ba750 case sensitive f5c71d4 case sensitive f85655b doxygen classify/intproto.cpp 4bbc7aa partial doxygen classify/mfx.cpp dbb6041 partial doxygen classify/intproto.cpp 2aa72db finish doxygen classify/intproto.cpp 0b8de99 doxygen training/mftraining.cpp 0b5b35c partial doxygen ccstruct/coutln.cpp b81c766 partial doxygen ccstruct/coutln.cpp 40fc415 finished? 
doxygen ccstruct/coutln.cpp 6e4165c doxygen classify/clusttool.cpp 0267dec doxygen classify/cutoffs.cpp 7f0c70c doxygen classify/fpoint.cpp 512f3bd ignore ~ files 5668a52 doxygen classify/intmatcher.cpp 84788d4 doxygen classify/kdtree.cpp 29f36ca doxygen classify/mfoutline.cpp 40b94b1 silence doxygen warnings 6c511b9 doxygen classify/mfx.cpp f9b4080 doxygen classify/outfeat.cpp aa1df05 doxygen classify/picofeat.cpp cc5f466 doxygen training/cntraining.cpp cce044f doxygen training/commontraining.cpp 167e216 missing param 9498383 renamed params 37eeac2 renamed param d87b5dd case c8ee174 renamed params b858db8 typo 4c2a838 h2 context? 81a2c0c fix some param names; add some missing params, no docs bcf8a4c add some missing params, no docs af77f86 add some missing params, no docs; fix some param names 01df24e fix some params 6161056 fix some params 68508b6 fix some params 285aeb6 doxygen complains here no matter what 529bcfa rm some missing params, typos cd21226 rm some missing params, add some new ones 48a4bc2 fix params c844628 missing param 312ce37 missing param; rename one ec2fdec missing param 05e15e0 missing params d515858 change "<" to &lt; to make doxygen happy b476a28 wrong place
2014-09-13 04:41:19 +08:00
/**
* This routine allocates a new, empty labeled list and gives
* it the specified label.
* @param Label label for new list
* @return New, empty labeled list.
* @note Globals: none
*/
2016-11-08 02:46:33 +08:00
LABELEDLIST NewLabeledList(const char* Label) {
LABELEDLIST LabeledList;
LabeledList = static_cast<LABELEDLIST>(Emalloc (sizeof (LABELEDLISTNODE)));
LabeledList->Label = static_cast<char*>(Emalloc (strlen (Label)+1));
strcpy (LabeledList->Label, Label);
LabeledList->List = NIL_LIST;
LabeledList->SampleCount = 0;
LabeledList->font_sample_count = 0;
return (LabeledList);
2016-11-08 02:46:33 +08:00
} /* NewLabeledList */
/*---------------------------------------------------------------------------*/
// TODO(rays) This is now used only by cntraining. Convert cntraining to use
// the new method or get rid of it entirely.
Doxygen Squashed commit from https://github.com/tesseract-ocr/tesseract/tree/more-doxygen closes #14 Commits: 6317305 doxygen 9f42f69 doxygen 0fc4d52 doxygen 37b4b55 fix typo bded8f1 some more doxy 020eb00 slight tweak 524666d doxygenify 2a36a3e doxygenify 229d218 doxygenify 7fd28ae doxygenify a8c64bc doxygenify f5d21b6 fix 5d8ede8 doxygenify a58a4e0 language_model.cpp fa85709 lm_pain_points.cpp lm_state.cpp 6418da3 merge 06190ba Merge branch 'old_doxygen_merge' into more-doxygen 84acf08 Merge branch 'master' into more-doxygen 50fe1ff pagewalk.cpp cube_reco_context.cpp 2982583 change to relative 192a24a applybox.cpp, take one 8eeb053 delete docs for obsolete params 52e4c77 modernise classify/ocrfeatures.cpp 2a1cba6 modernise cutil/emalloc.cpp 773e006 silence doxygen warning aeb1731 silence doxygen warning f18387f silence doxygen; new params are unused? 15ad6bd doxygenify cutil/efio.cpp c8b5dad doxygenify cutil/danerror.cpp 784450f the globals and exceptions parts are obsolete; remove 8bca324 doxygen classify/normfeat.cpp 9bcbe16 doxygen classify/normmatch.cpp aa9a971 doxygen ccmain/cube_control.cpp c083ff2 doxygen ccmain/cube_reco_context.cpp f842850 params changed 5c94f12 doxygen ccmain/cubeclassifier.cpp 15ba750 case sensitive f5c71d4 case sensitive f85655b doxygen classify/intproto.cpp 4bbc7aa partial doxygen classify/mfx.cpp dbb6041 partial doxygen classify/intproto.cpp 2aa72db finish doxygen classify/intproto.cpp 0b8de99 doxygen training/mftraining.cpp 0b5b35c partial doxygen ccstruct/coutln.cpp b81c766 partial doxygen ccstruct/coutln.cpp 40fc415 finished? 
doxygen ccstruct/coutln.cpp 6e4165c doxygen classify/clusttool.cpp 0267dec doxygen classify/cutoffs.cpp 7f0c70c doxygen classify/fpoint.cpp 512f3bd ignore ~ files 5668a52 doxygen classify/intmatcher.cpp 84788d4 doxygen classify/kdtree.cpp 29f36ca doxygen classify/mfoutline.cpp 40b94b1 silence doxygen warnings 6c511b9 doxygen classify/mfx.cpp f9b4080 doxygen classify/outfeat.cpp aa1df05 doxygen classify/picofeat.cpp cc5f466 doxygen training/cntraining.cpp cce044f doxygen training/commontraining.cpp 167e216 missing param 9498383 renamed params 37eeac2 renamed param d87b5dd case c8ee174 renamed params b858db8 typo 4c2a838 h2 context? 81a2c0c fix some param names; add some missing params, no docs bcf8a4c add some missing params, no docs af77f86 add some missing params, no docs; fix some param names 01df24e fix some params 6161056 fix some params 68508b6 fix some params 285aeb6 doxygen complains here no matter what 529bcfa rm some missing params, typos cd21226 rm some missing params, add some new ones 48a4bc2 fix params c844628 missing param 312ce37 missing param; rename one ec2fdec missing param 05e15e0 missing params d515858 change "<" to &lt; to make doxygen happy b476a28 wrong place
2014-09-13 04:41:19 +08:00
// Characters that separate the fields of a sample header line. This is the
// set of characters that sscanf's "%s" conversion treats as whitespace in
// the "C" locale.
static const char kSampleFieldSeparators[] = " \t\n\v\f\r";

// Copies the second whitespace-delimited field of |line| into |label| as a
// NUL-terminated string. The first field is skipped (presumably the font
// name - confirm against the .tr writer). Returns false, leaving |label|
// untouched, when the line has no second field or when that field does not
// fit into |label_size| bytes including the terminator.
static bool ExtractSampleLabel(const char* line, char* label,
                               size_t label_size) {
  const char* cursor = line + strspn(line, kSampleFieldSeparators);
  cursor += strcspn(cursor, kSampleFieldSeparators);  // Skip first field.
  cursor += strspn(cursor, kSampleFieldSeparators);   // Skip the gap.
  const size_t length = strcspn(cursor, kSampleFieldSeparators);
  if (length == 0 || length >= label_size)
    return false;
  memcpy(label, cursor, length);
  label[length] = '\0';
  return true;
}

/**
 * This routine reads training samples from a file and
 * places them into a data structure which organizes the
 * samples by label (the second field of each sample header line).
 * The result is accumulated into *training_samples.
 *
 * For every sample header line a character description is read from the
 * file with ReadCharDescription. Only the feature set selected by
 * feature_name is kept; all other feature sets of the description are freed.
 *
 * Whitespace-only lines are skipped. A header line that has no label, or
 * whose label is longer than UNICHAR_LEN bytes, is reported with tprintf
 * and skipped instead of being copied into the fixed-size label buffer.
 *
 * @param feature_definitions definitions used to map feature_name to a
 *        feature type and to parse the character descriptions
 * @param feature_name short name of the feature set to keep
 * @param max_samples maximum number of samples kept per label by this call
 *        (font_sample_count is reset to 0 for all existing labels on entry);
 *        a value <= 0 means no limit
 * @param unicharset if not nullptr, labels not yet contained are inserted;
 *        the program exits if its size then exceeds MAX_NUM_CLASSES
 * @param file open text file to read samples from
 * @param training_samples in/out list of labeled lists; new labels are
 *        pushed onto it and kept samples are pushed onto the label's list
 */
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT& feature_definitions,
                         const char *feature_name, int max_samples,
                         UNICHARSET* unicharset,
                         FILE* file, LIST* training_samples) {
  char buffer[2048];
  char unichar[UNICHAR_LEN + 1];
  LABELEDLIST char_sample;
  FEATURE_SET feature_samples;
  CHAR_DESC char_desc;
  uint32_t feature_type =
      ShortNameToFeatureType(feature_definitions, feature_name);

  // Zero out the font_sample_count for all the classes.
  LIST it = *training_samples;
  iterate(it) {
    char_sample = reinterpret_cast<LABELEDLIST>(first_node(it));
    char_sample->font_sample_count = 0;
  }

  while (fgets(buffer, sizeof(buffer), file) != nullptr) {
    // Skip blank lines, including ones that only hold whitespace (e.g. the
    // "\r\n" left over from a file with CRLF line endings).
    if (buffer[strspn(buffer, kSampleFieldSeparators)] == '\0')
      continue;
    // Bounded replacement for sscanf(buffer, "%*s %s", unichar): an
    // unbounded %s could overflow unichar, and an unchecked failure would
    // leave it uninitialized or holding the previous label.
    if (!ExtractSampleLabel(buffer, unichar, sizeof(unichar))) {
      tprintf("Warning: skipping malformed training sample header: %s\n",
              buffer);
      continue;
    }
    if (unicharset != nullptr && !unicharset->contains_unichar(unichar)) {
      unicharset->unichar_insert(unichar);
      if (unicharset->size() > MAX_NUM_CLASSES) {
        tprintf("Error: Size of unicharset in training is "
                "greater than MAX_NUM_CLASSES\n");
        exit(1);
      }
    }
    char_sample = FindList(*training_samples, unichar);
    if (char_sample == nullptr) {
      char_sample = NewLabeledList(unichar);
      *training_samples = push(*training_samples, char_sample);
    }
    char_desc = ReadCharDescription(feature_definitions, file);
    feature_samples = char_desc->FeatureSets[feature_type];
    if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
      // Ownership of the selected feature set moves to the labeled list.
      char_sample->List = push(char_sample->List, feature_samples);
      char_sample->SampleCount++;
      char_sample->font_sample_count++;
    } else {
      // Over the per-label limit: the selected feature set is not kept.
      FreeFeatureSet(feature_samples);
    }
    // The selected feature set was either stored or freed above, so only
    // the remaining feature sets are released here.
    for (size_t i = 0; i < char_desc->NumFeatureSets; i++) {
      if (feature_type != i)
        FreeFeatureSet(char_desc->FeatureSets[i]);
    }
    free(char_desc);
  }
} // ReadTrainingSamples
/*---------------------------------------------------------------------------*/
Doxygen Squashed commit from https://github.com/tesseract-ocr/tesseract/tree/more-doxygen closes #14 Commits: 6317305 doxygen 9f42f69 doxygen 0fc4d52 doxygen 37b4b55 fix typo bded8f1 some more doxy 020eb00 slight tweak 524666d doxygenify 2a36a3e doxygenify 229d218 doxygenify 7fd28ae doxygenify a8c64bc doxygenify f5d21b6 fix 5d8ede8 doxygenify a58a4e0 language_model.cpp fa85709 lm_pain_points.cpp lm_state.cpp 6418da3 merge 06190ba Merge branch 'old_doxygen_merge' into more-doxygen 84acf08 Merge branch 'master' into more-doxygen 50fe1ff pagewalk.cpp cube_reco_context.cpp 2982583 change to relative 192a24a applybox.cpp, take one 8eeb053 delete docs for obsolete params 52e4c77 modernise classify/ocrfeatures.cpp 2a1cba6 modernise cutil/emalloc.cpp 773e006 silence doxygen warning aeb1731 silence doxygen warning f18387f silence doxygen; new params are unused? 15ad6bd doxygenify cutil/efio.cpp c8b5dad doxygenify cutil/danerror.cpp 784450f the globals and exceptions parts are obsolete; remove 8bca324 doxygen classify/normfeat.cpp 9bcbe16 doxygen classify/normmatch.cpp aa9a971 doxygen ccmain/cube_control.cpp c083ff2 doxygen ccmain/cube_reco_context.cpp f842850 params changed 5c94f12 doxygen ccmain/cubeclassifier.cpp 15ba750 case sensitive f5c71d4 case sensitive f85655b doxygen classify/intproto.cpp 4bbc7aa partial doxygen classify/mfx.cpp dbb6041 partial doxygen classify/intproto.cpp 2aa72db finish doxygen classify/intproto.cpp 0b8de99 doxygen training/mftraining.cpp 0b5b35c partial doxygen ccstruct/coutln.cpp b81c766 partial doxygen ccstruct/coutln.cpp 40fc415 finished? 
doxygen ccstruct/coutln.cpp 6e4165c doxygen classify/clusttool.cpp 0267dec doxygen classify/cutoffs.cpp 7f0c70c doxygen classify/fpoint.cpp 512f3bd ignore ~ files 5668a52 doxygen classify/intmatcher.cpp 84788d4 doxygen classify/kdtree.cpp 29f36ca doxygen classify/mfoutline.cpp 40b94b1 silence doxygen warnings 6c511b9 doxygen classify/mfx.cpp f9b4080 doxygen classify/outfeat.cpp aa1df05 doxygen classify/picofeat.cpp cc5f466 doxygen training/cntraining.cpp cce044f doxygen training/commontraining.cpp 167e216 missing param 9498383 renamed params 37eeac2 renamed param d87b5dd case c8ee174 renamed params b858db8 typo 4c2a838 h2 context? 81a2c0c fix some param names; add some missing params, no docs bcf8a4c add some missing params, no docs af77f86 add some missing params, no docs; fix some param names 01df24e fix some params 6161056 fix some params 68508b6 fix some params 285aeb6 doxygen complains here no matter what 529bcfa rm some missing params, typos cd21226 rm some missing params, add some new ones 48a4bc2 fix params c844628 missing param 312ce37 missing param; rename one ec2fdec missing param 05e15e0 missing params d515858 change "<" to &lt; to make doxygen happy b476a28 wrong place
2014-09-13 04:41:19 +08:00
/**
* This routine deallocates all of the space allocated to
* the specified list of training samples.
* @param CharList list of all fonts in document
*/
Doxygen Squashed commit from https://github.com/tesseract-ocr/tesseract/tree/more-doxygen closes #14 Commits: 6317305 doxygen 9f42f69 doxygen 0fc4d52 doxygen 37b4b55 fix typo bded8f1 some more doxy 020eb00 slight tweak 524666d doxygenify 2a36a3e doxygenify 229d218 doxygenify 7fd28ae doxygenify a8c64bc doxygenify f5d21b6 fix 5d8ede8 doxygenify a58a4e0 language_model.cpp fa85709 lm_pain_points.cpp lm_state.cpp 6418da3 merge 06190ba Merge branch 'old_doxygen_merge' into more-doxygen 84acf08 Merge branch 'master' into more-doxygen 50fe1ff pagewalk.cpp cube_reco_context.cpp 2982583 change to relative 192a24a applybox.cpp, take one 8eeb053 delete docs for obsolete params 52e4c77 modernise classify/ocrfeatures.cpp 2a1cba6 modernise cutil/emalloc.cpp 773e006 silence doxygen warning aeb1731 silence doxygen warning f18387f silence doxygen; new params are unused? 15ad6bd doxygenify cutil/efio.cpp c8b5dad doxygenify cutil/danerror.cpp 784450f the globals and exceptions parts are obsolete; remove 8bca324 doxygen classify/normfeat.cpp 9bcbe16 doxygen classify/normmatch.cpp aa9a971 doxygen ccmain/cube_control.cpp c083ff2 doxygen ccmain/cube_reco_context.cpp f842850 params changed 5c94f12 doxygen ccmain/cubeclassifier.cpp 15ba750 case sensitive f5c71d4 case sensitive f85655b doxygen classify/intproto.cpp 4bbc7aa partial doxygen classify/mfx.cpp dbb6041 partial doxygen classify/intproto.cpp 2aa72db finish doxygen classify/intproto.cpp 0b8de99 doxygen training/mftraining.cpp 0b5b35c partial doxygen ccstruct/coutln.cpp b81c766 partial doxygen ccstruct/coutln.cpp 40fc415 finished? 
doxygen ccstruct/coutln.cpp 6e4165c doxygen classify/clusttool.cpp 0267dec doxygen classify/cutoffs.cpp 7f0c70c doxygen classify/fpoint.cpp 512f3bd ignore ~ files 5668a52 doxygen classify/intmatcher.cpp 84788d4 doxygen classify/kdtree.cpp 29f36ca doxygen classify/mfoutline.cpp 40b94b1 silence doxygen warnings 6c511b9 doxygen classify/mfx.cpp f9b4080 doxygen classify/outfeat.cpp aa1df05 doxygen classify/picofeat.cpp cc5f466 doxygen training/cntraining.cpp cce044f doxygen training/commontraining.cpp 167e216 missing param 9498383 renamed params 37eeac2 renamed param d87b5dd case c8ee174 renamed params b858db8 typo 4c2a838 h2 context? 81a2c0c fix some param names; add some missing params, no docs bcf8a4c add some missing params, no docs af77f86 add some missing params, no docs; fix some param names 01df24e fix some params 6161056 fix some params 68508b6 fix some params 285aeb6 doxygen complains here no matter what 529bcfa rm some missing params, typos cd21226 rm some missing params, add some new ones 48a4bc2 fix params c844628 missing param 312ce37 missing param; rename one ec2fdec missing param 05e15e0 missing params d515858 change "<" to &lt; to make doxygen happy b476a28 wrong place
2014-09-13 04:41:19 +08:00
void FreeTrainingSamples(LIST CharList) {
LABELEDLIST char_sample;
FEATURE_SET FeatureSet;
LIST FeatureList;
2016-11-08 02:46:33 +08:00
LIST nodes = CharList;
iterate(CharList) { /* iterate through all of the fonts */
char_sample = reinterpret_cast<LABELEDLIST>first_node(CharList);
FeatureList = char_sample->List;
2016-11-08 02:46:33 +08:00
iterate(FeatureList) { /* iterate through all of the classes */
FeatureSet = reinterpret_cast<FEATURE_SET>first_node(FeatureList);
FreeFeatureSet(FeatureSet);
}
FreeLabeledList(char_sample);
}
2016-11-08 02:46:33 +08:00
destroy(nodes);
} /* FreeTrainingSamples */
/*---------------------------------------------------------------------------*/
Doxygen Squashed commit from https://github.com/tesseract-ocr/tesseract/tree/more-doxygen closes #14 Commits: 6317305 doxygen 9f42f69 doxygen 0fc4d52 doxygen 37b4b55 fix typo bded8f1 some more doxy 020eb00 slight tweak 524666d doxygenify 2a36a3e doxygenify 229d218 doxygenify 7fd28ae doxygenify a8c64bc doxygenify f5d21b6 fix 5d8ede8 doxygenify a58a4e0 language_model.cpp fa85709 lm_pain_points.cpp lm_state.cpp 6418da3 merge 06190ba Merge branch 'old_doxygen_merge' into more-doxygen 84acf08 Merge branch 'master' into more-doxygen 50fe1ff pagewalk.cpp cube_reco_context.cpp 2982583 change to relative 192a24a applybox.cpp, take one 8eeb053 delete docs for obsolete params 52e4c77 modernise classify/ocrfeatures.cpp 2a1cba6 modernise cutil/emalloc.cpp 773e006 silence doxygen warning aeb1731 silence doxygen warning f18387f silence doxygen; new params are unused? 15ad6bd doxygenify cutil/efio.cpp c8b5dad doxygenify cutil/danerror.cpp 784450f the globals and exceptions parts are obsolete; remove 8bca324 doxygen classify/normfeat.cpp 9bcbe16 doxygen classify/normmatch.cpp aa9a971 doxygen ccmain/cube_control.cpp c083ff2 doxygen ccmain/cube_reco_context.cpp f842850 params changed 5c94f12 doxygen ccmain/cubeclassifier.cpp 15ba750 case sensitive f5c71d4 case sensitive f85655b doxygen classify/intproto.cpp 4bbc7aa partial doxygen classify/mfx.cpp dbb6041 partial doxygen classify/intproto.cpp 2aa72db finish doxygen classify/intproto.cpp 0b8de99 doxygen training/mftraining.cpp 0b5b35c partial doxygen ccstruct/coutln.cpp b81c766 partial doxygen ccstruct/coutln.cpp 40fc415 finished? 
doxygen ccstruct/coutln.cpp 6e4165c doxygen classify/clusttool.cpp 0267dec doxygen classify/cutoffs.cpp 7f0c70c doxygen classify/fpoint.cpp 512f3bd ignore ~ files 5668a52 doxygen classify/intmatcher.cpp 84788d4 doxygen classify/kdtree.cpp 29f36ca doxygen classify/mfoutline.cpp 40b94b1 silence doxygen warnings 6c511b9 doxygen classify/mfx.cpp f9b4080 doxygen classify/outfeat.cpp aa1df05 doxygen classify/picofeat.cpp cc5f466 doxygen training/cntraining.cpp cce044f doxygen training/commontraining.cpp 167e216 missing param 9498383 renamed params 37eeac2 renamed param d87b5dd case c8ee174 renamed params b858db8 typo 4c2a838 h2 context? 81a2c0c fix some param names; add some missing params, no docs bcf8a4c add some missing params, no docs af77f86 add some missing params, no docs; fix some param names 01df24e fix some params 6161056 fix some params 68508b6 fix some params 285aeb6 doxygen complains here no matter what 529bcfa rm some missing params, typos cd21226 rm some missing params, add some new ones 48a4bc2 fix params c844628 missing param 312ce37 missing param; rename one ec2fdec missing param 05e15e0 missing params d515858 change "<" to &lt; to make doxygen happy b476a28 wrong place
2014-09-13 04:41:19 +08:00
/**
* This routine deallocates all of the memory consumed by
* a labeled list. It does not free any memory which may be
* consumed by the items in the list.
* @param LabeledList labeled list to be freed
* @note Globals: none
*/
Doxygen Squashed commit from https://github.com/tesseract-ocr/tesseract/tree/more-doxygen closes #14 Commits: 6317305 doxygen 9f42f69 doxygen 0fc4d52 doxygen 37b4b55 fix typo bded8f1 some more doxy 020eb00 slight tweak 524666d doxygenify 2a36a3e doxygenify 229d218 doxygenify 7fd28ae doxygenify a8c64bc doxygenify f5d21b6 fix 5d8ede8 doxygenify a58a4e0 language_model.cpp fa85709 lm_pain_points.cpp lm_state.cpp 6418da3 merge 06190ba Merge branch 'old_doxygen_merge' into more-doxygen 84acf08 Merge branch 'master' into more-doxygen 50fe1ff pagewalk.cpp cube_reco_context.cpp 2982583 change to relative 192a24a applybox.cpp, take one 8eeb053 delete docs for obsolete params 52e4c77 modernise classify/ocrfeatures.cpp 2a1cba6 modernise cutil/emalloc.cpp 773e006 silence doxygen warning aeb1731 silence doxygen warning f18387f silence doxygen; new params are unused? 15ad6bd doxygenify cutil/efio.cpp c8b5dad doxygenify cutil/danerror.cpp 784450f the globals and exceptions parts are obsolete; remove 8bca324 doxygen classify/normfeat.cpp 9bcbe16 doxygen classify/normmatch.cpp aa9a971 doxygen ccmain/cube_control.cpp c083ff2 doxygen ccmain/cube_reco_context.cpp f842850 params changed 5c94f12 doxygen ccmain/cubeclassifier.cpp 15ba750 case sensitive f5c71d4 case sensitive f85655b doxygen classify/intproto.cpp 4bbc7aa partial doxygen classify/mfx.cpp dbb6041 partial doxygen classify/intproto.cpp 2aa72db finish doxygen classify/intproto.cpp 0b8de99 doxygen training/mftraining.cpp 0b5b35c partial doxygen ccstruct/coutln.cpp b81c766 partial doxygen ccstruct/coutln.cpp 40fc415 finished? 
doxygen ccstruct/coutln.cpp 6e4165c doxygen classify/clusttool.cpp 0267dec doxygen classify/cutoffs.cpp 7f0c70c doxygen classify/fpoint.cpp 512f3bd ignore ~ files 5668a52 doxygen classify/intmatcher.cpp 84788d4 doxygen classify/kdtree.cpp 29f36ca doxygen classify/mfoutline.cpp 40b94b1 silence doxygen warnings 6c511b9 doxygen classify/mfx.cpp f9b4080 doxygen classify/outfeat.cpp aa1df05 doxygen classify/picofeat.cpp cc5f466 doxygen training/cntraining.cpp cce044f doxygen training/commontraining.cpp 167e216 missing param 9498383 renamed params 37eeac2 renamed param d87b5dd case c8ee174 renamed params b858db8 typo 4c2a838 h2 context? 81a2c0c fix some param names; add some missing params, no docs bcf8a4c add some missing params, no docs af77f86 add some missing params, no docs; fix some param names 01df24e fix some params 6161056 fix some params 68508b6 fix some params 285aeb6 doxygen complains here no matter what 529bcfa rm some missing params, typos cd21226 rm some missing params, add some new ones 48a4bc2 fix params c844628 missing param 312ce37 missing param; rename one ec2fdec missing param 05e15e0 missing params d515858 change "<" to &lt; to make doxygen happy b476a28 wrong place
2014-09-13 04:41:19 +08:00
void FreeLabeledList(LABELEDLIST LabeledList) {
destroy(LabeledList->List);
free(LabeledList->Label);
free(LabeledList);
} /* FreeLabeledList */
/*---------------------------------------------------------------------------*/
Doxygen Squashed commit from https://github.com/tesseract-ocr/tesseract/tree/more-doxygen closes #14 Commits: 6317305 doxygen 9f42f69 doxygen 0fc4d52 doxygen 37b4b55 fix typo bded8f1 some more doxy 020eb00 slight tweak 524666d doxygenify 2a36a3e doxygenify 229d218 doxygenify 7fd28ae doxygenify a8c64bc doxygenify f5d21b6 fix 5d8ede8 doxygenify a58a4e0 language_model.cpp fa85709 lm_pain_points.cpp lm_state.cpp 6418da3 merge 06190ba Merge branch 'old_doxygen_merge' into more-doxygen 84acf08 Merge branch 'master' into more-doxygen 50fe1ff pagewalk.cpp cube_reco_context.cpp 2982583 change to relative 192a24a applybox.cpp, take one 8eeb053 delete docs for obsolete params 52e4c77 modernise classify/ocrfeatures.cpp 2a1cba6 modernise cutil/emalloc.cpp 773e006 silence doxygen warning aeb1731 silence doxygen warning f18387f silence doxygen; new params are unused? 15ad6bd doxygenify cutil/efio.cpp c8b5dad doxygenify cutil/danerror.cpp 784450f the globals and exceptions parts are obsolete; remove 8bca324 doxygen classify/normfeat.cpp 9bcbe16 doxygen classify/normmatch.cpp aa9a971 doxygen ccmain/cube_control.cpp c083ff2 doxygen ccmain/cube_reco_context.cpp f842850 params changed 5c94f12 doxygen ccmain/cubeclassifier.cpp 15ba750 case sensitive f5c71d4 case sensitive f85655b doxygen classify/intproto.cpp 4bbc7aa partial doxygen classify/mfx.cpp dbb6041 partial doxygen classify/intproto.cpp 2aa72db finish doxygen classify/intproto.cpp 0b8de99 doxygen training/mftraining.cpp 0b5b35c partial doxygen ccstruct/coutln.cpp b81c766 partial doxygen ccstruct/coutln.cpp 40fc415 finished? 
doxygen ccstruct/coutln.cpp 6e4165c doxygen classify/clusttool.cpp 0267dec doxygen classify/cutoffs.cpp 7f0c70c doxygen classify/fpoint.cpp 512f3bd ignore ~ files 5668a52 doxygen classify/intmatcher.cpp 84788d4 doxygen classify/kdtree.cpp 29f36ca doxygen classify/mfoutline.cpp 40b94b1 silence doxygen warnings 6c511b9 doxygen classify/mfx.cpp f9b4080 doxygen classify/outfeat.cpp aa1df05 doxygen classify/picofeat.cpp cc5f466 doxygen training/cntraining.cpp cce044f doxygen training/commontraining.cpp 167e216 missing param 9498383 renamed params 37eeac2 renamed param d87b5dd case c8ee174 renamed params b858db8 typo 4c2a838 h2 context? 81a2c0c fix some param names; add some missing params, no docs bcf8a4c add some missing params, no docs af77f86 add some missing params, no docs; fix some param names 01df24e fix some params 6161056 fix some params 68508b6 fix some params 285aeb6 doxygen complains here no matter what 529bcfa rm some missing params, typos cd21226 rm some missing params, add some new ones 48a4bc2 fix params c844628 missing param 312ce37 missing param; rename one ec2fdec missing param 05e15e0 missing params d515858 change "<" to &lt; to make doxygen happy b476a28 wrong place
2014-09-13 04:41:19 +08:00
/**
* This routine reads samples from a LABELEDLIST and enters
* those samples into a clusterer data structure. This
* data structure is then returned to the caller.
* @param char_sample: LABELEDLIST that holds all the feature information for a
* @param FeatureDefs
* @param program_feature_type
* given character.
* @return Pointer to new clusterer data structure.
* @note Globals: None
*/
CLUSTERER *SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs,
LABELEDLIST char_sample,
const char* program_feature_type) {
Use POSIX data types and macros (#878) * api: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccmain: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccstruct: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * classify: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * cutil: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * dict: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * textord: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * training: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * wordrec: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccutil: Replace Tesseract data types by POSIX data types Now all Tesseract data types which are no longer needed can be removed from ccutil/host.h. 
Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccmain: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccstruct: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * classify: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * dict: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * lstm: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * textord: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * wordrec: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccutil: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Remove the macros which are now unused from ccutil/host.h. Remove also the obsolete history comments. 
Signed-off-by: Stefan Weil <sw@weilnetz.de> * Fix build error caused by ambiguous ClipToRange Error message vom Appveyor CI: C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2672: 'ClipToRange': no matching overloaded function found [C:\projects\tesseract\build\libtesseract.vcxproj] C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2782: 'T ClipToRange(const T &,const T &,const T &)': template parameter 'T' is ambiguous [C:\projects\tesseract\build\libtesseract.vcxproj] c:\projects\tesseract\ccutil\helpers.h(122): note: see declaration of 'ClipToRange' C:\projects\tesseract\ccstruct\coutln.cpp(818): note: could be 'char' C:\projects\tesseract\ccstruct\coutln.cpp(818): note: or 'int' Signed-off-by: Stefan Weil <sw@weilnetz.de> * unittest: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * arch: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de>
2018-03-14 04:36:30 +08:00
uint16_t N;
int i, j;
float* Sample = nullptr;
CLUSTERER *Clusterer;
Use POSIX data types and macros (#878) * api: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccmain: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccstruct: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * classify: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * cutil: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * dict: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * textord: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * training: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * wordrec: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccutil: Replace Tesseract data types by POSIX data types Now all Tesseract data types which are no longer needed can be removed from ccutil/host.h. 
Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccmain: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccstruct: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * classify: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * dict: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * lstm: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * textord: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * wordrec: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccutil: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Remove the macros which are now unused from ccutil/host.h. Remove also the obsolete history comments. 
Signed-off-by: Stefan Weil <sw@weilnetz.de> * Fix build error caused by ambiguous ClipToRange Error message vom Appveyor CI: C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2672: 'ClipToRange': no matching overloaded function found [C:\projects\tesseract\build\libtesseract.vcxproj] C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2782: 'T ClipToRange(const T &,const T &,const T &)': template parameter 'T' is ambiguous [C:\projects\tesseract\build\libtesseract.vcxproj] c:\projects\tesseract\ccutil\helpers.h(122): note: see declaration of 'ClipToRange' C:\projects\tesseract\ccstruct\coutln.cpp(818): note: could be 'char' C:\projects\tesseract\ccstruct\coutln.cpp(818): note: or 'int' Signed-off-by: Stefan Weil <sw@weilnetz.de> * unittest: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * arch: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de>
2018-03-14 04:36:30 +08:00
int32_t CharID;
LIST FeatureList = nullptr;
FEATURE_SET FeatureSet = nullptr;
int32_t desc_index =
ShortNameToFeatureType(FeatureDefs, program_feature_type);
N = FeatureDefs.FeatureDesc[desc_index]->NumParams;
Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);
FeatureList = char_sample->List;
CharID = 0;
iterate(FeatureList) {
FeatureSet = reinterpret_cast<FEATURE_SET>first_node(FeatureList);
for (i = 0; i < FeatureSet->MaxNumFeatures; i++) {
if (Sample == nullptr) Sample = static_cast<float*>(Emalloc(N * sizeof(float)));
for (j = 0; j < N; j++)
Sample[j] = FeatureSet->Features[i]->Params[j];
MakeSample (Clusterer, Sample, CharID);
}
CharID++;
}
free(Sample);
return Clusterer;
2016-11-08 02:46:33 +08:00
} /* SetUpForClustering */
/*------------------------------------------------------------------------*/
/**
 * Merges insignificant prototypes into nearby ones and then promotes the
 * survivors that have collected enough samples.
 *
 * Pass 1: for every prototype that is neither significant nor merged, the
 * nearest unmerged prototype closer than 0.125 (per ComputeDistance) is
 * looked up. If that neighbour is insignificant too, the two are combined
 * with MergeClusters() into the neighbour; if it is significant, the
 * prototype is only flagged as merged.
 * Pass 2: every insignificant, unmerged prototype with at least
 * MinSamples * NumChar samples is marked significant.
 *
 * @param ProtoList list whose nodes hold PROTOTYPE entries; modified in place
 * @param label label of the character being processed; debug output is
 * printed when it equals the test_ch flag
 * @param Clusterer supplies SampleSize, ParamDesc and NumChar
 * @param clusterconfig supplies MinSamples
 */
void MergeInsignificantProtos(LIST ProtoList, const char* label,
                              CLUSTERER* Clusterer,
                              CLUSTERCONFIG* clusterconfig) {
  const bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;

  LIST outer_it = ProtoList;
  iterate(outer_it) {
    PROTOTYPE* candidate = reinterpret_cast<PROTOTYPE*>(first_node(outer_it));
    if (candidate->Significant || candidate->Merged) {
      continue;
    }

    // Find the nearest alive prototype within the distance bound.
    PROTOTYPE* nearest = nullptr;
    float nearest_dist = 0.125;
    LIST inner_it = ProtoList;
    iterate(inner_it) {
      PROTOTYPE* other = reinterpret_cast<PROTOTYPE*>(first_node(inner_it));
      if (other == candidate || other->Merged) {
        continue;
      }
      const float dist = ComputeDistance(Clusterer->SampleSize,
                                         Clusterer->ParamDesc,
                                         candidate->Mean, other->Mean);
      if (dist < nearest_dist) {
        nearest = other;
        nearest_dist = dist;
      }
    }

    if (nearest == nullptr) {
      continue;  // Nothing close enough; leave the prototype untouched.
    }

    if (nearest->Significant) {
      // A significant neighbour absorbs nothing; the candidate is only
      // flagged so that it is ignored from now on.
      if (debug) {
        tprintf("Red proto at %g,%g matched a green one at %g,%g\n",
                candidate->Mean[0], candidate->Mean[1],
                nearest->Mean[0], nearest->Mean[1]);
      }
      candidate->Merged = true;
    } else {
      if (debug) {
        tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n",
                nearest->NumSamples, candidate->NumSamples,
                nearest->Mean[0], nearest->Mean[1],
                candidate->Mean[0], candidate->Mean[1]);
      }
      nearest->NumSamples = MergeClusters(Clusterer->SampleSize,
                                          Clusterer->ParamDesc,
                                          nearest->NumSamples,
                                          candidate->NumSamples,
                                          nearest->Mean,
                                          nearest->Mean, candidate->Mean);
      candidate->NumSamples = 0;
      candidate->Merged = true;
    }
  }

  // Mark significant those that now have enough samples.
  const int min_samples =
      static_cast<int32_t>(clusterconfig->MinSamples * Clusterer->NumChar);
  LIST promote_it = ProtoList;
  iterate(promote_it) {
    PROTOTYPE* proto = reinterpret_cast<PROTOTYPE*>(first_node(promote_it));
    // Only insignificant protos that were not merged away can be promoted.
    if (proto->Significant || proto->NumSamples < min_samples ||
        proto->Merged) {
      continue;
    }
    if (debug) {
      tprintf("Red proto at %g,%g becoming green\n",
              proto->Mean[0], proto->Mean[1]);
    }
    proto->Significant = true;
  }
}  /* MergeInsignificantProtos */
/*-----------------------------------------------------------------------------*/
/**
 * Frees the Variance, Magnitude and Weight arrays of every prototype in the
 * list and resets those pointers to nullptr. The prototypes and the list
 * nodes themselves are left in place.
 * @param ProtoList list whose nodes hold PROTOTYPE entries
 */
void CleanUpUnusedData(LIST ProtoList) {
  iterate(ProtoList) {
    PROTOTYPE* proto = reinterpret_cast<PROTOTYPE*>(first_node(ProtoList));

    // NOTE(review): the .Elliptical members are freed unconditionally -
    // confirm that every prototype reaching this function stores pointers
    // (not other data) in these fields.
    free(proto->Variance.Elliptical);
    free(proto->Magnitude.Elliptical);
    free(proto->Weight.Elliptical);

    proto->Variance.Elliptical = nullptr;
    proto->Magnitude.Elliptical = nullptr;
    proto->Weight.Elliptical = nullptr;
  }
}
/*------------------------------------------------------------------------*/
LIST RemoveInsignificantProtos(
LIST ProtoList,
bool KeepSigProtos,
bool KeepInsigProtos,
int N)
{
LIST NewProtoList = NIL_LIST;
LIST pProtoList;
PROTOTYPE* Proto;
PROTOTYPE* NewProto;
int i;
pProtoList = ProtoList;
iterate(pProtoList)
{
Proto = reinterpret_cast<PROTOTYPE *>first_node (pProtoList);
if ((Proto->Significant && KeepSigProtos) ||
(!Proto->Significant && KeepInsigProtos))
{
NewProto = static_cast<PROTOTYPE *>(Emalloc(sizeof(PROTOTYPE)));
NewProto->Mean = static_cast<float *>(Emalloc(N * sizeof(float)));
NewProto->Significant = Proto->Significant;
NewProto->Style = Proto->Style;
NewProto->NumSamples = Proto->NumSamples;
NewProto->Cluster = nullptr;
NewProto->Distrib = nullptr;
for (i=0; i < N; i++)
NewProto->Mean[i] = Proto->Mean[i];
2017-01-26 08:20:19 +08:00
if (Proto->Variance.Elliptical != nullptr) {
NewProto->Variance.Elliptical = static_cast<float *>(Emalloc(N * sizeof(float)));
for (i=0; i < N; i++)
NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i];
}
else
NewProto->Variance.Elliptical = nullptr;
//---------------------------------------------
2017-01-26 08:20:19 +08:00
if (Proto->Magnitude.Elliptical != nullptr) {
NewProto->Magnitude.Elliptical = static_cast<float *>(Emalloc(N * sizeof(float)));
for (i=0; i < N; i++)
NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i];
}
else
NewProto->Magnitude.Elliptical = nullptr;
//------------------------------------------------
2017-01-26 08:20:19 +08:00
if (Proto->Weight.Elliptical != nullptr) {
NewProto->Weight.Elliptical = static_cast<float *>(Emalloc(N * sizeof(float)));
for (i=0; i < N; i++)
NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i];
}
else
NewProto->Weight.Elliptical = nullptr;
NewProto->TotalMagnitude = Proto->TotalMagnitude;
NewProto->LogMagnitude = Proto->LogMagnitude;
NewProtoList = push_last(NewProtoList, NewProto);
}
}
FreeProtoList(&ProtoList);
return (NewProtoList);
2016-11-08 02:46:33 +08:00
} /* RemoveInsignificantProtos */
/*----------------------------------------------------------------------------*/
2016-11-08 02:46:33 +08:00
/**
 * Searches a list of MERGE_CLASS entries for one carrying the given label.
 * @param List list whose nodes hold MERGE_CLASS entries
 * @param Label label to look for; compared with strcmp()
 * @return the first entry whose Label equals Label, or nullptr if there is
 * none
 */
MERGE_CLASS FindClass(LIST List, const char* Label) {
  MERGE_CLASS found = nullptr;
  iterate(List) {
    MERGE_CLASS entry = reinterpret_cast<MERGE_CLASS>(first_node(List));
    if (strcmp(entry->Label, Label) == 0) {
      found = entry;
      break;
    }
  }
  return found;
}  /* FindClass */
/*---------------------------------------------------------------------------*/
2016-11-08 02:46:33 +08:00
/**
 * Creates a new MERGE_CLASS_NODE that carries its own copy of Label and a
 * class obtained from NewClass(MAX_NUM_PROTOS, MAX_NUM_CONFIGS).
 * The node is allocated with new and the label with Emalloc().
 * @param Label NUL-terminated label to copy into the new entry
 * @return the newly created entry
 */
MERGE_CLASS NewLabeledClass(const char* Label) {
  MERGE_CLASS entry = new MERGE_CLASS_NODE;

  // Room for the label plus its terminating NUL.
  const size_t label_bytes = strlen(Label) + 1;
  entry->Label = static_cast<char*>(Emalloc(label_bytes));
  strcpy(entry->Label, Label);

  entry->Class = NewClass(MAX_NUM_PROTOS, MAX_NUM_CONFIGS);
  return entry;
}  /* NewLabeledClass */
/*-----------------------------------------------------------------------------*/
Doxygen Squashed commit from https://github.com/tesseract-ocr/tesseract/tree/more-doxygen closes #14 Commits: 6317305 doxygen 9f42f69 doxygen 0fc4d52 doxygen 37b4b55 fix typo bded8f1 some more doxy 020eb00 slight tweak 524666d doxygenify 2a36a3e doxygenify 229d218 doxygenify 7fd28ae doxygenify a8c64bc doxygenify f5d21b6 fix 5d8ede8 doxygenify a58a4e0 language_model.cpp fa85709 lm_pain_points.cpp lm_state.cpp 6418da3 merge 06190ba Merge branch 'old_doxygen_merge' into more-doxygen 84acf08 Merge branch 'master' into more-doxygen 50fe1ff pagewalk.cpp cube_reco_context.cpp 2982583 change to relative 192a24a applybox.cpp, take one 8eeb053 delete docs for obsolete params 52e4c77 modernise classify/ocrfeatures.cpp 2a1cba6 modernise cutil/emalloc.cpp 773e006 silence doxygen warning aeb1731 silence doxygen warning f18387f silence doxygen; new params are unused? 15ad6bd doxygenify cutil/efio.cpp c8b5dad doxygenify cutil/danerror.cpp 784450f the globals and exceptions parts are obsolete; remove 8bca324 doxygen classify/normfeat.cpp 9bcbe16 doxygen classify/normmatch.cpp aa9a971 doxygen ccmain/cube_control.cpp c083ff2 doxygen ccmain/cube_reco_context.cpp f842850 params changed 5c94f12 doxygen ccmain/cubeclassifier.cpp 15ba750 case sensitive f5c71d4 case sensitive f85655b doxygen classify/intproto.cpp 4bbc7aa partial doxygen classify/mfx.cpp dbb6041 partial doxygen classify/intproto.cpp 2aa72db finish doxygen classify/intproto.cpp 0b8de99 doxygen training/mftraining.cpp 0b5b35c partial doxygen ccstruct/coutln.cpp b81c766 partial doxygen ccstruct/coutln.cpp 40fc415 finished? 
doxygen ccstruct/coutln.cpp 6e4165c doxygen classify/clusttool.cpp 0267dec doxygen classify/cutoffs.cpp 7f0c70c doxygen classify/fpoint.cpp 512f3bd ignore ~ files 5668a52 doxygen classify/intmatcher.cpp 84788d4 doxygen classify/kdtree.cpp 29f36ca doxygen classify/mfoutline.cpp 40b94b1 silence doxygen warnings 6c511b9 doxygen classify/mfx.cpp f9b4080 doxygen classify/outfeat.cpp aa1df05 doxygen classify/picofeat.cpp cc5f466 doxygen training/cntraining.cpp cce044f doxygen training/commontraining.cpp 167e216 missing param 9498383 renamed params 37eeac2 renamed param d87b5dd case c8ee174 renamed params b858db8 typo 4c2a838 h2 context? 81a2c0c fix some param names; add some missing params, no docs bcf8a4c add some missing params, no docs af77f86 add some missing params, no docs; fix some param names 01df24e fix some params 6161056 fix some params 68508b6 fix some params 285aeb6 doxygen complains here no matter what 529bcfa rm some missing params, typos cd21226 rm some missing params, add some new ones 48a4bc2 fix params c844628 missing param 312ce37 missing param; rename one ec2fdec missing param 05e15e0 missing params d515858 change "<" to &lt; to make doxygen happy b476a28 wrong place
2014-09-13 04:41:19 +08:00
/**
* This routine deallocates all of the space allocated to
* the specified list of training samples.
* @param ClassList list of all fonts in document
*/
2016-11-08 02:46:33 +08:00
void FreeLabeledClassList(LIST ClassList) {
MERGE_CLASS MergeClass;
2016-11-08 02:46:33 +08:00
LIST nodes = ClassList;
iterate(ClassList) /* iterate through all of the fonts */
{
MergeClass = reinterpret_cast<MERGE_CLASS>first_node (ClassList);
free (MergeClass->Label);
FreeClass(MergeClass->Class);
delete MergeClass;
}
2016-11-08 02:46:33 +08:00
destroy(nodes);
2016-11-08 02:46:33 +08:00
} /* FreeLabeledClassList */
Doxygen Squashed commit from https://github.com/tesseract-ocr/tesseract/tree/more-doxygen closes #14 Commits: 6317305 doxygen 9f42f69 doxygen 0fc4d52 doxygen 37b4b55 fix typo bded8f1 some more doxy 020eb00 slight tweak 524666d doxygenify 2a36a3e doxygenify 229d218 doxygenify 7fd28ae doxygenify a8c64bc doxygenify f5d21b6 fix 5d8ede8 doxygenify a58a4e0 language_model.cpp fa85709 lm_pain_points.cpp lm_state.cpp 6418da3 merge 06190ba Merge branch 'old_doxygen_merge' into more-doxygen 84acf08 Merge branch 'master' into more-doxygen 50fe1ff pagewalk.cpp cube_reco_context.cpp 2982583 change to relative 192a24a applybox.cpp, take one 8eeb053 delete docs for obsolete params 52e4c77 modernise classify/ocrfeatures.cpp 2a1cba6 modernise cutil/emalloc.cpp 773e006 silence doxygen warning aeb1731 silence doxygen warning f18387f silence doxygen; new params are unused? 15ad6bd doxygenify cutil/efio.cpp c8b5dad doxygenify cutil/danerror.cpp 784450f the globals and exceptions parts are obsolete; remove 8bca324 doxygen classify/normfeat.cpp 9bcbe16 doxygen classify/normmatch.cpp aa9a971 doxygen ccmain/cube_control.cpp c083ff2 doxygen ccmain/cube_reco_context.cpp f842850 params changed 5c94f12 doxygen ccmain/cubeclassifier.cpp 15ba750 case sensitive f5c71d4 case sensitive f85655b doxygen classify/intproto.cpp 4bbc7aa partial doxygen classify/mfx.cpp dbb6041 partial doxygen classify/intproto.cpp 2aa72db finish doxygen classify/intproto.cpp 0b8de99 doxygen training/mftraining.cpp 0b5b35c partial doxygen ccstruct/coutln.cpp b81c766 partial doxygen ccstruct/coutln.cpp 40fc415 finished? 
doxygen ccstruct/coutln.cpp 6e4165c doxygen classify/clusttool.cpp 0267dec doxygen classify/cutoffs.cpp 7f0c70c doxygen classify/fpoint.cpp 512f3bd ignore ~ files 5668a52 doxygen classify/intmatcher.cpp 84788d4 doxygen classify/kdtree.cpp 29f36ca doxygen classify/mfoutline.cpp 40b94b1 silence doxygen warnings 6c511b9 doxygen classify/mfx.cpp f9b4080 doxygen classify/outfeat.cpp aa1df05 doxygen classify/picofeat.cpp cc5f466 doxygen training/cntraining.cpp cce044f doxygen training/commontraining.cpp 167e216 missing param 9498383 renamed params 37eeac2 renamed param d87b5dd case c8ee174 renamed params b858db8 typo 4c2a838 h2 context? 81a2c0c fix some param names; add some missing params, no docs bcf8a4c add some missing params, no docs af77f86 add some missing params, no docs; fix some param names 01df24e fix some params 6161056 fix some params 68508b6 fix some params 285aeb6 doxygen complains here no matter what 529bcfa rm some missing params, typos cd21226 rm some missing params, add some new ones 48a4bc2 fix params c844628 missing param 312ce37 missing param; rename one ec2fdec missing param 05e15e0 missing params d515858 change "<" to &lt; to make doxygen happy b476a28 wrong place
2014-09-13 04:41:19 +08:00
/**
 * Builds an array of CLASS_STRUCT, one slot per entry of the unicharset,
 * and fills the slot selected by each labeled class in LabeledClassList
 * with a copy of that class's prototypes and configurations.
 *
 * For every prototype the X, Y, Length and Angle fields are copied and the
 * A, B, C fields are set from Normalize() applied to (X, Y, Angle).
 * The font set of each source class is transferred with move() into the
 * destination class.
 *
 * @param unicharset maps each class label to the index of its slot
 * @param LabeledClassList list whose nodes are MERGE_CLASS entries
 * @return array of unicharset.size() classes allocated with new[]
 */
CLASS_STRUCT* SetUpForFloat2Int(const UNICHARSET& unicharset,
                                LIST LabeledClassList) {
  CLASS_STRUCT* float_classes = new CLASS_STRUCT[unicharset.size()];

  iterate(LabeledClassList) {
    UnicityTableEqEq<int> font_set;
    MERGE_CLASS source =
        reinterpret_cast<MERGE_CLASS>(first_node(LabeledClassList));
    // NOTE(review): the id returned by unichar_to_id() is used as an index
    // without any check - confirm every label is present in unicharset.
    CLASS_TYPE target =
        &float_classes[unicharset.unichar_to_id(source->Label)];
    const int proto_count = source->Class->NumProtos;
    const int config_count = source->Class->NumConfigs;
    font_set.move(&source->Class->font_set);

    // Prototypes: copy the geometry and derive A, B, C from it.
    target->NumProtos = proto_count;
    target->MaxNumProtos = proto_count;
    target->Prototypes =
        static_cast<PROTO>(Emalloc(sizeof(PROTO_STRUCT) * proto_count));
    for (int p = 0; p < proto_count; ++p) {
      PROTO copied = ProtoIn(target, p);
      PROTO original = ProtoIn(source->Class, p);
      float line[3];
      line[0] = original->X;
      line[1] = original->Y;
      line[2] = original->Angle;
      Normalize(line);
      copied->X = original->X;
      copied->Y = original->Y;
      copied->Length = original->Length;
      copied->Angle = original->Angle;
      copied->A = line[0];
      copied->B = line[1];
      copied->C = line[2];
    }

    // Configurations: one freshly allocated bit vector per source config.
    target->NumConfigs = config_count;
    target->MaxNumConfigs = config_count;
    target->font_set.move(&font_set);
    target->Configurations = static_cast<BIT_VECTOR*>(
        Emalloc(sizeof(BIT_VECTOR) * config_count));
    const int word_count = WordsInVectorOfSize(proto_count);
    for (int c = 0; c < config_count; ++c) {
      BIT_VECTOR fresh_bits = NewBitVector(proto_count);
      BIT_VECTOR old_bits = source->Class->Configurations[c];
      for (int w = 0; w < word_count; ++w) {
        fresh_bits[w] = old_bits[w];
      }
      target->Configurations[c] = fresh_bits;
    }
  }

  return float_classes;
}  // SetUpForFloat2Int
/*--------------------------------------------------------------------------*/
/**
 * Rewrites, in place, a line given as a point and an angle into
 * coefficients (A, B, C) with A*A + B*B == 1 and A*x + B*y + C == 0 for
 * points on the line.
 *
 * The angle is multiplied by 2*pi before its tangent is taken to obtain
 * the slope of the line.
 *
 * @param Values in: Values[0] = X, Values[1] = Y, Values[2] = angle;
 *               out: Values[0] = A, Values[1] = B, Values[2] = C
 */
void Normalize(float* Values) {
  const float slope = tan(Values[2] * 2 * M_PI);
  const float intercept = Values[1] - slope * Values[0];
  // Scale factor that gives the (slope, -1) direction unit length.
  const float scale = 1 / sqrt(slope * slope + 1.0);

  Values[0] = slope * scale;
  Values[1] = -scale;
  Values[2] = intercept * scale;
}  // Normalize
/*-------------------------------------------------------------------------*/
2016-11-08 02:46:33 +08:00
/**
 * Releases every labeled list stored in CharList with FreeLabeledList()
 * and then destroys CharList itself.
 * @param CharList list whose nodes are LABELEDLIST entries
 */
void FreeNormProtoList(LIST CharList) {
  // iterate() advances CharList itself, so keep the head for destroy().
  LIST list_head = CharList;
  iterate(CharList) {
    FreeLabeledList(reinterpret_cast<LABELEDLIST>(first_node(CharList)));
  }
  destroy(list_head);
}  // FreeNormProtoList
/*---------------------------------------------------------------------------*/
/**
 * Creates a new labeled list named CharName, pushes every prototype of
 * ProtoList onto it, and pushes the labeled list onto *NormProtoList.
 * @param NormProtoList in/out: list receiving the new labeled list
 * @param ProtoList list whose nodes are PROTOTYPE pointers
 * @param CharName label passed to NewLabeledList()
 */
void AddToNormProtosList(LIST* NormProtoList, LIST ProtoList,
                         char* CharName) {
  LABELEDLIST labeled = NewLabeledList(CharName);
  iterate(ProtoList) {
    PROTOTYPE* proto = reinterpret_cast<PROTOTYPE*>(first_node(ProtoList));
    labeled->List = push(labeled->List, proto);
  }
  *NormProtoList = push(*NormProtoList, labeled);
}  // AddToNormProtosList
/*---------------------------------------------------------------------------*/
/**
 * Counts the prototypes in ProtoList that match the requested kinds.
 * @param ProtoList list whose nodes are PROTOTYPE pointers
 * @param CountSigProtos if true, significant prototypes are counted
 * @param CountInsigProtos if true, insignificant prototypes are counted
 * @return number of prototypes counted
 */
int NumberOfProtos(LIST ProtoList, bool CountSigProtos,
                   bool CountInsigProtos) {
  int count = 0;
  iterate(ProtoList) {
    PROTOTYPE* proto = reinterpret_cast<PROTOTYPE*>(first_node(ProtoList));
    // Pick the flag that governs this prototype's kind.
    const bool wanted =
        proto->Significant ? CountSigProtos : CountInsigProtos;
    if (wanted) {
      ++count;
    }
  }
  return count;
}  // NumberOfProtos
#endif // def DISABLED_LEGACY_ENGINE