mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-07 09:52:40 +08:00
Fixed blob division params to fix CJK training speed.
This commit is contained in:
parent
4c7ab0caea
commit
d74c625e52
@ -33,127 +33,135 @@
|
||||
|
||||
namespace tesseract {
|
||||
Classify::Classify()
|
||||
: BOOL_MEMBER(prioritize_division, FALSE,
|
||||
"Prioritize blob division over chopping", this->params()),
|
||||
INT_MEMBER(tessedit_single_match, FALSE,
|
||||
"Top choice only from CP", this->params()),
|
||||
BOOL_MEMBER(classify_enable_learning, true,
|
||||
"Enable adaptive classifier", this->params()),
|
||||
INT_MEMBER(classify_debug_level, 0, "Classify debug level",
|
||||
this->params()),
|
||||
INT_MEMBER(classify_norm_method, character, "Normalization Method ...",
|
||||
this->params()),
|
||||
double_MEMBER(classify_char_norm_range, 0.2,
|
||||
"Character Normalization Range ...", this->params()),
|
||||
double_MEMBER(classify_min_norm_scale_x, 0.0, "Min char x-norm scale ...",
|
||||
this->params()), /* PREV DEFAULT 0.1 */
|
||||
double_MEMBER(classify_max_norm_scale_x, 0.325, "Max char x-norm scale ...",
|
||||
this->params()), /* PREV DEFAULT 0.3 */
|
||||
double_MEMBER(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ...",
|
||||
this->params()), /* PREV DEFAULT 0.1 */
|
||||
double_MEMBER(classify_max_norm_scale_y, 0.325, "Max char y-norm scale ...",
|
||||
this->params()), /* PREV DEFAULT 0.3 */
|
||||
double_MEMBER(classify_max_rating_ratio, 1.5,
|
||||
"Veto ratio between classifier ratings", this->params()),
|
||||
double_MEMBER(classify_max_certainty_margin, 5.5,
|
||||
"Veto difference between classifier certainties",
|
||||
: BOOL_MEMBER(allow_blob_division, true, "Use divisible blobs chopping",
|
||||
this->params()),
|
||||
BOOL_MEMBER(tess_cn_matching, 0, "Character Normalized Matching",
|
||||
this->params()),
|
||||
BOOL_MEMBER(tess_bn_matching, 0, "Baseline Normalized Matching",
|
||||
this->params()),
|
||||
BOOL_MEMBER(classify_enable_adaptive_matcher, 1,
|
||||
"Enable adaptive classifier",
|
||||
this->params()),
|
||||
BOOL_MEMBER(classify_use_pre_adapted_templates, 0,
|
||||
"Use pre-adapted classifier templates", this->params()),
|
||||
BOOL_MEMBER(classify_save_adapted_templates, 0,
|
||||
"Save adapted templates to a file", this->params()),
|
||||
BOOL_MEMBER(classify_enable_adaptive_debugger, 0, "Enable match debugger",
|
||||
this->params()),
|
||||
BOOL_MEMBER(classify_nonlinear_norm, 0,
|
||||
"Non-linear stroke-density normalization", this->params()),
|
||||
INT_MEMBER(matcher_debug_level, 0, "Matcher Debug Level", this->params()),
|
||||
INT_MEMBER(matcher_debug_flags, 0, "Matcher Debug Flags", this->params()),
|
||||
INT_MEMBER(classify_learning_debug_level, 0, "Learning Debug Level: ",
|
||||
this->params()),
|
||||
double_MEMBER(matcher_good_threshold, 0.125, "Good Match (0-1)",
|
||||
BOOL_MEMBER(prioritize_division, FALSE,
|
||||
"Prioritize blob division over chopping", this->params()),
|
||||
INT_MEMBER(tessedit_single_match, FALSE, "Top choice only from CP",
|
||||
this->params()),
|
||||
BOOL_MEMBER(classify_enable_learning, true, "Enable adaptive classifier",
|
||||
this->params()),
|
||||
double_MEMBER(matcher_great_threshold, 0.0, "Great Match (0-1)",
|
||||
INT_MEMBER(classify_debug_level, 0, "Classify debug level",
|
||||
this->params()),
|
||||
INT_MEMBER(classify_norm_method, character, "Normalization Method ...",
|
||||
this->params()),
|
||||
double_MEMBER(classify_char_norm_range, 0.2,
|
||||
"Character Normalization Range ...", this->params()),
|
||||
double_MEMBER(classify_min_norm_scale_x, 0.0, "Min char x-norm scale ...",
|
||||
this->params()), /* PREV DEFAULT 0.1 */
|
||||
double_MEMBER(classify_max_norm_scale_x, 0.325,
|
||||
"Max char x-norm scale ...",
|
||||
this->params()), /* PREV DEFAULT 0.3 */
|
||||
double_MEMBER(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ...",
|
||||
this->params()), /* PREV DEFAULT 0.1 */
|
||||
double_MEMBER(classify_max_norm_scale_y, 0.325,
|
||||
"Max char y-norm scale ...",
|
||||
this->params()), /* PREV DEFAULT 0.3 */
|
||||
double_MEMBER(classify_max_rating_ratio, 1.5,
|
||||
"Veto ratio between classifier ratings", this->params()),
|
||||
double_MEMBER(classify_max_certainty_margin, 5.5,
|
||||
"Veto difference between classifier certainties",
|
||||
this->params()),
|
||||
BOOL_MEMBER(tess_cn_matching, 0, "Character Normalized Matching",
|
||||
this->params()),
|
||||
double_MEMBER(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)",
|
||||
BOOL_MEMBER(tess_bn_matching, 0, "Baseline Normalized Matching",
|
||||
this->params()),
|
||||
double_MEMBER(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)",
|
||||
BOOL_MEMBER(classify_enable_adaptive_matcher, 1,
|
||||
"Enable adaptive classifier", this->params()),
|
||||
BOOL_MEMBER(classify_use_pre_adapted_templates, 0,
|
||||
"Use pre-adapted classifier templates", this->params()),
|
||||
BOOL_MEMBER(classify_save_adapted_templates, 0,
|
||||
"Save adapted templates to a file", this->params()),
|
||||
BOOL_MEMBER(classify_enable_adaptive_debugger, 0, "Enable match debugger",
|
||||
this->params()),
|
||||
double_MEMBER(matcher_rating_margin, 0.1, "New template margin (0-1)",
|
||||
BOOL_MEMBER(classify_nonlinear_norm, 0,
|
||||
"Non-linear stroke-density normalization", this->params()),
|
||||
INT_MEMBER(matcher_debug_level, 0, "Matcher Debug Level", this->params()),
|
||||
INT_MEMBER(matcher_debug_flags, 0, "Matcher Debug Flags", this->params()),
|
||||
INT_MEMBER(classify_learning_debug_level, 0, "Learning Debug Level: ",
|
||||
this->params()),
|
||||
double_MEMBER(matcher_good_threshold, 0.125, "Good Match (0-1)",
|
||||
this->params()),
|
||||
double_MEMBER(matcher_reliable_adaptive_result, 0.0, "Great Match (0-1)",
|
||||
this->params()),
|
||||
double_MEMBER(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)",
|
||||
this->params()),
|
||||
double_MEMBER(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)",
|
||||
this->params()),
|
||||
double_MEMBER(matcher_rating_margin, 0.1, "New template margin (0-1)",
|
||||
this->params()),
|
||||
double_MEMBER(matcher_avg_noise_size, 12.0, "Avg. noise blob length",
|
||||
this->params()),
|
||||
INT_MEMBER(matcher_permanent_classes_min, 1, "Min # of permanent classes",
|
||||
this->params()),
|
||||
INT_MEMBER(matcher_min_examples_for_prototyping, 3,
|
||||
"Reliable Config Threshold", this->params()),
|
||||
INT_MEMBER(matcher_sufficient_examples_for_prototyping, 5,
|
||||
"Enable adaption even if the ambiguities have not been seen",
|
||||
this->params()),
|
||||
double_MEMBER(matcher_clustering_max_angle_delta, 0.015,
|
||||
"Maximum angle delta for prototype clustering",
|
||||
this->params()),
|
||||
double_MEMBER(classify_misfit_junk_penalty, 0.0,
|
||||
"Penalty to apply when a non-alnum is vertically out of "
|
||||
"its expected textline position",
|
||||
this->params()),
|
||||
double_MEMBER(rating_scale, 1.5, "Rating scaling factor", this->params()),
|
||||
double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
|
||||
this->params()),
|
||||
double_MEMBER(tessedit_class_miss_scale, 0.00390625,
|
||||
"Scale factor for features not used", this->params()),
|
||||
double_MEMBER(
|
||||
classify_adapted_pruning_factor, 2.5,
|
||||
"Prune poor adapted results this much worse than best result",
|
||||
this->params()),
|
||||
double_MEMBER(classify_adapted_pruning_threshold, -1.0,
|
||||
"Threshold at which classify_adapted_pruning_factor starts",
|
||||
this->params()),
|
||||
INT_MEMBER(classify_adapt_proto_threshold, 230,
|
||||
"Threshold for good protos during adaptive 0-255",
|
||||
this->params()),
|
||||
INT_MEMBER(classify_adapt_feature_threshold, 230,
|
||||
"Threshold for good features during adaptive 0-255",
|
||||
this->params()),
|
||||
BOOL_MEMBER(disable_character_fragments, TRUE,
|
||||
"Do not include character fragments in the"
|
||||
" results of the classifier",
|
||||
this->params()),
|
||||
double_MEMBER(matcher_avg_noise_size, 12.0, "Avg. noise blob length",
|
||||
double_MEMBER(classify_character_fragments_garbage_certainty_threshold,
|
||||
-3.0,
|
||||
"Exclude fragments that do not look like whole"
|
||||
" characters from training and adaption",
|
||||
this->params()),
|
||||
BOOL_MEMBER(classify_debug_character_fragments, FALSE,
|
||||
"Bring up graphical debugging windows for fragments training",
|
||||
this->params()),
|
||||
INT_MEMBER(matcher_permanent_classes_min, 1, "Min # of permanent classes",
|
||||
this->params()),
|
||||
INT_MEMBER(matcher_min_examples_for_prototyping, 3,
|
||||
"Reliable Config Threshold", this->params()),
|
||||
INT_MEMBER(matcher_sufficient_examples_for_prototyping, 5,
|
||||
"Enable adaption even if the ambiguities have not been seen",
|
||||
this->params()),
|
||||
double_MEMBER(matcher_clustering_max_angle_delta, 0.015,
|
||||
"Maximum angle delta for prototype clustering",
|
||||
BOOL_MEMBER(matcher_debug_separate_windows, FALSE,
|
||||
"Use two different windows for debugging the matching: "
|
||||
"One for the protos and one for the features.",
|
||||
this->params()),
|
||||
double_MEMBER(classify_misfit_junk_penalty, 0.0,
|
||||
"Penalty to apply when a non-alnum is vertically out of "
|
||||
"its expected textline position",
|
||||
this->params()),
|
||||
double_MEMBER(rating_scale, 1.5, "Rating scaling factor", this->params()),
|
||||
double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
|
||||
this->params()),
|
||||
double_MEMBER(tessedit_class_miss_scale, 0.00390625,
|
||||
"Scale factor for features not used", this->params()),
|
||||
double_MEMBER(classify_adapted_pruning_factor, 2.5,
|
||||
"Prune poor adapted results this much worse than best result",
|
||||
this->params()),
|
||||
double_MEMBER(classify_adapted_pruning_threshold, -1.0,
|
||||
"Threshold at which classify_adapted_pruning_factor starts",
|
||||
this->params()),
|
||||
INT_MEMBER(classify_adapt_proto_threshold, 230,
|
||||
"Threshold for good protos during adaptive 0-255",
|
||||
this->params()),
|
||||
INT_MEMBER(classify_adapt_feature_threshold, 230,
|
||||
"Threshold for good features during adaptive 0-255",
|
||||
this->params()),
|
||||
BOOL_MEMBER(disable_character_fragments, TRUE,
|
||||
"Do not include character fragments in the"
|
||||
" results of the classifier", this->params()),
|
||||
double_MEMBER(classify_character_fragments_garbage_certainty_threshold,
|
||||
-3.0, "Exclude fragments that do not look like whole"
|
||||
" characters from training and adaption", this->params()),
|
||||
BOOL_MEMBER(classify_debug_character_fragments, FALSE,
|
||||
"Bring up graphical debugging windows for fragments training",
|
||||
this->params()),
|
||||
BOOL_MEMBER(matcher_debug_separate_windows, FALSE,
|
||||
"Use two different windows for debugging the matching: "
|
||||
"One for the protos and one for the features.", this->params()),
|
||||
STRING_MEMBER(classify_learn_debug_str, "", "Class str to debug learning",
|
||||
this->params()),
|
||||
INT_MEMBER(classify_class_pruner_threshold, 229,
|
||||
"Class Pruner Threshold 0-255", this->params()),
|
||||
INT_MEMBER(classify_class_pruner_multiplier, 15,
|
||||
"Class Pruner Multiplier 0-255: ", this->params()),
|
||||
INT_MEMBER(classify_cp_cutoff_strength, 7,
|
||||
"Class Pruner CutoffStrength: ", this->params()),
|
||||
INT_MEMBER(classify_integer_matcher_multiplier, 10,
|
||||
"Integer Matcher Multiplier 0-255: ", this->params()),
|
||||
EnableLearning(true),
|
||||
INT_MEMBER(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word",
|
||||
this->params()),
|
||||
BOOL_MEMBER(classify_bln_numeric_mode, 0,
|
||||
"Assume the input is numbers [0-9].", this->params()),
|
||||
double_MEMBER(speckle_large_max_size, 0.30, "Max large speckle size",
|
||||
this->params()),
|
||||
double_MEMBER(speckle_rating_penalty, 10.0,
|
||||
"Penalty to add to worst rating for noise", this->params()),
|
||||
shape_table_(NULL),
|
||||
dict_(this),
|
||||
static_classifier_(NULL) {
|
||||
STRING_MEMBER(classify_learn_debug_str, "", "Class str to debug learning",
|
||||
this->params()),
|
||||
INT_MEMBER(classify_class_pruner_threshold, 229,
|
||||
"Class Pruner Threshold 0-255", this->params()),
|
||||
INT_MEMBER(classify_class_pruner_multiplier, 15,
|
||||
"Class Pruner Multiplier 0-255: ", this->params()),
|
||||
INT_MEMBER(classify_cp_cutoff_strength, 7,
|
||||
"Class Pruner CutoffStrength: ", this->params()),
|
||||
INT_MEMBER(classify_integer_matcher_multiplier, 10,
|
||||
"Integer Matcher Multiplier 0-255: ", this->params()),
|
||||
EnableLearning(true),
|
||||
INT_MEMBER(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word",
|
||||
this->params()),
|
||||
BOOL_MEMBER(classify_bln_numeric_mode, 0,
|
||||
"Assume the input is numbers [0-9].", this->params()),
|
||||
double_MEMBER(speckle_large_max_size, 0.30, "Max large speckle size",
|
||||
this->params()),
|
||||
double_MEMBER(speckle_rating_penalty, 10.0,
|
||||
"Penalty to add to worst rating for noise", this->params()),
|
||||
shape_table_(NULL),
|
||||
dict_(this),
|
||||
static_classifier_(NULL) {
|
||||
fontinfo_table_.set_compare_callback(
|
||||
NewPermanentTessCallback(CompareFontInfo));
|
||||
fontinfo_table_.set_clear_callback(
|
||||
|
@ -374,6 +374,12 @@ class Classify : public CCStruct {
|
||||
// Member variables.
|
||||
|
||||
// Parameters.
|
||||
// Set during training (in lang.config) to indicate whether the divisible
|
||||
// blobs chopper should be used (true for Latin script).
|
||||
BOOL_VAR_H(allow_blob_division, true, "Use divisible blobs chopping");
|
||||
// Set during training (in lang.config) to indicate whether the divisible
|
||||
// blobs chopper should be used in preference to chopping. Set to true for
|
||||
// southern Indic scripts.
|
||||
BOOL_VAR_H(prioritize_division, FALSE,
|
||||
"Prioritize blob division over chopping");
|
||||
INT_VAR_H(tessedit_single_match, FALSE, "Top choice only from CP");
|
||||
|
@ -200,7 +200,7 @@ SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number,
|
||||
if (seam == NULL) {
|
||||
if (repair_unchopped_blobs)
|
||||
restore_outline_tree(blob->outlines);
|
||||
if (word->latin_script) {
|
||||
if (allow_blob_division && !prioritize_division) {
|
||||
// If the blob can simply be divided into outlines, then do that.
|
||||
TPOINT location;
|
||||
if (divisible_blob(blob, italic_blob, &location)) {
|
||||
|
Loading…
Reference in New Issue
Block a user