diff --git a/classify/classify.cpp b/classify/classify.cpp index 22b59b405..be6ad8bc4 100644 --- a/classify/classify.cpp +++ b/classify/classify.cpp @@ -33,127 +33,135 @@ namespace tesseract { Classify::Classify() - : BOOL_MEMBER(prioritize_division, FALSE, - "Prioritize blob division over chopping", this->params()), - INT_MEMBER(tessedit_single_match, FALSE, - "Top choice only from CP", this->params()), - BOOL_MEMBER(classify_enable_learning, true, - "Enable adaptive classifier", this->params()), - INT_MEMBER(classify_debug_level, 0, "Classify debug level", - this->params()), - INT_MEMBER(classify_norm_method, character, "Normalization Method ...", - this->params()), - double_MEMBER(classify_char_norm_range, 0.2, - "Character Normalization Range ...", this->params()), - double_MEMBER(classify_min_norm_scale_x, 0.0, "Min char x-norm scale ...", - this->params()), /* PREV DEFAULT 0.1 */ - double_MEMBER(classify_max_norm_scale_x, 0.325, "Max char x-norm scale ...", - this->params()), /* PREV DEFAULT 0.3 */ - double_MEMBER(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ...", - this->params()), /* PREV DEFAULT 0.1 */ - double_MEMBER(classify_max_norm_scale_y, 0.325, "Max char y-norm scale ...", - this->params()), /* PREV DEFAULT 0.3 */ - double_MEMBER(classify_max_rating_ratio, 1.5, - "Veto ratio between classifier ratings", this->params()), - double_MEMBER(classify_max_certainty_margin, 5.5, - "Veto difference between classifier certainties", + : BOOL_MEMBER(allow_blob_division, true, "Use divisible blobs chopping", this->params()), - BOOL_MEMBER(tess_cn_matching, 0, "Character Normalized Matching", - this->params()), - BOOL_MEMBER(tess_bn_matching, 0, "Baseline Normalized Matching", - this->params()), - BOOL_MEMBER(classify_enable_adaptive_matcher, 1, - "Enable adaptive classifier", - this->params()), - BOOL_MEMBER(classify_use_pre_adapted_templates, 0, - "Use pre-adapted classifier templates", this->params()), - BOOL_MEMBER(classify_save_adapted_templates, 0, - "Save adapted templates to a file", this->params()), - BOOL_MEMBER(classify_enable_adaptive_debugger, 0, "Enable match debugger", - this->params()), - BOOL_MEMBER(classify_nonlinear_norm, 0, - "Non-linear stroke-density normalization", this->params()), - INT_MEMBER(matcher_debug_level, 0, "Matcher Debug Level", this->params()), - INT_MEMBER(matcher_debug_flags, 0, "Matcher Debug Flags", this->params()), - INT_MEMBER(classify_learning_debug_level, 0, "Learning Debug Level: ", - this->params()), - double_MEMBER(matcher_good_threshold, 0.125, "Good Match (0-1)", + BOOL_MEMBER(prioritize_division, FALSE, + "Prioritize blob division over chopping", this->params()), + INT_MEMBER(tessedit_single_match, FALSE, "Top choice only from CP", + this->params()), + BOOL_MEMBER(classify_enable_learning, true, "Enable adaptive classifier", this->params()), - double_MEMBER(matcher_great_threshold, 0.0, "Great Match (0-1)", + INT_MEMBER(classify_debug_level, 0, "Classify debug level", + this->params()), + INT_MEMBER(classify_norm_method, character, "Normalization Method ...", + this->params()), + double_MEMBER(classify_char_norm_range, 0.2, + "Character Normalization Range ...", this->params()), + double_MEMBER(classify_min_norm_scale_x, 0.0, "Min char x-norm scale ...", + this->params()), /* PREV DEFAULT 0.1 */ + double_MEMBER(classify_max_norm_scale_x, 0.325, + "Max char x-norm scale ...", + this->params()), /* PREV DEFAULT 0.3 */ + double_MEMBER(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ...", + this->params()), /* PREV DEFAULT 0.1 */ + double_MEMBER(classify_max_norm_scale_y, 0.325, + "Max char y-norm scale ...", + this->params()), /* PREV DEFAULT 0.3 */ + double_MEMBER(classify_max_rating_ratio, 1.5, + "Veto ratio between classifier ratings", this->params()), + double_MEMBER(classify_max_certainty_margin, 5.5, + "Veto difference between classifier certainties", + this->params()), + BOOL_MEMBER(tess_cn_matching, 0, "Character Normalized Matching", this->params()), - double_MEMBER(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)", + BOOL_MEMBER(tess_bn_matching, 0, "Baseline Normalized Matching", this->params()), - double_MEMBER(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)", + BOOL_MEMBER(classify_enable_adaptive_matcher, 1, + "Enable adaptive classifier", this->params()), + BOOL_MEMBER(classify_use_pre_adapted_templates, 0, + "Use pre-adapted classifier templates", this->params()), + BOOL_MEMBER(classify_save_adapted_templates, 0, + "Save adapted templates to a file", this->params()), + BOOL_MEMBER(classify_enable_adaptive_debugger, 0, "Enable match debugger", this->params()), - double_MEMBER(matcher_rating_margin, 0.1, "New template margin (0-1)", + BOOL_MEMBER(classify_nonlinear_norm, 0, + "Non-linear stroke-density normalization", this->params()), + INT_MEMBER(matcher_debug_level, 0, "Matcher Debug Level", this->params()), + INT_MEMBER(matcher_debug_flags, 0, "Matcher Debug Flags", this->params()), + INT_MEMBER(classify_learning_debug_level, 0, "Learning Debug Level: ", + this->params()), + double_MEMBER(matcher_good_threshold, 0.125, "Good Match (0-1)", + this->params()), + double_MEMBER(matcher_reliable_adaptive_result, 0.0, "Great Match (0-1)", + this->params()), + double_MEMBER(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)", + this->params()), + double_MEMBER(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)", + this->params()), + double_MEMBER(matcher_rating_margin, 0.1, "New template margin (0-1)", + this->params()), + double_MEMBER(matcher_avg_noise_size, 12.0, "Avg. noise blob length", + this->params()), + INT_MEMBER(matcher_permanent_classes_min, 1, "Min # of permanent classes", + this->params()), + INT_MEMBER(matcher_min_examples_for_prototyping, 3, + "Reliable Config Threshold", this->params()), + INT_MEMBER(matcher_sufficient_examples_for_prototyping, 5, + "Enable adaption even if the ambiguities have not been seen", + this->params()), + double_MEMBER(matcher_clustering_max_angle_delta, 0.015, + "Maximum angle delta for prototype clustering", + this->params()), + double_MEMBER(classify_misfit_junk_penalty, 0.0, + "Penalty to apply when a non-alnum is vertically out of " + "its expected textline position", + this->params()), + double_MEMBER(rating_scale, 1.5, "Rating scaling factor", this->params()), + double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor", + this->params()), + double_MEMBER(tessedit_class_miss_scale, 0.00390625, + "Scale factor for features not used", this->params()), + double_MEMBER( + classify_adapted_pruning_factor, 2.5, + "Prune poor adapted results this much worse than best result", + this->params()), + double_MEMBER(classify_adapted_pruning_threshold, -1.0, + "Threshold at which classify_adapted_pruning_factor starts", + this->params()), + INT_MEMBER(classify_adapt_proto_threshold, 230, + "Threshold for good protos during adaptive 0-255", + this->params()), + INT_MEMBER(classify_adapt_feature_threshold, 230, + "Threshold for good features during adaptive 0-255", + this->params()), + BOOL_MEMBER(disable_character_fragments, TRUE, + "Do not include character fragments in the" + " results of the classifier", this->params()), - double_MEMBER(matcher_avg_noise_size, 12.0, "Avg. noise blob length", + double_MEMBER(classify_character_fragments_garbage_certainty_threshold, + -3.0, + "Exclude fragments that do not look like whole" + " characters from training and adaption", + this->params()), + BOOL_MEMBER(classify_debug_character_fragments, FALSE, + "Bring up graphical debugging windows for fragments training", this->params()), - INT_MEMBER(matcher_permanent_classes_min, 1, "Min # of permanent classes", - this->params()), - INT_MEMBER(matcher_min_examples_for_prototyping, 3, - "Reliable Config Threshold", this->params()), - INT_MEMBER(matcher_sufficient_examples_for_prototyping, 5, - "Enable adaption even if the ambiguities have not been seen", - this->params()), - double_MEMBER(matcher_clustering_max_angle_delta, 0.015, - "Maximum angle delta for prototype clustering", + BOOL_MEMBER(matcher_debug_separate_windows, FALSE, + "Use two different windows for debugging the matching: " + "One for the protos and one for the features.", this->params()), - double_MEMBER(classify_misfit_junk_penalty, 0.0, - "Penalty to apply when a non-alnum is vertically out of " - "its expected textline position", - this->params()), - double_MEMBER(rating_scale, 1.5, "Rating scaling factor", this->params()), - double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor", - this->params()), - double_MEMBER(tessedit_class_miss_scale, 0.00390625, - "Scale factor for features not used", this->params()), - double_MEMBER(classify_adapted_pruning_factor, 2.5, - "Prune poor adapted results this much worse than best result", - this->params()), - double_MEMBER(classify_adapted_pruning_threshold, -1.0, - "Threshold at which classify_adapted_pruning_factor starts", - this->params()), - INT_MEMBER(classify_adapt_proto_threshold, 230, - "Threshold for good protos during adaptive 0-255", - this->params()), - INT_MEMBER(classify_adapt_feature_threshold, 230, - "Threshold for good features during adaptive 0-255", - this->params()), - BOOL_MEMBER(disable_character_fragments, TRUE, - "Do not include character fragments in the" - " results of the classifier", this->params()), - double_MEMBER(classify_character_fragments_garbage_certainty_threshold, - -3.0, "Exclude fragments that do not look like whole" - " characters from training and adaption", this->params()), - BOOL_MEMBER(classify_debug_character_fragments, FALSE, - "Bring up graphical debugging windows for fragments training", - this->params()), - BOOL_MEMBER(matcher_debug_separate_windows, FALSE, - "Use two different windows for debugging the matching: " - "One for the protos and one for the features.", this->params()), - STRING_MEMBER(classify_learn_debug_str, "", "Class str to debug learning", - this->params()), - INT_MEMBER(classify_class_pruner_threshold, 229, - "Class Pruner Threshold 0-255", this->params()), - INT_MEMBER(classify_class_pruner_multiplier, 15, - "Class Pruner Multiplier 0-255: ", this->params()), - INT_MEMBER(classify_cp_cutoff_strength, 7, - "Class Pruner CutoffStrength: ", this->params()), - INT_MEMBER(classify_integer_matcher_multiplier, 10, - "Integer Matcher Multiplier 0-255: ", this->params()), - EnableLearning(true), - INT_MEMBER(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word", - this->params()), - BOOL_MEMBER(classify_bln_numeric_mode, 0, - "Assume the input is numbers [0-9].", this->params()), - double_MEMBER(speckle_large_max_size, 0.30, "Max large speckle size", - this->params()), - double_MEMBER(speckle_rating_penalty, 10.0, - "Penalty to add to worst rating for noise", this->params()), - shape_table_(NULL), - dict_(this), - static_classifier_(NULL) { + STRING_MEMBER(classify_learn_debug_str, "", "Class str to debug learning", + this->params()), + INT_MEMBER(classify_class_pruner_threshold, 229, + "Class Pruner Threshold 0-255", this->params()), + INT_MEMBER(classify_class_pruner_multiplier, 15, + "Class Pruner Multiplier 0-255: ", this->params()), + INT_MEMBER(classify_cp_cutoff_strength, 7, + "Class Pruner CutoffStrength: ", this->params()), + INT_MEMBER(classify_integer_matcher_multiplier, 10, + "Integer Matcher Multiplier 0-255: ", this->params()), + EnableLearning(true), + INT_MEMBER(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word", + this->params()), + BOOL_MEMBER(classify_bln_numeric_mode, 0, + "Assume the input is numbers [0-9].", this->params()), + double_MEMBER(speckle_large_max_size, 0.30, "Max large speckle size", + this->params()), + double_MEMBER(speckle_rating_penalty, 10.0, + "Penalty to add to worst rating for noise", this->params()), + shape_table_(NULL), + dict_(this), + static_classifier_(NULL) { fontinfo_table_.set_compare_callback( NewPermanentTessCallback(CompareFontInfo)); fontinfo_table_.set_clear_callback( diff --git a/classify/classify.h b/classify/classify.h index 92333cb36..60db4a9f3 100644 --- a/classify/classify.h +++ b/classify/classify.h @@ -374,6 +374,12 @@ class Classify : public CCStruct { // Member variables. // Parameters. + // Set during training (in lang.config) to indicate whether the divisible + // blobs chopper should be used (true for latin script.) + BOOL_VAR_H(allow_blob_division, true, "Use divisible blobs chopping"); + // Set during training (in lang.config) to indicate whether the divisible + // blobs chopper should be used in preference to chopping. Set to true for + // southern Indic scripts. BOOL_VAR_H(prioritize_division, FALSE, "Prioritize blob division over chopping"); INT_VAR_H(tessedit_single_match, FALSE, "Top choice only from CP"); diff --git a/wordrec/chopper.cpp b/wordrec/chopper.cpp index c1a57fcd2..69a458bc2 100644 --- a/wordrec/chopper.cpp +++ b/wordrec/chopper.cpp @@ -200,7 +200,7 @@ SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number, if (seam == NULL) { if (repair_unchopped_blobs) restore_outline_tree(blob->outlines); - if (word->latin_script) { + if (allow_blob_division && !prioritize_division) { // If the blob can simply be divided into outlines, then do that. TPOINT location; if (divisible_blob(blob, italic_blob, &location)) {