/////////////////////////////////////////////////////////////////////// // File: wordrec.cpp // Description: wordrec class. // Author: Samuel Charron // // (C) Copyright 2006, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // /////////////////////////////////////////////////////////////////////// #include "wordrec.h" #include "language_model.h" #include "params.h" namespace tesseract { Wordrec::Wordrec() : // control parameters BOOL_MEMBER(merge_fragments_in_matrix, TRUE, "Merge the fragments in the ratings matrix and delete them" " after merging", params()), BOOL_MEMBER(wordrec_no_block, FALSE, "Don't output block information", params()), BOOL_MEMBER(wordrec_enable_assoc, TRUE, "Associator Enable", params()), BOOL_MEMBER(force_word_assoc, FALSE, "force associator to run regardless of what enable_assoc is." "This is used for CJK where component grouping is necessary.", CCUtil::params()), INT_MEMBER(wordrec_num_seg_states, 30, "Segmentation states", CCUtil::params()), double_MEMBER(wordrec_worst_state, 1.0, "Worst segmentation state", params()), BOOL_MEMBER(fragments_guide_chopper, FALSE, "Use information from fragments to guide chopping process", params()), INT_MEMBER(repair_unchopped_blobs, 1, "Fix blobs that aren't chopped", params()), double_MEMBER(tessedit_certainty_threshold, -2.25, "Good blob limit", params()), INT_MEMBER(chop_debug, 0, "Chop debug", params()), BOOL_MEMBER(chop_enable, 1, "Chop enable", params()), BOOL_MEMBER(chop_vertical_creep, 0, "Vertical creep", params()), INT_MEMBER(chop_split_length, 10000, "Split Length", params()), INT_MEMBER(chop_same_distance, 2, "Same distance", params()), INT_MEMBER(chop_min_outline_points, 6, "Min Number of Points on Outline", params()), INT_MEMBER(chop_inside_angle, -50, "Min Inside Angle Bend", params()), INT_MEMBER(chop_min_outline_area, 2000, "Min Outline Area", params()), double_MEMBER(chop_split_dist_knob, 0.5, "Split length adjustment", params()), double_MEMBER(chop_overlap_knob, 0.9, "Split overlap adjustment", params()), double_MEMBER(chop_center_knob, 0.15, "Split center adjustment", params()), double_MEMBER(chop_sharpness_knob, 0.06, "Split sharpness adjustment", params()), double_MEMBER(chop_width_change_knob, 5.0, "Width change adjustment", params()), double_MEMBER(chop_ok_split, 100.0, "OK split limit", params()), double_MEMBER(chop_good_split, 50.0, "Good split limit", params()), INT_MEMBER(chop_x_y_weight, 3, "X / Y length weight", params()), INT_MEMBER(segment_adjust_debug, 0, "Segmentation adjustment debug", params()), BOOL_MEMBER(assume_fixed_pitch_char_segment, FALSE, "include fixed-pitch heuristics in char segmentation", params()), BOOL_MEMBER(use_new_state_cost, FALSE, "use new state cost heuristics for segmentation state evaluation", params()), double_MEMBER(heuristic_segcost_rating_base, 1.25, "base factor for adding segmentation cost into word rating." "It's a multiplying factor, the larger the value above 1, " "the bigger the effect of segmentation cost.", params()), double_MEMBER(heuristic_weight_rating, 1.0, "weight associated with char rating in combined cost of state", params()), double_MEMBER(heuristic_weight_width, 1000.0, "weight associated with width evidence in combined cost of" " state", params()), double_MEMBER(heuristic_weight_seamcut, 0.0, "weight associated with seam cut in combined cost of state", params()), double_MEMBER(heuristic_max_char_wh_ratio, 2.0, "max char width-to-height ratio allowed in segmentation", params()), INT_MEMBER(wordrec_debug_level, 0, "Debug level for wordrec", params()), BOOL_MEMBER(wordrec_debug_blamer, false, "Print blamer debug messages", params()), BOOL_MEMBER(wordrec_run_blamer, false, "Try to set the blame for errors", params()), BOOL_MEMBER(enable_new_segsearch, true, "Enable new segmentation search path.", params()), INT_MEMBER(segsearch_debug_level, 0, "SegSearch debug level", params()), INT_MEMBER(segsearch_max_pain_points, 2000, "Maximum number of pain points stored in the queue", params()), INT_MEMBER(segsearch_max_futile_classifications, 10, "Maximum number of pain point classifications per word that" "did not result in finding a better word choice.", params()), double_MEMBER(segsearch_max_char_wh_ratio, 2.0, "Maximum character width-to-height ratio", params()), double_MEMBER(segsearch_max_fixed_pitch_char_wh_ratio, 2.0, "Maximum character width-to-height ratio for" " fixed-pitch fonts", params()), BOOL_MEMBER(save_alt_choices, false, "Save alternative paths found during chopping" " and segmentation search", params()) { prev_word_best_choice_ = NULL; language_model_ = new LanguageModel(&get_fontinfo_table(), &(getDict())); pass2_seg_states = 0; num_joints = 0; num_pushed = 0; num_popped = 0; fill_lattice_ = NULL; } Wordrec::~Wordrec() { delete language_model_; } void Wordrec::CopyCharChoices(const BLOB_CHOICE_LIST_VECTOR &from, BLOB_CHOICE_LIST_VECTOR *to) { to->delete_data_pointers(); to->clear(); for (int i = 0; i < from.size(); ++i) { BLOB_CHOICE_LIST *cc_list = new BLOB_CHOICE_LIST(); cc_list->deep_copy(from[i], &BLOB_CHOICE::deep_copy); to->push_back(cc_list); } } bool Wordrec::ChoiceIsCorrect(const UNICHARSET &uni_set, const WERD_CHOICE *choice, const GenericVector &truth_text) { if (choice == NULL) return false; int i; STRING truth_str; for (i = 0; i < truth_text.length(); ++i) truth_str += truth_text[i]; STRING normed_choice_str; for (i = 0; i < choice->length(); ++i) { normed_choice_str += uni_set.get_normed_unichar(choice->unichar_id(i)); } return (truth_str == normed_choice_str); } void Wordrec::SaveAltChoices(const LIST &best_choices, WERD_RES *word) { ASSERT_HOST(word->alt_choices.empty()); ASSERT_HOST(word->alt_states.empty()); LIST list_it; iterate_list(list_it, best_choices) { VIABLE_CHOICE choice = reinterpret_cast(first_node(list_it)); CHAR_CHOICE *char_choice = &(choice->Blob[0]); WERD_CHOICE *alt_choice = new WERD_CHOICE(word->uch_set, choice->Length); word->alt_states.push_back(GenericVector(choice->Length)); GenericVector &alt_state = word->alt_states.back(); for (int i = 0; i < choice->Length; char_choice++, i++) { alt_choice->append_unichar_id_space_allocated( char_choice->Class, 1, 0, 0); alt_state.push_back(choice->segmentation_state[i]); } alt_choice->set_rating(choice->Rating); alt_choice->set_certainty(choice->Certainty); word->alt_choices.push_back(alt_choice); alt_choice->populate_unichars(); if (wordrec_debug_level > 0) { tprintf("SaveAltChoices: %s %g\n", alt_choice->unichar_string().string(), alt_choice->rating()); } } } } // namespace tesseract