/////////////////////////////////////////////////////////////////////// // File: wordrec.h // Description: wordrec class. // Author: Samuel Charron // // (C) Copyright 2006, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // /////////////////////////////////////////////////////////////////////// #ifndef TESSERACT_WORDREC_WORDREC_H_ #define TESSERACT_WORDREC_WORDREC_H_ #include "associate.h" #include "classify.h" #include "dict.h" #include "language_model.h" #include "ratngs.h" #include "matrix.h" #include "gradechop.h" #include "seam.h" #include "findseam.h" #include "callcpp.h" class WERD_RES; namespace tesseract { // A class for storing which nodes are to be processed by the segmentation // search. There is a single SegSearchPending for each column in the ratings // matrix, and it indicates whether the segsearch should combine all // BLOB_CHOICES in the column, or just the given row with the parents // corresponding to *this SegSearchPending, and whether only updated parent // ViterbiStateEntries should be combined, or all, with the BLOB_CHOICEs. class SegSearchPending { public: SegSearchPending() : classified_row_(-1), revisit_whole_column_(false), column_classified_(false) {} // Marks the whole column as just classified. Used to start a search on // a newly initialized ratings matrix. void SetColumnClassified() { column_classified_ = true; } // Marks the matrix entry at the given row as just classified. // Used after classifying a new matrix cell. // Additional to, not overriding a previous RevisitWholeColumn. void SetBlobClassified(int row) { classified_row_ = row; } // Marks the whole column as needing work, but not just classified. // Used when the parent vse list is updated. // Additional to, not overriding a previous SetBlobClassified. void RevisitWholeColumn() { revisit_whole_column_ = true; } // Clears *this to indicate no work to do. void Clear() { classified_row_ = -1; revisit_whole_column_ = false; column_classified_ = false; } // Returns true if there are updates to do in the column that *this // represents. bool WorkToDo() const { return revisit_whole_column_ || column_classified_ || classified_row_ >= 0; } // Returns true if the given row was just classified. bool IsRowJustClassified(int row) const { return row == classified_row_ || column_classified_; } // Returns the single row to process if there is only one, otherwise -1. int SingleRow() const { return revisit_whole_column_ || column_classified_ ? -1 : classified_row_; } private: // If non-negative, indicates the single row in the ratings matrix that has // just been classified, and so should be combined with all the parents in the // column that this SegSearchPending represents. // Operates independently of revisit_whole_column. int classified_row_; // If revisit_whole_column is true, then all BLOB_CHOICEs in this column will // be processed, but classified_row can indicate a row that is newly // classified. Overridden if column_classified is true. bool revisit_whole_column_; // If column_classified is true, parent vses are processed with all rows // regardless of whether they are just updated, overriding // revisit_whole_column and classified_row. bool column_classified_; }; /* ccmain/tstruct.cpp *********************************************************/ class FRAGMENT:public ELIST_LINK { public: FRAGMENT() { //constructor } FRAGMENT(EDGEPT *head_pt, //start EDGEPT *tail_pt); //end ICOORD head; //coords of start ICOORD tail; //coords of end EDGEPT *headpt; //start point EDGEPT *tailpt; //end point }; ELISTIZEH(FRAGMENT) class Wordrec : public Classify { public: // config parameters ******************************************************* BOOL_VAR_H(merge_fragments_in_matrix, TRUE, "Merge the fragments in the ratings matrix and delete them " "after merging"); BOOL_VAR_H(wordrec_no_block, FALSE, "Don't output block information"); BOOL_VAR_H(wordrec_enable_assoc, TRUE, "Associator Enable"); BOOL_VAR_H(force_word_assoc, FALSE, "force associator to run regardless of what enable_assoc is." "This is used for CJK where component grouping is necessary."); double_VAR_H(wordrec_worst_state, 1, "Worst segmentation state"); BOOL_VAR_H(fragments_guide_chopper, FALSE, "Use information from fragments to guide chopping process"); INT_VAR_H(repair_unchopped_blobs, 1, "Fix blobs that aren't chopped"); double_VAR_H(tessedit_certainty_threshold, -2.25, "Good blob limit"); INT_VAR_H(chop_debug, 0, "Chop debug"); BOOL_VAR_H(chop_enable, 1, "Chop enable"); BOOL_VAR_H(chop_vertical_creep, 0, "Vertical creep"); INT_VAR_H(chop_split_length, 10000, "Split Length"); INT_VAR_H(chop_same_distance, 2, "Same distance"); INT_VAR_H(chop_min_outline_points, 6, "Min Number of Points on Outline"); INT_VAR_H(chop_seam_pile_size, 150, "Max number of seams in seam_pile"); BOOL_VAR_H(chop_new_seam_pile, 1, "Use new seam_pile"); INT_VAR_H(chop_inside_angle, -50, "Min Inside Angle Bend"); INT_VAR_H(chop_min_outline_area, 2000, "Min Outline Area"); double_VAR_H(chop_split_dist_knob, 0.5, "Split length adjustment"); double_VAR_H(chop_overlap_knob, 0.9, "Split overlap adjustment"); double_VAR_H(chop_center_knob, 0.15, "Split center adjustment"); INT_VAR_H(chop_centered_maxwidth, 90, "Width of (smaller) chopped blobs " "above which we don't care that a chop is not near the center."); double_VAR_H(chop_sharpness_knob, 0.06, "Split sharpness adjustment"); double_VAR_H(chop_width_change_knob, 5.0, "Width change adjustment"); double_VAR_H(chop_ok_split, 100.0, "OK split limit"); double_VAR_H(chop_good_split, 50.0, "Good split limit"); INT_VAR_H(chop_x_y_weight, 3, "X / Y length weight"); INT_VAR_H(segment_adjust_debug, 0, "Segmentation adjustment debug"); BOOL_VAR_H(assume_fixed_pitch_char_segment, FALSE, "include fixed-pitch heuristics in char segmentation"); INT_VAR_H(wordrec_debug_level, 0, "Debug level for wordrec"); INT_VAR_H(wordrec_max_join_chunks, 4, "Max number of broken pieces to associate"); BOOL_VAR_H(wordrec_skip_no_truth_words, false, "Only run OCR for words that had truth recorded in BlamerBundle"); BOOL_VAR_H(wordrec_debug_blamer, false, "Print blamer debug messages"); BOOL_VAR_H(wordrec_run_blamer, false, "Try to set the blame for errors"); INT_VAR_H(segsearch_debug_level, 0, "SegSearch debug level"); INT_VAR_H(segsearch_max_pain_points, 2000, "Maximum number of pain points stored in the queue"); INT_VAR_H(segsearch_max_futile_classifications, 10, "Maximum number of pain point classifications per word."); double_VAR_H(segsearch_max_char_wh_ratio, 2.0, "Maximum character width-to-height ratio"); BOOL_VAR_H(save_alt_choices, true, "Save alternative paths found during chopping " "and segmentation search"); // methods from wordrec/*.cpp *********************************************** Wordrec(); virtual ~Wordrec(); // Fills word->alt_choices with alternative paths found during // chopping/segmentation search that are kept in best_choices. void SaveAltChoices(const LIST &best_choices, WERD_RES *word); // Fills character choice lattice in the given BlamerBundle // using the given ratings matrix and best choice list. void FillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle); // Calls fill_lattice_ member function // (assumes that fill_lattice_ is not nullptr). void CallFillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) { (this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle); } // tface.cpp void program_editup(const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict); void cc_recog(WERD_RES *word); void program_editdown(int32_t elasped_time); void set_pass1(); void set_pass2(); int end_recog(); BLOB_CHOICE_LIST *call_matcher(TBLOB* blob); int dict_word(const WERD_CHOICE &word); // wordclass.cpp BLOB_CHOICE_LIST *classify_blob(TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle); // segsearch.cpp // SegSearch works on the lower diagonal matrix of BLOB_CHOICE_LISTs. // Each entry in the matrix represents the classification choice // for a chunk, i.e. an entry in row 2, column 1 represents the list // of ratings for the chunks 1 and 2 classified as a single blob. // The entries on the diagonal of the matrix are classifier choice lists // for a single chunk from the maximal segmentation. // // The ratings matrix given to SegSearch represents the segmentation // graph / trellis for the current word. The nodes in the graph are the // individual BLOB_CHOICEs in each of the BLOB_CHOICE_LISTs in the ratings // matrix. The children of each node (nodes connected by outgoing links) // are the entries in the column that is equal to node's row+1. The parents // (nodes connected by the incoming links) are the entries in the row that // is equal to the node's column-1. Here is an example ratings matrix: // // 0 1 2 3 4 // ------------------------- // 0| c,( | // 1| d l,1 | // 2| o | // 3| c,( | // 4| g,y l,1 | // ------------------------- // // In the example above node "o" has children (outgoing connection to nodes) // "c","(","g","y" and parents (incoming connections from nodes) "l","1","d". // // The objective of the search is to find the least cost path, where the cost // is determined by the language model components and the properties of the // cut between the blobs on the path. SegSearch starts by populating the // matrix with the all the entries that were classified by the chopper and // finding the initial best path. Based on the classifier ratings, language // model scores and the properties of each cut, a list of "pain points" is // constructed - those are the points on the path where the choices do not // look consistent with the neighboring choices, the cuts look particularly // problematic, or the certainties of the blobs are low. The most troublesome // "pain point" is picked from the list and the new entry in the ratings // matrix corresponding to this "pain point" is filled in. Then the language // model state is updated to reflect the new classification and the new // "pain points" are added to the list and the next most troublesome // "pain point" is determined. This continues until either the word choice // composed from the best paths in the segmentation graph is "good enough" // (e.g. above a certain certainty threshold, is an unambiguous dictionary // word, etc) or there are no more "pain points" to explore. // // If associate_blobs is set to false no new classifications will be done // to combine blobs. Segmentation search will run only one "iteration" // on the classifications already recorded in chunks_record.ratings. // // Note: this function assumes that word_res, best_choice_bundle arguments // are not nullptr. void SegSearch(WERD_RES* word_res, BestChoiceBundle* best_choice_bundle, BlamerBundle* blamer_bundle); // Setup and run just the initial segsearch on an established matrix, // without doing any additional chopping or joining. // (Internal factored version that can be used as part of the main SegSearch.) void InitialSegSearch(WERD_RES* word_res, LMPainPoints* pain_points, GenericVector* pending, BestChoiceBundle* best_choice_bundle, BlamerBundle* blamer_bundle); // Runs SegSearch() function (above) without needing a best_choice_bundle // or blamer_bundle. Used for testing. void DoSegSearch(WERD_RES* word_res); // chop.cpp PRIORITY point_priority(EDGEPT *point); void add_point_to_list(PointHeap* point_heap, EDGEPT *point); // Returns true if the edgept supplied as input is an inside angle. This // is determined by the angular change of the vectors from point to point. bool is_inside_angle(EDGEPT *pt); int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3); EDGEPT *pick_close_point(EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist); void prioritize_points(TESSLINE *outline, PointHeap* points); void new_min_point(EDGEPT *local_min, PointHeap* points); void new_max_point(EDGEPT *local_max, PointHeap* points); void vertical_projection_point(EDGEPT *split_point, EDGEPT *target_point, EDGEPT** best_point, EDGEPT_CLIST *new_points); // chopper.cpp SEAM *attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, const GenericVector& seams); SEAM *chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob, const GenericVector& seams); SEAM *chop_overlapping_blob(const GenericVector& boxes, bool italic_blob, WERD_RES *word_res, int *blob_number); SEAM *improve_one_blob(const GenericVector &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number); SEAM *chop_one_blob(const GenericVector &boxes, const GenericVector &blob_choices, WERD_RES *word_res, int *blob_number); void chop_word_main(WERD_RES *word); void improve_by_chopping(float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector* pending); int select_blob_to_split(const GenericVector &blob_choices, float rating_ceiling, bool split_next_to_fragment); int select_blob_to_split_from_fixpt(DANGERR *fixpt); // findseam.cpp void add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue* seams); void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile); void combine_seam(const SeamPile& seam_pile, const SEAM* seam, SeamQueue* seam_queue); SEAM *pick_good_seam(TBLOB *blob); void try_point_pairs (EDGEPT * points[MAX_NUM_POINTS], int16_t num_points, SeamQueue* seam_queue, SeamPile* seam_pile, SEAM ** seam, TBLOB * blob); void try_vertical_splits(EDGEPT * points[MAX_NUM_POINTS], int16_t num_points, EDGEPT_CLIST *new_points, SeamQueue* seam_queue, SeamPile* seam_pile, SEAM ** seam, TBLOB * blob); // gradechop.cpp PRIORITY grade_split_length(register SPLIT *split); PRIORITY grade_sharpness(register SPLIT *split); // outlines.cpp bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt); // pieces.cpp virtual BLOB_CHOICE_LIST *classify_piece(const GenericVector& seams, int16_t start, int16_t end, const char* description, TWERD *word, BlamerBundle *blamer_bundle); // Try to merge fragments in the ratings matrix and put the result in // the corresponding row and column void merge_fragments(MATRIX *ratings, int16_t num_blobs); // Recursively go through the ratings matrix to find lists of fragments // to be merged in the function merge_and_put_fragment_lists. // current_frag is the position of the piece we are looking for. // current_row is the row in the rating matrix we are currently at. // start is the row we started initially, so that we can know where // to append the results to the matrix. num_frag_parts is the total // number of pieces we are looking for and num_blobs is the size of the // ratings matrix. void get_fragment_lists(int16_t current_frag, int16_t current_row, int16_t start, int16_t num_frag_parts, int16_t num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists); // Merge the fragment lists in choice_lists and append it to the // ratings matrix void merge_and_put_fragment_lists(int16_t row, int16_t column, int16_t num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings); // Filter the fragment list so that the filtered_choices only contain // fragments that are in the correct position. choices is the list // that we are going to filter. fragment_pos is the position in the // fragment that we are looking for and num_frag_parts is the the // total number of pieces. The result will be appended to // filtered_choices. void fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices); // Member variables. LanguageModel *language_model_; PRIORITY pass2_ok_split; // Stores the best choice for the previous word in the paragraph. // This variable is modified by PAGE_RES_IT when iterating over // words to OCR on the page. WERD_CHOICE *prev_word_best_choice_; // Sums of blame reasons computed by the blamer. GenericVector blame_reasons_; // Function used to fill char choice lattices. void (Wordrec::*fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle); protected: inline bool SegSearchDone(int num_futile_classifications) { return (language_model_->AcceptableChoiceFound() || num_futile_classifications >= segsearch_max_futile_classifications); } // Updates the language model state recorded for the child entries specified // in pending[starting_col]. Enqueues the children of the updated entries // into pending and proceeds to update (and remove from pending) all the // remaining entries in pending[col] (col >= starting_col). Upon termination // of this function all the pending[col] lists will be empty. // // The arguments: // // starting_col: index of the column in chunks_record->ratings from // which the update should be started // // pending: list of entries listing chunks_record->ratings entries // that should be updated // // pain_points: priority heap listing the pain points generated by // the language model // // temp_pain_points: temporary storage for tentative pain points generated // by the language model after a single call to LanguageModel::UpdateState() // (the argument is passed in rather than created before each // LanguageModel::UpdateState() call to avoid dynamic memory re-allocation) // // best_choice_bundle: a collection of variables that should be updated // if a new best choice is found // void UpdateSegSearchNodes( float rating_cert_scale, int starting_col, GenericVector* pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle); // Process the given pain point: classify the corresponding blob, enqueue // new pain points to join the newly classified blob with its neighbors. void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, const char* pain_point_type, GenericVector* pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle); // Resets enough of the results so that the Viterbi search is re-run. // Needed when the n-gram model is enabled, as the multi-length comparison // implementation will re-value existing paths to worse values. void ResetNGramSearch(WERD_RES* word_res, BestChoiceBundle* best_choice_bundle, GenericVector* pending); // Add pain points for classifying blobs on the correct segmentation path // (so that we can evaluate correct segmentation path and discover the reason // for incorrect result). void InitBlamerForSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug); }; } // namespace tesseract #endif // TESSERACT_WORDREC_WORDREC_H_