/********************************************************************** * File: pageres.cpp (Formerly page_res.c) * Description: Results classes used by control.c * Author: Phil Cheatle * Created: Tue Sep 22 08:42:49 BST 1992 * * (C) Copyright 1992, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at ** http://www.apache.org/licenses/LICENSE-2.0 ** Unless required by applicable law or agreed to in writing, software ** distributed under the License is distributed on an "AS IS" BASIS, ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** See the License for the specific language governing permissions and ** limitations under the License. * **********************************************************************/ #include "mfcpch.h" #include #ifdef __UNIX__ #include #endif #include "pageres.h" #include "blobs.h" const char kBlameCorrect[] = "corr"; const char kBlameClassifier[] = "cl"; const char kBlameChopper[] = "chop"; const char kBlameClassLMTradeoff[] = "cl/LM"; const char kBlamePageLayout[] = "pglt"; const char kBlameSegsearchHeur[] = "ss_heur"; const char kBlameSegsearchPP[] = "ss_pp"; const char kBlameClassOldLMTradeoff[] = "cl/old_LM"; const char kBlameAdaption[] = "adapt"; const char kBlameNoTruthSplit[] = "no_tr_spl"; const char kBlameNoTruth[] = "no_tr"; const char kBlameUnknown[] = "unkn"; const char * const kIncorrectResultReasonNames[] = { kBlameCorrect, kBlameClassifier, kBlameChopper, kBlameClassLMTradeoff, kBlamePageLayout, kBlameSegsearchHeur, kBlameSegsearchPP, kBlameClassOldLMTradeoff, kBlameAdaption, kBlameNoTruthSplit, kBlameNoTruth, kBlameUnknown }; const char *BlamerBundle::IncorrectReasonName(IncorrectResultReason irr) { return kIncorrectResultReasonNames[irr]; } const char *BlamerBundle::IncorrectReason() const { return kIncorrectResultReasonNames[incorrect_result_reason]; } void BlamerBundle::FillDebugString(const STRING &msg, const WERD_CHOICE *choice, STRING *debug) { (*debug) += "Truth "; for (int i = 0; i < this->truth_text.length(); ++i) { (*debug) += this->truth_text[i]; } if (!this->truth_has_char_boxes) (*debug) += " (no char boxes)"; if (choice != NULL) { (*debug) += " Choice "; STRING choice_str; choice->string_and_lengths(&choice_str, NULL); (*debug) += choice_str; } if (msg.length() > 0) { (*debug) += "\n"; (*debug) += msg; } (*debug) += "\n"; } ELISTIZE (BLOCK_RES) CLISTIZE (BLOCK_RES) ELISTIZE (ROW_RES) ELISTIZE (WERD_RES) /************************************************************************* * PAGE_RES::PAGE_RES * * Constructor for page results *************************************************************************/ PAGE_RES::PAGE_RES( BLOCK_LIST *the_block_list, WERD_CHOICE **prev_word_best_choice_ptr) { Init(); BLOCK_IT block_it(the_block_list); BLOCK_RES_IT block_res_it(&block_res_list); for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { block_res_it.add_to_end(new BLOCK_RES(block_it.data())); } prev_word_best_choice = prev_word_best_choice_ptr; } /************************************************************************* * BLOCK_RES::BLOCK_RES * * Constructor for BLOCK results *************************************************************************/ BLOCK_RES::BLOCK_RES(BLOCK *the_block) { ROW_IT row_it (the_block->row_list ()); ROW_RES_IT row_res_it(&row_res_list); char_count = 0; rej_count = 0; font_class = -1; //not assigned x_height = -1.0; font_assigned = FALSE; bold = FALSE; italic = FALSE; row_count = 0; block = the_block; for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { row_res_it.add_to_end(new ROW_RES(row_it.data())); } } /************************************************************************* * ROW_RES::ROW_RES * * Constructor for ROW results *************************************************************************/ ROW_RES::ROW_RES(ROW *the_row) { WERD_IT word_it(the_row->word_list()); WERD_RES_IT word_res_it(&word_res_list); WERD_RES *combo = NULL; // current combination of fuzzies WERD_RES *word_res; // current word WERD *copy_word; char_count = 0; rej_count = 0; whole_word_rej_count = 0; row = the_row; for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) { word_res = new WERD_RES(word_it.data()); word_res->x_height = the_row->x_height(); if (word_res->word->flag(W_FUZZY_NON)) { ASSERT_HOST(combo != NULL); word_res->part_of_combo = TRUE; combo->copy_on(word_res); } if (word_it.data_relative(1)->flag(W_FUZZY_NON)) { if (combo == NULL) { copy_word = new WERD; //deep copy *copy_word = *(word_it.data()); combo = new WERD_RES(copy_word); combo->x_height = the_row->x_height(); combo->combination = TRUE; word_res_it.add_to_end(combo); } word_res->part_of_combo = TRUE; } else { combo = NULL; } word_res_it.add_to_end(word_res); } } WERD_RES& WERD_RES::operator=(const WERD_RES & source) { this->ELIST_LINK::operator=(source); Clear(); if (source.combination) { word = new WERD; *word = *(source.word); // deep copy } else { word = source.word; // pt to same word } if (source.bln_boxes != NULL) bln_boxes = new tesseract::BoxWord(*source.bln_boxes); if (source.chopped_word != NULL) chopped_word = new TWERD(*source.chopped_word); if (source.rebuild_word != NULL) rebuild_word = new TWERD(*source.rebuild_word); // TODO(rays) Do we ever need to copy the seam_array? denorm = source.denorm; if (source.box_word != NULL) box_word = new tesseract::BoxWord(*source.box_word); best_state = source.best_state; correct_text = source.correct_text; if (source.best_choice != NULL) { best_choice = new WERD_CHOICE(*source.best_choice); raw_choice = new WERD_CHOICE(*source.raw_choice); best_choice_fontinfo_ids = source.best_choice_fontinfo_ids; } else { best_choice = NULL; raw_choice = NULL; if (!best_choice_fontinfo_ids.empty()) { best_choice_fontinfo_ids.clear(); } } for (int i = 0; i < source.alt_choices.length(); ++i) { const WERD_CHOICE *choice = source.alt_choices[i]; ASSERT_HOST(choice != NULL); alt_choices.push_back(new WERD_CHOICE(*choice)); } alt_states = source.alt_states; if (source.ep_choice != NULL) { ep_choice = new WERD_CHOICE(*source.ep_choice); } else { ep_choice = NULL; } reject_map = source.reject_map; combination = source.combination; part_of_combo = source.part_of_combo; CopySimpleFields(source); if (source.blamer_bundle != NULL) { blamer_bundle = new BlamerBundle(*(source.blamer_bundle)); } return *this; } // Copies basic fields that don't involve pointers that might be useful // to copy when making one WERD_RES from another. void WERD_RES::CopySimpleFields(const WERD_RES& source) { tess_failed = source.tess_failed; tess_accepted = source.tess_accepted; tess_would_adapt = source.tess_would_adapt; done = source.done; unlv_crunch_mode = source.unlv_crunch_mode; small_caps = source.small_caps; italic = source.italic; bold = source.bold; fontinfo = source.fontinfo; fontinfo2 = source.fontinfo2; fontinfo_id_count = source.fontinfo_id_count; fontinfo_id2_count = source.fontinfo_id2_count; x_height = source.x_height; caps_height = source.caps_height; guessed_x_ht = source.guessed_x_ht; guessed_caps_ht = source.guessed_caps_ht; reject_spaces = source.reject_spaces; uch_set = source.uch_set; tesseract = source.tesseract; } // Initializes a blank (default constructed) WERD_RES from one that has // already been recognized. // Use SetupFor*Recognition afterwards to complete the setup and make // it ready for a retry recognition. void WERD_RES::InitForRetryRecognition(const WERD_RES& source) { word = source.word; CopySimpleFields(source); if (source.blamer_bundle != NULL) { blamer_bundle = new BlamerBundle(); blamer_bundle->CopyTruth(*source.blamer_bundle); } } // Sets up the members used in recognition: // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice. // Returns false if the word is empty and sets up fake results. bool WERD_RES::SetupForTessRecognition(const UNICHARSET& unicharset_in, tesseract::Tesseract* tess, Pix* pix, bool numeric_mode, bool use_body_size, ROW *row, BLOCK* block) { tesseract = tess; POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL; if (word->cblob_list()->empty() || (pb != NULL && !pb->IsText())) { // Empty words occur when all the blobs have been moved to the rej_blobs // list, which seems to occur frequently in junk. SetupFake(unicharset_in); word->set_flag(W_REP_CHAR, false); return false; } ClearResults(); SetupWordScript(unicharset_in); chopped_word = TWERD::PolygonalCopy(word); if (use_body_size && row->body_size() > 0.0f) { chopped_word->SetupBLNormalize(block, row, row->body_size(), numeric_mode, &denorm); } else { chopped_word->SetupBLNormalize(block, row, x_height, numeric_mode, &denorm); } // The image will be 8-bit grey if the input was grey or color. Note that in // a grey image 0 is black and 255 is white. If the input was binary, then // the pix will be binary and 0 is white, with 1 being black. // To tell the difference pixGetDepth() will return 8 or 1. denorm.set_pix(pix); // The inverse flag will be true iff the word has been determined to be white // on black, and is independent of whether the pix is 8 bit or 1 bit. denorm.set_inverse(word->flag(W_INVERSE)); chopped_word->Normalize(denorm); bln_boxes = tesseract::BoxWord::CopyFromNormalized(NULL, chopped_word); seam_array = start_seam_list(chopped_word->blobs); best_choice = new WERD_CHOICE(&unicharset_in); best_choice->make_bad(); raw_choice = new WERD_CHOICE(&unicharset_in); raw_choice->make_bad(); SetupBlamerBundle(); return true; } // Sets up the members used in recognition: // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice. // Returns false if the word is empty and sets up fake results. bool WERD_RES::SetupForCubeRecognition(const UNICHARSET& unicharset_in, tesseract::Tesseract* tess, const BLOCK* block) { tesseract = tess; POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL; if (pb != NULL && !pb->IsText()) { // Ignore words in graphic regions. SetupFake(unicharset_in); word->set_flag(W_REP_CHAR, false); return false; } ClearResults(); SetupWordScript(unicharset_in); TBOX word_box = word->bounding_box(); denorm.SetupNormalization(block, NULL, NULL, NULL, NULL, 0, word_box.left(), word_box.bottom(), 1.0f, 1.0f, 0.0f, 0.0f); SetupBlamerBundle(); return true; } // Sets up the members used in recognition for an empty recognition result: // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice. void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) { ClearResults(); SetupWordScript(unicharset_in); chopped_word = new TWERD; rebuild_word = new TWERD; bln_boxes = new tesseract::BoxWord; box_word = new tesseract::BoxWord; int blob_count = word->cblob_list()->length(); best_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f, TOP_CHOICE_PERM, unicharset_in); raw_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f, TOP_CHOICE_PERM, unicharset_in); if (blob_count > 0) { BLOB_CHOICE** fake_choices = new BLOB_CHOICE*[blob_count]; // For non-text blocks, just pass any blobs through to the box_word // and call the word failed with a fake classification. C_BLOB_IT b_it(word->cblob_list()); int blob_id = 0; for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { TBOX box = b_it.data()->bounding_box(); box_word->InsertBox(box_word->length(), box); fake_choices[blob_id++] = new BLOB_CHOICE(0, 10.0f, -1.0f, -1, -1, -1, 0, 0, false); } FakeClassifyWord(blob_count, fake_choices); delete [] fake_choices; } tess_failed = true; } void WERD_RES::SetupWordScript(const UNICHARSET& uch) { uch_set = &uch; int script = uch.default_sid(); word->set_script_id(script); word->set_flag(W_SCRIPT_HAS_XHEIGHT, uch.script_has_xheight()); word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid()); } // Sets up the blamer_bundle if it is not null, using the initialized denorm. void WERD_RES::SetupBlamerBundle() { if (blamer_bundle != NULL) { blamer_bundle->norm_box_tolerance = kBlamerBoxTolerance * denorm.x_scale(); TPOINT topleft; TPOINT botright; TPOINT norm_topleft; TPOINT norm_botright; for (int b = 0; b < blamer_bundle->truth_word.length(); ++b) { const TBOX &box = blamer_bundle->truth_word.BlobBox(b); topleft.x = box.left(); topleft.y = box.top(); botright.x = box.right(); botright.y = box.bottom(); denorm.NormTransform(topleft, &norm_topleft); denorm.NormTransform(botright, &norm_botright); TBOX norm_box(norm_topleft.x, norm_botright.y, norm_botright.x, norm_topleft.y); blamer_bundle->norm_truth_word.InsertBox(b, norm_box); } } } // Simple helper moves the ownership of the pointer data from src to dest, // first deleting anything in dest, and nulling out src afterwards. template static void MovePointerData(T** dest, T**src) { delete *dest; *dest = *src; *src = NULL; } // Moves the results fields from word to this. This takes ownership of all // the data, so src can be destructed. void WERD_RES::ConsumeWordResults(WERD_RES* word) { denorm = word->denorm; MovePointerData(&chopped_word, &word->chopped_word); MovePointerData(&rebuild_word, &word->rebuild_word); MovePointerData(&box_word, &word->box_word); if (seam_array != NULL) free_seam_list(seam_array); seam_array = word->seam_array; word->seam_array = NULL; best_state.move(&word->best_state); correct_text.move(&word->correct_text); MovePointerData(&best_choice, &word->best_choice); MovePointerData(&raw_choice, &word->raw_choice); alt_choices.delete_data_pointers(); alt_choices.move(&word->alt_choices); alt_states.move(&word->alt_states); reject_map = word->reject_map; if (word->blamer_bundle != NULL) { assert(blamer_bundle != NULL); blamer_bundle->CopyResults(*(word->blamer_bundle)); } CopySimpleFields(*word); } // Replace the best choice and rebuild box word. void WERD_RES::ReplaceBestChoice( const WERD_CHOICE& choice, const GenericVector& segmentation_state) { delete best_choice; best_choice = new WERD_CHOICE(choice); best_state = segmentation_state; RebuildBestState(); SetupBoxWord(); // Make up a fake reject map of the right length to keep the // rejection pass happy. reject_map.initialise(segmentation_state.length()); done = tess_accepted = tess_would_adapt = true; SetScriptPositions(); } // Builds the rebuild_word from the chopped_word and the best_state. void WERD_RES::RebuildBestState() { if (rebuild_word != NULL) delete rebuild_word; rebuild_word = new TWERD; if (seam_array == NULL) { seam_array = start_seam_list(chopped_word->blobs); } TBLOB* prev_blob = NULL; int start = 0; for (int i = 0; i < best_state.size(); ++i) { int length = best_state[i]; join_pieces(chopped_word->blobs, seam_array, start, start + length - 1); TBLOB* blob = chopped_word->blobs; for (int i = 0; i < start; ++i) blob = blob->next; TBLOB* copy_blob = new TBLOB(*blob); if (prev_blob == NULL) rebuild_word->blobs = copy_blob; else prev_blob->next = copy_blob; prev_blob = copy_blob; break_pieces(blob, seam_array, start, start + length - 1); start += length; } } // Copies the chopped_word to the rebuild_word, faking a best_state as well. // Also sets up the output box_word. void WERD_RES::CloneChoppedToRebuild() { if (rebuild_word != NULL) delete rebuild_word; rebuild_word = new TWERD(*chopped_word); SetupBoxWord(); int word_len = box_word->length(); best_state.reserve(word_len); correct_text.reserve(word_len); for (int i = 0; i < word_len; ++i) { best_state.push_back(1); correct_text.push_back(STRING("")); } } // Sets/replaces the box_word with one made from the rebuild_word. void WERD_RES::SetupBoxWord() { if (box_word != NULL) delete box_word; rebuild_word->ComputeBoundingBoxes(); box_word = tesseract::BoxWord::CopyFromNormalized(&denorm, rebuild_word); box_word->ClipToOriginalWord(denorm.block(), word); } // Sets up the script positions in the output boxword using the best_choice // to get the unichars, and the unicharset to get the target positions. void WERD_RES::SetScriptPositions() { box_word->SetScriptPositions(*uch_set, small_caps, rebuild_word, best_choice); } void WERD_RES::WithoutFootnoteSpan(int *pstart, int *pend) const { int end = best_choice->length(); while (end > 0 && uch_set->get_isdigit(best_choice->unichar_ids()[end - 1]) && box_word->BlobPosition(end - 1) == tesseract::SP_SUPERSCRIPT) { end--; } int start = 0; while (start < end && uch_set->get_isdigit(best_choice->unichar_ids()[start]) && box_word->BlobPosition(start) == tesseract::SP_SUPERSCRIPT) { start++; } *pstart = start; *pend = end; } void WERD_RES::WithoutFootnoteSpan( const WERD_CHOICE &word, const GenericVector &state, int *pstart, int *pend) const { int len = word.length(); *pstart = 0; *pend = len; if (len < 2) return; if (!word.unicharset()->get_isdigit(word.unichar_ids()[len - 1]) && !word.unicharset()->get_isdigit(word.unichar_ids()[0])) return; // ok, now that we know the word ends in digits, do the expensive bit of // figuring out if they're superscript. WERD_RES copy(*this); copy.ReplaceBestChoice(word, state); copy.WithoutFootnoteSpan(pstart, pend); } // Classifies the word with some already-calculated BLOB_CHOICEs. // The choices are an array of blob_count pointers to BLOB_CHOICE, // providing a single classifier result for each blob. // The BLOB_CHOICEs are consumed and the word takes ownership. // The number of blobs in the outword must match blob_count. void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) { // Setup the WERD_RES. ASSERT_HOST(box_word != NULL); ASSERT_HOST(blob_count == box_word->length()); ASSERT_HOST(best_choice != NULL); BLOB_CHOICE_LIST_CLIST* word_choices = new BLOB_CHOICE_LIST_CLIST; BLOB_CHOICE_LIST_C_IT bc_it(word_choices); for (int c = 0; c < blob_count; ++c) { best_choice->append_unichar_id( choices[c]->unichar_id(), 1, choices[c]->rating(), choices[c]->certainty()); BLOB_CHOICE_LIST* choice_list = new BLOB_CHOICE_LIST; BLOB_CHOICE_IT choice_it(choice_list); choice_it.add_after_then_move(choices[c]); bc_it.add_after_then_move(choice_list); } best_choice->set_blob_choices(word_choices); delete raw_choice; raw_choice = new WERD_CHOICE(*best_choice); reject_map.initialise(blob_count); } // Copies the best_choice strings to the correct_text for adaption/training. void WERD_RES::BestChoiceToCorrectText() { correct_text.clear(); ASSERT_HOST(best_choice != NULL); for (int i = 0; i < best_choice->length(); ++i) { UNICHAR_ID choice_id = best_choice->unichar_id(i); const char* blob_choice = uch_set->id_to_unichar(choice_id); correct_text.push_back(STRING(blob_choice)); } } // Merges 2 adjacent blobs in the result if the permanent callback // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent // callback box_cb is NULL or returns true, setting the merged blob // result to the class returned from class_cb. // Returns true if anything was merged. bool WERD_RES::ConditionalBlobMerge( TessResultCallback2* class_cb, TessResultCallback2* box_cb, BLOB_CHOICE_LIST_CLIST *blob_choices) { bool modified = false; for (int i = 0; i + 1 < best_choice->length(); ++i) { UNICHAR_ID new_id = class_cb->Run(best_choice->unichar_id(i), best_choice->unichar_id(i+1)); if (new_id != INVALID_UNICHAR_ID && (box_cb == NULL || box_cb->Run(box_word->BlobBox(i), box_word->BlobBox(i + 1)))) { if (reject_map.length() == best_choice->length()) reject_map.remove_pos(i); best_choice->set_unichar_id(new_id, i); best_choice->remove_unichar_id(i + 1); raw_choice->set_unichar_id(new_id, i); raw_choice->remove_unichar_id(i + 1); modified = true; rebuild_word->MergeBlobs(i, i + 2); box_word->MergeBoxes(i, i + 2); if (i + 1 < best_state.length()) { best_state[i] += best_state[i + 1]; best_state.remove(i + 1); } BLOB_CHOICE_LIST_C_IT blob_choices_it(blob_choices); for (int j = 0; j < i; ++j) blob_choices_it.forward(); BLOB_CHOICE_IT it1(blob_choices_it.data()); // first choices BLOB_CHOICE_LIST* target_choices = blob_choices_it.data_relative(1); BLOB_CHOICE_IT it2(target_choices); // second choices float certainty = it2.data()->certainty(); float rating = it2.data()->rating(); if (it1.data()->certainty() < certainty) { certainty = it1.data()->certainty(); rating = it1.data()->rating(); target_choices = blob_choices_it.data(); blob_choices_it.forward(); } delete blob_choices_it.extract(); // get rid of spare // TODO(rays) Fix the choices so they contain the desired result. // Do we really need to ? Only needed for fix_quotes, which should be // going away. } } delete class_cb; delete box_cb; return modified; } // TODO(tkielbus) Decide between keeping this behavior here or modifying the // training data. // Utility function for fix_quotes // Return true if the next character in the string (given the UTF8 length in // bytes) is a quote character. static int is_simple_quote(const char* signed_str, int length) { const unsigned char* str = reinterpret_cast(signed_str); // Standard 1 byte quotes. return (length == 1 && (*str == '\'' || *str == '`')) || // UTF-8 3 bytes curved quotes. (length == 3 && ((*str == 0xe2 && *(str + 1) == 0x80 && *(str + 2) == 0x98) || (*str == 0xe2 && *(str + 1) == 0x80 && *(str + 2) == 0x99))); } // Callback helper for fix_quotes returns a double quote if both // arguments are quote, otherwise INVALID_UNICHAR_ID. UNICHAR_ID WERD_RES::BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2) { const char *ch = uch_set->id_to_unichar(id1); const char *next_ch = uch_set->id_to_unichar(id2); if (is_simple_quote(ch, strlen(ch)) && is_simple_quote(next_ch, strlen(next_ch))) return uch_set->unichar_to_id("\""); return INVALID_UNICHAR_ID; } // Change pairs of quotes to double quotes. void WERD_RES::fix_quotes(BLOB_CHOICE_LIST_CLIST* blob_choices) { if (!uch_set->contains_unichar("\"") || !uch_set->get_enabled(uch_set->unichar_to_id("\""))) return; // Don't create it if it is disallowed. ConditionalBlobMerge( NewPermanentTessCallback(this, &WERD_RES::BothQuotes), NULL, blob_choices); } // Callback helper for fix_hyphens returns UNICHAR_ID of - if both // arguments are hyphen, otherwise INVALID_UNICHAR_ID. UNICHAR_ID WERD_RES::BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2) { const char *ch = uch_set->id_to_unichar(id1); const char *next_ch = uch_set->id_to_unichar(id2); if (strlen(ch) == 1 && strlen(next_ch) == 1 && (*ch == '-' || *ch == '~') && (*next_ch == '-' || *next_ch == '~')) return uch_set->unichar_to_id("-"); return INVALID_UNICHAR_ID; } // Callback helper for fix_hyphens returns true if box1 and box2 overlap // (assuming both on the same textline, are in order and a chopped em dash.) bool WERD_RES::HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2) { return box1.right() >= box2.left(); } // Change pairs of hyphens to a single hyphen if the bounding boxes touch // Typically a long dash which has been segmented. void WERD_RES::fix_hyphens(BLOB_CHOICE_LIST_CLIST *blob_choices) { if (!uch_set->contains_unichar("-") || !uch_set->get_enabled(uch_set->unichar_to_id("-"))) return; // Don't create it if it is disallowed. ConditionalBlobMerge( NewPermanentTessCallback(this, &WERD_RES::BothHyphens), NewPermanentTessCallback(this, &WERD_RES::HyphenBoxesOverlap), blob_choices); } // Callback helper for merge_tess_fails returns a space if both // arguments are space, otherwise INVALID_UNICHAR_ID. UNICHAR_ID WERD_RES::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) { if (id1 == id2 && id1 == uch_set->unichar_to_id(" ")) return id1; else return INVALID_UNICHAR_ID; } // Change pairs of tess failures to a single one void WERD_RES::merge_tess_fails() { if (ConditionalBlobMerge( NewPermanentTessCallback(this, &WERD_RES::BothSpaces), NULL, best_choice->blob_choices())) { int len = best_choice->length(); ASSERT_HOST(reject_map.length() == len); ASSERT_HOST(box_word->length() == len); } } // Returns true if the collection of count pieces, starting at start, are all // natural connected components, ie there are no real chops involved. bool WERD_RES::PiecesAllNatural(int start, int count) const { // all seams must have no splits. for (int index = start; index < start + count - 1; ++index) { if (index >= 0 && index < array_count(seam_array)) { SEAM* seam = reinterpret_cast(array_value(seam_array, index)); if (seam != NULL && seam->split1 != NULL) return false; } } return true; } WERD_RES::~WERD_RES () { Clear(); } void WERD_RES::InitNonPointers() { tess_failed = FALSE; tess_accepted = FALSE; tess_would_adapt = FALSE; done = FALSE; unlv_crunch_mode = CR_NONE; small_caps = false; italic = FALSE; bold = FALSE; // The fontinfos and tesseract count as non-pointers as they point to // data owned elsewhere. fontinfo = NULL; fontinfo2 = NULL; tesseract = NULL; fontinfo_id_count = 0; fontinfo_id2_count = 0; x_height = 0.0; caps_height = 0.0; guessed_x_ht = TRUE; guessed_caps_ht = TRUE; combination = FALSE; part_of_combo = FALSE; reject_spaces = FALSE; } void WERD_RES::InitPointers() { word = NULL; bln_boxes = NULL; uch_set = NULL; chopped_word = NULL; rebuild_word = NULL; box_word = NULL; seam_array = NULL; best_choice = NULL; raw_choice = NULL; ep_choice = NULL; blamer_bundle = NULL; } void WERD_RES::Clear() { if (word != NULL && combination) { delete word; } word = NULL; delete blamer_bundle; blamer_bundle = NULL; ClearResults(); } void WERD_RES::ClearResults() { done = false; fontinfo = NULL; fontinfo2 = NULL; fontinfo_id_count = 0; fontinfo_id2_count = 0; if (bln_boxes != NULL) { delete bln_boxes; bln_boxes = NULL; } if (chopped_word != NULL) { delete chopped_word; chopped_word = NULL; } if (rebuild_word != NULL) { delete rebuild_word; rebuild_word = NULL; } if (box_word != NULL) { delete box_word; box_word = NULL; } best_state.clear(); correct_text.clear(); if (seam_array != NULL) { free_seam_list(seam_array); seam_array = NULL; } if (best_choice != NULL) { delete best_choice; delete raw_choice; best_choice = NULL; raw_choice = NULL; } if (!alt_choices.empty()) { alt_choices.delete_data_pointers(); alt_choices.clear(); } alt_states.clear(); if (ep_choice != NULL) { delete ep_choice; ep_choice = NULL; } if (blamer_bundle != NULL) blamer_bundle->ClearResults(); } bool PAGE_RES_IT::operator ==(const PAGE_RES_IT &other) const { return word_res == other.word_res && row_res == other.row_res && block_res == other.block_res; } int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const { ASSERT_HOST(page_res == other.page_res); if (other.block_res == NULL) { // other points to the end of the page. if (block_res == NULL) return 0; return -1; } if (block_res == NULL) { return 1; // we point to the end of the page. } if (block_res == other.block_res) { if (other.row_res == NULL || row_res == NULL) { // this should only happen if we hit an image block. return 0; } if (row_res == other.row_res) { // we point to the same block and row. ASSERT_HOST(other.word_res != NULL && word_res != NULL); if (word_res == other.word_res) { // we point to the same word! return 0; } WERD_RES_IT word_res_it(&row_res->word_res_list); for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) { if (word_res_it.data() == word_res) { return -1; } else if (word_res_it.data() == other.word_res) { return 1; } } ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL); } // we both point to the same block, but different rows. ROW_RES_IT row_res_it(&block_res->row_res_list); for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); row_res_it.forward()) { if (row_res_it.data() == row_res) { return -1; } else if (row_res_it.data() == other.row_res) { return 1; } } ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL); } // We point to different blocks. BLOCK_RES_IT block_res_it(&page_res->block_res_list); for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list(); block_res_it.forward()) { if (block_res_it.data() == block_res) { return -1; } else if (block_res_it.data() == other.block_res) { return 1; } } // Shouldn't happen... ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL); return 0; } // Inserts the new_word and a corresponding WERD_RES before the current // position. The simple fields of the WERD_RES are copied from clone_res and // the resulting WERD_RES is returned for further setup with best_choice etc. WERD_RES* PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES& clone_res, WERD* new_word) { // Insert new_word into the ROW. WERD_IT w_it(row()->row->word_list()); for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { WERD* word = w_it.data(); if (word == word_res->word) break; } ASSERT_HOST(!w_it.cycled_list()); w_it.add_before_then_move(new_word); // Make a WERD_RES for the new_word. WERD_RES* new_res = new WERD_RES(new_word); new_res->CopySimpleFields(clone_res); // Insert into the appropriate place in the ROW_RES. WERD_RES_IT wr_it(&row()->word_res_list); for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) { WERD_RES* word = wr_it.data(); if (word == word_res) break; } ASSERT_HOST(!wr_it.cycled_list()); wr_it.add_before_then_move(new_res); if (wr_it.at_first()) { // This is the new first word, so reset the member iterator so it // detects the cycled_list state correctly. ResetWordIterator(); } return new_res; } // Deletes the current WERD_RES and its underlying WERD. void PAGE_RES_IT::DeleteCurrentWord() { // Check that this word is as we expect. part_of_combos are NEVER iterated // by the normal iterator, so we should never be trying to delete them. ASSERT_HOST(!word_res->part_of_combo); if (!word_res->combination) { // Combinations own their own word, so we won't find the word on the // row's word_list, but it is legitimate to try to delete them. // Delete word from the ROW when not a combination. WERD_IT w_it(row()->row->word_list()); for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { if (w_it.data() == word_res->word) { break; } } ASSERT_HOST(!w_it.cycled_list()); delete w_it.extract(); } // Remove the WERD_RES for the new_word. // Remove the WORD_RES from the ROW_RES. WERD_RES_IT wr_it(&row()->word_res_list); for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) { if (wr_it.data() == word_res) { word_res = NULL; break; } } ASSERT_HOST(!wr_it.cycled_list()); delete wr_it.extract(); ResetWordIterator(); } /************************************************************************* * PAGE_RES_IT::restart_page * * Set things up at the start of the page *************************************************************************/ WERD_RES *PAGE_RES_IT::start_page(bool empty_ok) { block_res_it.set_to_list(&page_res->block_res_list); block_res_it.mark_cycle_pt(); prev_block_res = NULL; prev_row_res = NULL; prev_word_res = NULL; block_res = NULL; row_res = NULL; word_res = NULL; next_block_res = NULL; next_row_res = NULL; next_word_res = NULL; internal_forward(true, empty_ok); return internal_forward(false, empty_ok); } // Recovers from operations on the current word, such as in InsertCloneWord // and DeleteCurrentWord. // Resets the word_res_it so that it is one past the next_word_res, as // it should be after internal_forward. If next_row_res != row_res, // then the next_word_res is in the next row, so there is no need to do // anything, since operations on the current word will not have disturbed // the word_res_it. void PAGE_RES_IT::ResetWordIterator() { if (row_res == next_row_res) { // Reset the member iterator so it can move forward and detect the // cycled_list state correctly. word_res_it.move_to_first(); word_res_it.mark_cycle_pt(); while (!word_res_it.cycled_list() && word_res_it.data() != next_word_res) word_res_it.forward(); ASSERT_HOST(!word_res_it.cycled_list()); word_res_it.forward(); } } /************************************************************************* * PAGE_RES_IT::internal_forward * * Find the next word on the page. If empty_ok is true, then non-text blocks * and text blocks with no text are visited as if they contain a single * imaginary word in a single imaginary row. (word() and row() both return NULL * in such a block and the return value is NULL.) * If empty_ok is false, the old behaviour is maintained. Each real word * is visited and empty and non-text blocks and rows are skipped. * new_block is used to initialize the iterators for a new block. * The iterator maintains pointers to block, row and word for the previous, * current and next words. These are correct, regardless of block/row * boundaries. NULL values denote start and end of the page. *************************************************************************/ WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) { bool new_row = false; prev_block_res = block_res; prev_row_res = row_res; prev_word_res = word_res; block_res = next_block_res; row_res = next_row_res; word_res = next_word_res; next_block_res = NULL; next_row_res = NULL; next_word_res = NULL; while (!block_res_it.cycled_list()) { if (new_block) { new_block = false; row_res_it.set_to_list(&block_res_it.data()->row_res_list); row_res_it.mark_cycle_pt(); if (row_res_it.empty() && empty_ok) { next_block_res = block_res_it.data(); break; } new_row = true; } while (!row_res_it.cycled_list()) { if (new_row) { new_row = false; word_res_it.set_to_list(&row_res_it.data()->word_res_list); word_res_it.mark_cycle_pt(); } // Skip any part_of_combo words. while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo) word_res_it.forward(); if (!word_res_it.cycled_list()) { next_block_res = block_res_it.data(); next_row_res = row_res_it.data(); next_word_res = word_res_it.data(); word_res_it.forward(); goto foundword; } // end of row reached row_res_it.forward(); new_row = true; } // end of block reached block_res_it.forward(); new_block = true; } foundword: // Update prev_word_best_choice pointer. if (page_res != NULL && page_res->prev_word_best_choice != NULL) { *page_res->prev_word_best_choice = (new_block || prev_word_res == NULL) ? NULL : prev_word_res->best_choice; } return word_res; } /************************************************************************* * PAGE_RES_IT::restart_row() * * Move to the beginning (leftmost word) of the current row. *************************************************************************/ WERD_RES *PAGE_RES_IT::restart_row() { ROW_RES *row = this->row(); if (!row) return NULL; for (restart_page(); this->row() != row; forward()) { // pass } return word(); } /************************************************************************* * PAGE_RES_IT::forward_paragraph * * Move to the beginning of the next paragraph, allowing empty blocks. *************************************************************************/ WERD_RES *PAGE_RES_IT::forward_paragraph() { while (block_res == next_block_res && (next_row_res != NULL && next_row_res->row != NULL && row_res->row->para() == next_row_res->row->para())) { internal_forward(false, true); } return internal_forward(false, true); } /************************************************************************* * PAGE_RES_IT::forward_block * * Move to the beginning of the next block, allowing empty blocks. *************************************************************************/ WERD_RES *PAGE_RES_IT::forward_block() { while (block_res == next_block_res) { internal_forward(false, true); } return internal_forward(false, true); } void PAGE_RES_IT::rej_stat_word() { inT16 chars_in_word; inT16 rejects_in_word = 0; chars_in_word = word_res->reject_map.length (); page_res->char_count += chars_in_word; block_res->char_count += chars_in_word; row_res->char_count += chars_in_word; rejects_in_word = word_res->reject_map.reject_count (); page_res->rej_count += rejects_in_word; block_res->rej_count += rejects_in_word; row_res->rej_count += rejects_in_word; if (chars_in_word == rejects_in_word) row_res->whole_word_rej_count += rejects_in_word; }