mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-09 19:32:40 +08:00

http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=658634 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@675 d0cd1f9f-072b-0410-8dd7-cf729c803f20
1154 lines
39 KiB
C++
1154 lines
39 KiB
C++
/**********************************************************************
|
|
* File: pageres.cpp (Formerly page_res.c)
|
|
* Description: Results classes used by control.c
|
|
* Author: Phil Cheatle
|
|
* Created: Tue Sep 22 08:42:49 BST 1992
|
|
*
|
|
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
*
|
|
**********************************************************************/
|
|
#include "mfcpch.h"
|
|
#include <stdlib.h>
|
|
#ifdef __UNIX__
|
|
#include <assert.h>
|
|
#endif
|
|
#include "pageres.h"
|
|
#include "blobs.h"
|
|
|
|
const char kBlameCorrect[] = "corr";
|
|
const char kBlameClassifier[] = "cl";
|
|
const char kBlameChopper[] = "chop";
|
|
const char kBlameClassLMTradeoff[] = "cl/LM";
|
|
const char kBlamePageLayout[] = "pglt";
|
|
const char kBlameSegsearchHeur[] = "ss_heur";
|
|
const char kBlameSegsearchPP[] = "ss_pp";
|
|
const char kBlameClassOldLMTradeoff[] = "cl/old_LM";
|
|
const char kBlameAdaption[] = "adapt";
|
|
const char kBlameNoTruthSplit[] = "no_tr_spl";
|
|
const char kBlameNoTruth[] = "no_tr";
|
|
const char kBlameUnknown[] = "unkn";
|
|
|
|
const char * const kIncorrectResultReasonNames[] = {
|
|
kBlameCorrect,
|
|
kBlameClassifier,
|
|
kBlameChopper,
|
|
kBlameClassLMTradeoff,
|
|
kBlamePageLayout,
|
|
kBlameSegsearchHeur,
|
|
kBlameSegsearchPP,
|
|
kBlameClassOldLMTradeoff,
|
|
kBlameAdaption,
|
|
kBlameNoTruthSplit,
|
|
kBlameNoTruth,
|
|
kBlameUnknown
|
|
};
|
|
|
|
const char *BlamerBundle::IncorrectReasonName(IncorrectResultReason irr) {
|
|
return kIncorrectResultReasonNames[irr];
|
|
}
|
|
|
|
const char *BlamerBundle::IncorrectReason() const {
|
|
return kIncorrectResultReasonNames[incorrect_result_reason];
|
|
}
|
|
|
|
void BlamerBundle::FillDebugString(const STRING &msg,
|
|
const WERD_CHOICE *choice,
|
|
STRING *debug) {
|
|
(*debug) += "Truth ";
|
|
for (int i = 0; i < this->truth_text.length(); ++i) {
|
|
(*debug) += this->truth_text[i];
|
|
}
|
|
if (!this->truth_has_char_boxes) (*debug) += " (no char boxes)";
|
|
if (choice != NULL) {
|
|
(*debug) += " Choice ";
|
|
STRING choice_str;
|
|
choice->string_and_lengths(&choice_str, NULL);
|
|
(*debug) += choice_str;
|
|
}
|
|
if (msg.length() > 0) {
|
|
(*debug) += "\n";
|
|
(*debug) += msg;
|
|
}
|
|
(*debug) += "\n";
|
|
}
|
|
|
|
ELISTIZE (BLOCK_RES)
|
|
CLISTIZE (BLOCK_RES) ELISTIZE (ROW_RES) ELISTIZE (WERD_RES)
|
|
/*************************************************************************
|
|
* PAGE_RES::PAGE_RES
|
|
*
|
|
* Constructor for page results
|
|
*************************************************************************/
|
|
PAGE_RES::PAGE_RES(
|
|
BLOCK_LIST *the_block_list,
|
|
WERD_CHOICE **prev_word_best_choice_ptr) {
|
|
Init();
|
|
BLOCK_IT block_it(the_block_list);
|
|
BLOCK_RES_IT block_res_it(&block_res_list);
|
|
for (block_it.mark_cycle_pt();
|
|
!block_it.cycled_list(); block_it.forward()) {
|
|
block_res_it.add_to_end(new BLOCK_RES(block_it.data()));
|
|
}
|
|
prev_word_best_choice = prev_word_best_choice_ptr;
|
|
}
|
|
|
|
/*************************************************************************
|
|
* BLOCK_RES::BLOCK_RES
|
|
*
|
|
* Constructor for BLOCK results
|
|
*************************************************************************/
|
|
|
|
BLOCK_RES::BLOCK_RES(BLOCK *the_block) {
|
|
ROW_IT row_it (the_block->row_list ());
|
|
ROW_RES_IT row_res_it(&row_res_list);
|
|
|
|
char_count = 0;
|
|
rej_count = 0;
|
|
font_class = -1; //not assigned
|
|
x_height = -1.0;
|
|
font_assigned = FALSE;
|
|
bold = FALSE;
|
|
italic = FALSE;
|
|
row_count = 0;
|
|
|
|
block = the_block;
|
|
|
|
for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
|
|
row_res_it.add_to_end(new ROW_RES(row_it.data()));
|
|
}
|
|
}
|
|
|
|
|
|
/*************************************************************************
|
|
* ROW_RES::ROW_RES
|
|
*
|
|
* Constructor for ROW results
|
|
*************************************************************************/
|
|
|
|
ROW_RES::ROW_RES(ROW *the_row) {
|
|
WERD_IT word_it(the_row->word_list());
|
|
WERD_RES_IT word_res_it(&word_res_list);
|
|
WERD_RES *combo = NULL; // current combination of fuzzies
|
|
WERD_RES *word_res; // current word
|
|
WERD *copy_word;
|
|
|
|
char_count = 0;
|
|
rej_count = 0;
|
|
whole_word_rej_count = 0;
|
|
|
|
row = the_row;
|
|
for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
|
|
word_res = new WERD_RES(word_it.data());
|
|
word_res->x_height = the_row->x_height();
|
|
|
|
if (word_res->word->flag(W_FUZZY_NON)) {
|
|
ASSERT_HOST(combo != NULL);
|
|
word_res->part_of_combo = TRUE;
|
|
combo->copy_on(word_res);
|
|
}
|
|
if (word_it.data_relative(1)->flag(W_FUZZY_NON)) {
|
|
if (combo == NULL) {
|
|
copy_word = new WERD;
|
|
//deep copy
|
|
*copy_word = *(word_it.data());
|
|
combo = new WERD_RES(copy_word);
|
|
combo->x_height = the_row->x_height();
|
|
combo->combination = TRUE;
|
|
word_res_it.add_to_end(combo);
|
|
}
|
|
word_res->part_of_combo = TRUE;
|
|
} else {
|
|
combo = NULL;
|
|
}
|
|
word_res_it.add_to_end(word_res);
|
|
}
|
|
}
|
|
|
|
|
|
WERD_RES& WERD_RES::operator=(const WERD_RES & source) {
|
|
this->ELIST_LINK::operator=(source);
|
|
Clear();
|
|
if (source.combination) {
|
|
word = new WERD;
|
|
*word = *(source.word); // deep copy
|
|
} else {
|
|
word = source.word; // pt to same word
|
|
}
|
|
if (source.bln_boxes != NULL)
|
|
bln_boxes = new tesseract::BoxWord(*source.bln_boxes);
|
|
if (source.chopped_word != NULL)
|
|
chopped_word = new TWERD(*source.chopped_word);
|
|
if (source.rebuild_word != NULL)
|
|
rebuild_word = new TWERD(*source.rebuild_word);
|
|
// TODO(rays) Do we ever need to copy the seam_array?
|
|
denorm = source.denorm;
|
|
if (source.box_word != NULL)
|
|
box_word = new tesseract::BoxWord(*source.box_word);
|
|
best_state = source.best_state;
|
|
correct_text = source.correct_text;
|
|
|
|
if (source.best_choice != NULL) {
|
|
best_choice = new WERD_CHOICE(*source.best_choice);
|
|
raw_choice = new WERD_CHOICE(*source.raw_choice);
|
|
best_choice_fontinfo_ids = source.best_choice_fontinfo_ids;
|
|
}
|
|
else {
|
|
best_choice = NULL;
|
|
raw_choice = NULL;
|
|
if (!best_choice_fontinfo_ids.empty()) {
|
|
best_choice_fontinfo_ids.clear();
|
|
}
|
|
}
|
|
for (int i = 0; i < source.alt_choices.length(); ++i) {
|
|
const WERD_CHOICE *choice = source.alt_choices[i];
|
|
ASSERT_HOST(choice != NULL);
|
|
alt_choices.push_back(new WERD_CHOICE(*choice));
|
|
}
|
|
alt_states = source.alt_states;
|
|
if (source.ep_choice != NULL) {
|
|
ep_choice = new WERD_CHOICE(*source.ep_choice);
|
|
} else {
|
|
ep_choice = NULL;
|
|
}
|
|
reject_map = source.reject_map;
|
|
combination = source.combination;
|
|
part_of_combo = source.part_of_combo;
|
|
CopySimpleFields(source);
|
|
if (source.blamer_bundle != NULL) {
|
|
blamer_bundle = new BlamerBundle(*(source.blamer_bundle));
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
// Copies basic fields that don't involve pointers that might be useful
|
|
// to copy when making one WERD_RES from another.
|
|
void WERD_RES::CopySimpleFields(const WERD_RES& source) {
|
|
tess_failed = source.tess_failed;
|
|
tess_accepted = source.tess_accepted;
|
|
tess_would_adapt = source.tess_would_adapt;
|
|
done = source.done;
|
|
unlv_crunch_mode = source.unlv_crunch_mode;
|
|
small_caps = source.small_caps;
|
|
italic = source.italic;
|
|
bold = source.bold;
|
|
fontinfo = source.fontinfo;
|
|
fontinfo2 = source.fontinfo2;
|
|
fontinfo_id_count = source.fontinfo_id_count;
|
|
fontinfo_id2_count = source.fontinfo_id2_count;
|
|
x_height = source.x_height;
|
|
caps_height = source.caps_height;
|
|
guessed_x_ht = source.guessed_x_ht;
|
|
guessed_caps_ht = source.guessed_caps_ht;
|
|
reject_spaces = source.reject_spaces;
|
|
uch_set = source.uch_set;
|
|
tesseract = source.tesseract;
|
|
}
|
|
|
|
// Initializes a blank (default constructed) WERD_RES from one that has
|
|
// already been recognized.
|
|
// Use SetupFor*Recognition afterwards to complete the setup and make
|
|
// it ready for a retry recognition.
|
|
void WERD_RES::InitForRetryRecognition(const WERD_RES& source) {
|
|
word = source.word;
|
|
CopySimpleFields(source);
|
|
if (source.blamer_bundle != NULL) {
|
|
blamer_bundle = new BlamerBundle();
|
|
blamer_bundle->CopyTruth(*source.blamer_bundle);
|
|
}
|
|
}
|
|
|
|
// Sets up the members used in recognition:
|
|
// bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
|
|
// Returns false if the word is empty and sets up fake results.
|
|
bool WERD_RES::SetupForTessRecognition(const UNICHARSET& unicharset_in,
|
|
tesseract::Tesseract* tess, Pix* pix,
|
|
bool numeric_mode,
|
|
bool use_body_size,
|
|
ROW *row, BLOCK* block) {
|
|
tesseract = tess;
|
|
POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL;
|
|
if (word->cblob_list()->empty() || (pb != NULL && !pb->IsText())) {
|
|
// Empty words occur when all the blobs have been moved to the rej_blobs
|
|
// list, which seems to occur frequently in junk.
|
|
SetupFake(unicharset_in);
|
|
word->set_flag(W_REP_CHAR, false);
|
|
return false;
|
|
}
|
|
ClearResults();
|
|
SetupWordScript(unicharset_in);
|
|
chopped_word = TWERD::PolygonalCopy(word);
|
|
if (use_body_size && row->body_size() > 0.0f) {
|
|
chopped_word->SetupBLNormalize(block, row, row->body_size(),
|
|
numeric_mode, &denorm);
|
|
} else {
|
|
chopped_word->SetupBLNormalize(block, row, x_height, numeric_mode, &denorm);
|
|
}
|
|
// The image will be 8-bit grey if the input was grey or color. Note that in
|
|
// a grey image 0 is black and 255 is white. If the input was binary, then
|
|
// the pix will be binary and 0 is white, with 1 being black.
|
|
// To tell the difference pixGetDepth() will return 8 or 1.
|
|
denorm.set_pix(pix);
|
|
// The inverse flag will be true iff the word has been determined to be white
|
|
// on black, and is independent of whether the pix is 8 bit or 1 bit.
|
|
denorm.set_inverse(word->flag(W_INVERSE));
|
|
chopped_word->Normalize(denorm);
|
|
bln_boxes = tesseract::BoxWord::CopyFromNormalized(NULL, chopped_word);
|
|
seam_array = start_seam_list(chopped_word->blobs);
|
|
best_choice = new WERD_CHOICE(&unicharset_in);
|
|
best_choice->make_bad();
|
|
raw_choice = new WERD_CHOICE(&unicharset_in);
|
|
raw_choice->make_bad();
|
|
SetupBlamerBundle();
|
|
return true;
|
|
}
|
|
|
|
// Sets up the members used in recognition:
|
|
// bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
|
|
// Returns false if the word is empty and sets up fake results.
|
|
bool WERD_RES::SetupForCubeRecognition(const UNICHARSET& unicharset_in,
|
|
tesseract::Tesseract* tess,
|
|
const BLOCK* block) {
|
|
tesseract = tess;
|
|
POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL;
|
|
if (pb != NULL && !pb->IsText()) {
|
|
// Ignore words in graphic regions.
|
|
SetupFake(unicharset_in);
|
|
word->set_flag(W_REP_CHAR, false);
|
|
return false;
|
|
}
|
|
ClearResults();
|
|
SetupWordScript(unicharset_in);
|
|
TBOX word_box = word->bounding_box();
|
|
denorm.SetupNormalization(block, NULL, NULL, NULL, NULL, 0,
|
|
word_box.left(), word_box.bottom(),
|
|
1.0f, 1.0f, 0.0f, 0.0f);
|
|
SetupBlamerBundle();
|
|
return true;
|
|
}
|
|
|
|
// Sets up the members used in recognition for an empty recognition result:
|
|
// bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
|
|
void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) {
|
|
ClearResults();
|
|
SetupWordScript(unicharset_in);
|
|
chopped_word = new TWERD;
|
|
rebuild_word = new TWERD;
|
|
bln_boxes = new tesseract::BoxWord;
|
|
box_word = new tesseract::BoxWord;
|
|
int blob_count = word->cblob_list()->length();
|
|
best_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f,
|
|
TOP_CHOICE_PERM, unicharset_in);
|
|
raw_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f,
|
|
TOP_CHOICE_PERM, unicharset_in);
|
|
if (blob_count > 0) {
|
|
BLOB_CHOICE** fake_choices = new BLOB_CHOICE*[blob_count];
|
|
// For non-text blocks, just pass any blobs through to the box_word
|
|
// and call the word failed with a fake classification.
|
|
C_BLOB_IT b_it(word->cblob_list());
|
|
int blob_id = 0;
|
|
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
|
|
TBOX box = b_it.data()->bounding_box();
|
|
box_word->InsertBox(box_word->length(), box);
|
|
fake_choices[blob_id++] = new BLOB_CHOICE(0, 10.0f, -1.0f,
|
|
-1, -1, -1, 0, 0, false);
|
|
}
|
|
FakeClassifyWord(blob_count, fake_choices);
|
|
delete [] fake_choices;
|
|
}
|
|
tess_failed = true;
|
|
}
|
|
|
|
void WERD_RES::SetupWordScript(const UNICHARSET& uch) {
|
|
uch_set = &uch;
|
|
int script = uch.default_sid();
|
|
word->set_script_id(script);
|
|
word->set_flag(W_SCRIPT_HAS_XHEIGHT, uch.script_has_xheight());
|
|
word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid());
|
|
}
|
|
|
|
// Sets up the blamer_bundle if it is not null, using the initialized denorm.
|
|
void WERD_RES::SetupBlamerBundle() {
|
|
if (blamer_bundle != NULL) {
|
|
blamer_bundle->norm_box_tolerance = kBlamerBoxTolerance * denorm.x_scale();
|
|
TPOINT topleft;
|
|
TPOINT botright;
|
|
TPOINT norm_topleft;
|
|
TPOINT norm_botright;
|
|
for (int b = 0; b < blamer_bundle->truth_word.length(); ++b) {
|
|
const TBOX &box = blamer_bundle->truth_word.BlobBox(b);
|
|
topleft.x = box.left();
|
|
topleft.y = box.top();
|
|
botright.x = box.right();
|
|
botright.y = box.bottom();
|
|
denorm.NormTransform(topleft, &norm_topleft);
|
|
denorm.NormTransform(botright, &norm_botright);
|
|
TBOX norm_box(norm_topleft.x, norm_botright.y,
|
|
norm_botright.x, norm_topleft.y);
|
|
blamer_bundle->norm_truth_word.InsertBox(b, norm_box);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Simple helper moves the ownership of the pointer data from src to dest,
|
|
// first deleting anything in dest, and nulling out src afterwards.
|
|
template<class T> static void MovePointerData(T** dest, T**src) {
|
|
delete *dest;
|
|
*dest = *src;
|
|
*src = NULL;
|
|
}
|
|
|
|
// Moves the results fields from word to this. This takes ownership of all
|
|
// the data, so src can be destructed.
|
|
void WERD_RES::ConsumeWordResults(WERD_RES* word) {
|
|
denorm = word->denorm;
|
|
MovePointerData(&chopped_word, &word->chopped_word);
|
|
MovePointerData(&rebuild_word, &word->rebuild_word);
|
|
MovePointerData(&box_word, &word->box_word);
|
|
if (seam_array != NULL)
|
|
free_seam_list(seam_array);
|
|
seam_array = word->seam_array;
|
|
word->seam_array = NULL;
|
|
best_state.move(&word->best_state);
|
|
correct_text.move(&word->correct_text);
|
|
MovePointerData(&best_choice, &word->best_choice);
|
|
MovePointerData(&raw_choice, &word->raw_choice);
|
|
alt_choices.delete_data_pointers();
|
|
alt_choices.move(&word->alt_choices);
|
|
alt_states.move(&word->alt_states);
|
|
reject_map = word->reject_map;
|
|
if (word->blamer_bundle != NULL) {
|
|
assert(blamer_bundle != NULL);
|
|
blamer_bundle->CopyResults(*(word->blamer_bundle));
|
|
}
|
|
CopySimpleFields(*word);
|
|
}
|
|
|
|
// Replace the best choice and rebuild box word.
|
|
void WERD_RES::ReplaceBestChoice(
|
|
const WERD_CHOICE& choice,
|
|
const GenericVector<int>& segmentation_state) {
|
|
delete best_choice;
|
|
best_choice = new WERD_CHOICE(choice);
|
|
best_state = segmentation_state;
|
|
RebuildBestState();
|
|
SetupBoxWord();
|
|
// Make up a fake reject map of the right length to keep the
|
|
// rejection pass happy.
|
|
reject_map.initialise(segmentation_state.length());
|
|
done = tess_accepted = tess_would_adapt = true;
|
|
SetScriptPositions();
|
|
}
|
|
|
|
// Builds the rebuild_word from the chopped_word and the best_state.
|
|
void WERD_RES::RebuildBestState() {
|
|
if (rebuild_word != NULL)
|
|
delete rebuild_word;
|
|
rebuild_word = new TWERD;
|
|
if (seam_array == NULL) {
|
|
seam_array = start_seam_list(chopped_word->blobs);
|
|
}
|
|
TBLOB* prev_blob = NULL;
|
|
int start = 0;
|
|
for (int i = 0; i < best_state.size(); ++i) {
|
|
int length = best_state[i];
|
|
join_pieces(chopped_word->blobs, seam_array, start, start + length - 1);
|
|
TBLOB* blob = chopped_word->blobs;
|
|
for (int i = 0; i < start; ++i)
|
|
blob = blob->next;
|
|
TBLOB* copy_blob = new TBLOB(*blob);
|
|
if (prev_blob == NULL)
|
|
rebuild_word->blobs = copy_blob;
|
|
else
|
|
prev_blob->next = copy_blob;
|
|
prev_blob = copy_blob;
|
|
break_pieces(blob, seam_array, start, start + length - 1);
|
|
start += length;
|
|
}
|
|
}
|
|
|
|
// Copies the chopped_word to the rebuild_word, faking a best_state as well.
|
|
// Also sets up the output box_word.
|
|
void WERD_RES::CloneChoppedToRebuild() {
|
|
if (rebuild_word != NULL)
|
|
delete rebuild_word;
|
|
rebuild_word = new TWERD(*chopped_word);
|
|
SetupBoxWord();
|
|
int word_len = box_word->length();
|
|
best_state.reserve(word_len);
|
|
correct_text.reserve(word_len);
|
|
for (int i = 0; i < word_len; ++i) {
|
|
best_state.push_back(1);
|
|
correct_text.push_back(STRING(""));
|
|
}
|
|
}
|
|
|
|
// Sets/replaces the box_word with one made from the rebuild_word.
|
|
void WERD_RES::SetupBoxWord() {
|
|
if (box_word != NULL)
|
|
delete box_word;
|
|
rebuild_word->ComputeBoundingBoxes();
|
|
box_word = tesseract::BoxWord::CopyFromNormalized(&denorm, rebuild_word);
|
|
box_word->ClipToOriginalWord(denorm.block(), word);
|
|
}
|
|
|
|
// Sets up the script positions in the output boxword using the best_choice
|
|
// to get the unichars, and the unicharset to get the target positions.
|
|
void WERD_RES::SetScriptPositions() {
|
|
box_word->SetScriptPositions(*uch_set, small_caps, rebuild_word,
|
|
best_choice);
|
|
}
|
|
|
|
void WERD_RES::WithoutFootnoteSpan(int *pstart, int *pend) const {
|
|
int end = best_choice->length();
|
|
while (end > 0 &&
|
|
uch_set->get_isdigit(best_choice->unichar_ids()[end - 1]) &&
|
|
box_word->BlobPosition(end - 1) == tesseract::SP_SUPERSCRIPT) {
|
|
end--;
|
|
}
|
|
int start = 0;
|
|
while (start < end &&
|
|
uch_set->get_isdigit(best_choice->unichar_ids()[start]) &&
|
|
box_word->BlobPosition(start) == tesseract::SP_SUPERSCRIPT) {
|
|
start++;
|
|
}
|
|
*pstart = start;
|
|
*pend = end;
|
|
}
|
|
|
|
void WERD_RES::WithoutFootnoteSpan(
|
|
const WERD_CHOICE &word, const GenericVector<int> &state,
|
|
int *pstart, int *pend) const {
|
|
int len = word.length();
|
|
*pstart = 0;
|
|
*pend = len;
|
|
if (len < 2) return;
|
|
if (!word.unicharset()->get_isdigit(word.unichar_ids()[len - 1]) &&
|
|
!word.unicharset()->get_isdigit(word.unichar_ids()[0])) return;
|
|
|
|
// ok, now that we know the word ends in digits, do the expensive bit of
|
|
// figuring out if they're superscript.
|
|
WERD_RES copy(*this);
|
|
copy.ReplaceBestChoice(word, state);
|
|
copy.WithoutFootnoteSpan(pstart, pend);
|
|
}
|
|
|
|
// Classifies the word with some already-calculated BLOB_CHOICEs.
|
|
// The choices are an array of blob_count pointers to BLOB_CHOICE,
|
|
// providing a single classifier result for each blob.
|
|
// The BLOB_CHOICEs are consumed and the word takes ownership.
|
|
// The number of blobs in the outword must match blob_count.
|
|
void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) {
|
|
// Setup the WERD_RES.
|
|
ASSERT_HOST(box_word != NULL);
|
|
ASSERT_HOST(blob_count == box_word->length());
|
|
ASSERT_HOST(best_choice != NULL);
|
|
BLOB_CHOICE_LIST_CLIST* word_choices = new BLOB_CHOICE_LIST_CLIST;
|
|
BLOB_CHOICE_LIST_C_IT bc_it(word_choices);
|
|
for (int c = 0; c < blob_count; ++c) {
|
|
best_choice->append_unichar_id(
|
|
choices[c]->unichar_id(), 1,
|
|
choices[c]->rating(), choices[c]->certainty());
|
|
BLOB_CHOICE_LIST* choice_list = new BLOB_CHOICE_LIST;
|
|
BLOB_CHOICE_IT choice_it(choice_list);
|
|
choice_it.add_after_then_move(choices[c]);
|
|
bc_it.add_after_then_move(choice_list);
|
|
}
|
|
best_choice->set_blob_choices(word_choices);
|
|
delete raw_choice;
|
|
raw_choice = new WERD_CHOICE(*best_choice);
|
|
reject_map.initialise(blob_count);
|
|
}
|
|
|
|
// Copies the best_choice strings to the correct_text for adaption/training.
|
|
void WERD_RES::BestChoiceToCorrectText() {
|
|
correct_text.clear();
|
|
ASSERT_HOST(best_choice != NULL);
|
|
for (int i = 0; i < best_choice->length(); ++i) {
|
|
UNICHAR_ID choice_id = best_choice->unichar_id(i);
|
|
const char* blob_choice = uch_set->id_to_unichar(choice_id);
|
|
correct_text.push_back(STRING(blob_choice));
|
|
}
|
|
}
|
|
|
|
// Merges 2 adjacent blobs in the result if the permanent callback
|
|
// class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
|
|
// callback box_cb is NULL or returns true, setting the merged blob
|
|
// result to the class returned from class_cb.
|
|
// Returns true if anything was merged.
|
|
bool WERD_RES::ConditionalBlobMerge(
|
|
TessResultCallback2<UNICHAR_ID, UNICHAR_ID, UNICHAR_ID>* class_cb,
|
|
TessResultCallback2<bool, const TBOX&, const TBOX&>* box_cb,
|
|
|
|
BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
|
bool modified = false;
|
|
for (int i = 0; i + 1 < best_choice->length(); ++i) {
|
|
UNICHAR_ID new_id = class_cb->Run(best_choice->unichar_id(i),
|
|
best_choice->unichar_id(i+1));
|
|
if (new_id != INVALID_UNICHAR_ID &&
|
|
(box_cb == NULL || box_cb->Run(box_word->BlobBox(i),
|
|
box_word->BlobBox(i + 1)))) {
|
|
if (reject_map.length() == best_choice->length())
|
|
reject_map.remove_pos(i);
|
|
best_choice->set_unichar_id(new_id, i);
|
|
best_choice->remove_unichar_id(i + 1);
|
|
raw_choice->set_unichar_id(new_id, i);
|
|
raw_choice->remove_unichar_id(i + 1);
|
|
modified = true;
|
|
rebuild_word->MergeBlobs(i, i + 2);
|
|
box_word->MergeBoxes(i, i + 2);
|
|
if (i + 1 < best_state.length()) {
|
|
best_state[i] += best_state[i + 1];
|
|
best_state.remove(i + 1);
|
|
}
|
|
|
|
BLOB_CHOICE_LIST_C_IT blob_choices_it(blob_choices);
|
|
for (int j = 0; j < i; ++j)
|
|
blob_choices_it.forward();
|
|
BLOB_CHOICE_IT it1(blob_choices_it.data()); // first choices
|
|
BLOB_CHOICE_LIST* target_choices = blob_choices_it.data_relative(1);
|
|
BLOB_CHOICE_IT it2(target_choices); // second choices
|
|
float certainty = it2.data()->certainty();
|
|
float rating = it2.data()->rating();
|
|
if (it1.data()->certainty() < certainty) {
|
|
certainty = it1.data()->certainty();
|
|
rating = it1.data()->rating();
|
|
target_choices = blob_choices_it.data();
|
|
blob_choices_it.forward();
|
|
}
|
|
delete blob_choices_it.extract(); // get rid of spare
|
|
// TODO(rays) Fix the choices so they contain the desired result.
|
|
// Do we really need to ? Only needed for fix_quotes, which should be
|
|
// going away.
|
|
}
|
|
}
|
|
delete class_cb;
|
|
delete box_cb;
|
|
return modified;
|
|
}
|
|
|
|
// TODO(tkielbus) Decide between keeping this behavior here or modifying the
|
|
// training data.
|
|
|
|
// Utility function for fix_quotes
|
|
// Return true if the next character in the string (given the UTF8 length in
|
|
// bytes) is a quote character.
|
|
static int is_simple_quote(const char* signed_str, int length) {
|
|
const unsigned char* str =
|
|
reinterpret_cast<const unsigned char*>(signed_str);
|
|
// Standard 1 byte quotes.
|
|
return (length == 1 && (*str == '\'' || *str == '`')) ||
|
|
// UTF-8 3 bytes curved quotes.
|
|
(length == 3 && ((*str == 0xe2 &&
|
|
*(str + 1) == 0x80 &&
|
|
*(str + 2) == 0x98) ||
|
|
(*str == 0xe2 &&
|
|
*(str + 1) == 0x80 &&
|
|
*(str + 2) == 0x99)));
|
|
}
|
|
|
|
// Callback helper for fix_quotes returns a double quote if both
|
|
// arguments are quote, otherwise INVALID_UNICHAR_ID.
|
|
UNICHAR_ID WERD_RES::BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2) {
|
|
const char *ch = uch_set->id_to_unichar(id1);
|
|
const char *next_ch = uch_set->id_to_unichar(id2);
|
|
if (is_simple_quote(ch, strlen(ch)) &&
|
|
is_simple_quote(next_ch, strlen(next_ch)))
|
|
return uch_set->unichar_to_id("\"");
|
|
return INVALID_UNICHAR_ID;
|
|
}
|
|
|
|
// Change pairs of quotes to double quotes.
|
|
void WERD_RES::fix_quotes(BLOB_CHOICE_LIST_CLIST* blob_choices) {
|
|
if (!uch_set->contains_unichar("\"") ||
|
|
!uch_set->get_enabled(uch_set->unichar_to_id("\"")))
|
|
return; // Don't create it if it is disallowed.
|
|
|
|
ConditionalBlobMerge(
|
|
NewPermanentTessCallback(this, &WERD_RES::BothQuotes),
|
|
NULL,
|
|
blob_choices);
|
|
}
|
|
|
|
// Callback helper for fix_hyphens returns UNICHAR_ID of - if both
|
|
// arguments are hyphen, otherwise INVALID_UNICHAR_ID.
|
|
UNICHAR_ID WERD_RES::BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2) {
|
|
const char *ch = uch_set->id_to_unichar(id1);
|
|
const char *next_ch = uch_set->id_to_unichar(id2);
|
|
if (strlen(ch) == 1 && strlen(next_ch) == 1 &&
|
|
(*ch == '-' || *ch == '~') && (*next_ch == '-' || *next_ch == '~'))
|
|
return uch_set->unichar_to_id("-");
|
|
return INVALID_UNICHAR_ID;
|
|
}
|
|
|
|
// Callback helper for fix_hyphens returns true if box1 and box2 overlap
|
|
// (assuming both on the same textline, are in order and a chopped em dash.)
|
|
bool WERD_RES::HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2) {
|
|
return box1.right() >= box2.left();
|
|
}
|
|
|
|
// Change pairs of hyphens to a single hyphen if the bounding boxes touch
|
|
// Typically a long dash which has been segmented.
|
|
void WERD_RES::fix_hyphens(BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
|
if (!uch_set->contains_unichar("-") ||
|
|
!uch_set->get_enabled(uch_set->unichar_to_id("-")))
|
|
return; // Don't create it if it is disallowed.
|
|
|
|
ConditionalBlobMerge(
|
|
NewPermanentTessCallback(this, &WERD_RES::BothHyphens),
|
|
NewPermanentTessCallback(this, &WERD_RES::HyphenBoxesOverlap),
|
|
blob_choices);
|
|
}
|
|
|
|
// Callback helper for merge_tess_fails returns a space if both
|
|
// arguments are space, otherwise INVALID_UNICHAR_ID.
|
|
UNICHAR_ID WERD_RES::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) {
|
|
if (id1 == id2 && id1 == uch_set->unichar_to_id(" "))
|
|
return id1;
|
|
else
|
|
return INVALID_UNICHAR_ID;
|
|
}
|
|
|
|
// Change pairs of tess failures to a single one
|
|
void WERD_RES::merge_tess_fails() {
|
|
if (ConditionalBlobMerge(
|
|
NewPermanentTessCallback(this, &WERD_RES::BothSpaces), NULL,
|
|
best_choice->blob_choices())) {
|
|
int len = best_choice->length();
|
|
ASSERT_HOST(reject_map.length() == len);
|
|
ASSERT_HOST(box_word->length() == len);
|
|
}
|
|
}
|
|
|
|
// Returns true if the collection of count pieces, starting at start, are all
|
|
// natural connected components, ie there are no real chops involved.
|
|
bool WERD_RES::PiecesAllNatural(int start, int count) const {
|
|
// all seams must have no splits.
|
|
for (int index = start; index < start + count - 1; ++index) {
|
|
if (index >= 0 && index < array_count(seam_array)) {
|
|
SEAM* seam = reinterpret_cast<SEAM *>(array_value(seam_array, index));
|
|
if (seam != NULL && seam->split1 != NULL)
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
|
|
WERD_RES::~WERD_RES () {
|
|
Clear();
|
|
}
|
|
|
|
void WERD_RES::InitNonPointers() {
|
|
tess_failed = FALSE;
|
|
tess_accepted = FALSE;
|
|
tess_would_adapt = FALSE;
|
|
done = FALSE;
|
|
unlv_crunch_mode = CR_NONE;
|
|
small_caps = false;
|
|
italic = FALSE;
|
|
bold = FALSE;
|
|
// The fontinfos and tesseract count as non-pointers as they point to
|
|
// data owned elsewhere.
|
|
fontinfo = NULL;
|
|
fontinfo2 = NULL;
|
|
tesseract = NULL;
|
|
fontinfo_id_count = 0;
|
|
fontinfo_id2_count = 0;
|
|
x_height = 0.0;
|
|
caps_height = 0.0;
|
|
guessed_x_ht = TRUE;
|
|
guessed_caps_ht = TRUE;
|
|
combination = FALSE;
|
|
part_of_combo = FALSE;
|
|
reject_spaces = FALSE;
|
|
}
|
|
|
|
void WERD_RES::InitPointers() {
|
|
word = NULL;
|
|
bln_boxes = NULL;
|
|
uch_set = NULL;
|
|
chopped_word = NULL;
|
|
rebuild_word = NULL;
|
|
box_word = NULL;
|
|
seam_array = NULL;
|
|
best_choice = NULL;
|
|
raw_choice = NULL;
|
|
ep_choice = NULL;
|
|
blamer_bundle = NULL;
|
|
}
|
|
|
|
void WERD_RES::Clear() {
|
|
if (word != NULL && combination) {
|
|
delete word;
|
|
}
|
|
word = NULL;
|
|
delete blamer_bundle;
|
|
blamer_bundle = NULL;
|
|
ClearResults();
|
|
}
|
|
|
|
void WERD_RES::ClearResults() {
|
|
done = false;
|
|
fontinfo = NULL;
|
|
fontinfo2 = NULL;
|
|
fontinfo_id_count = 0;
|
|
fontinfo_id2_count = 0;
|
|
if (bln_boxes != NULL) {
|
|
delete bln_boxes;
|
|
bln_boxes = NULL;
|
|
}
|
|
if (chopped_word != NULL) {
|
|
delete chopped_word;
|
|
chopped_word = NULL;
|
|
}
|
|
if (rebuild_word != NULL) {
|
|
delete rebuild_word;
|
|
rebuild_word = NULL;
|
|
}
|
|
if (box_word != NULL) {
|
|
delete box_word;
|
|
box_word = NULL;
|
|
}
|
|
best_state.clear();
|
|
correct_text.clear();
|
|
if (seam_array != NULL) {
|
|
free_seam_list(seam_array);
|
|
seam_array = NULL;
|
|
}
|
|
if (best_choice != NULL) {
|
|
delete best_choice;
|
|
delete raw_choice;
|
|
best_choice = NULL;
|
|
raw_choice = NULL;
|
|
}
|
|
if (!alt_choices.empty()) {
|
|
alt_choices.delete_data_pointers();
|
|
alt_choices.clear();
|
|
}
|
|
alt_states.clear();
|
|
if (ep_choice != NULL) {
|
|
delete ep_choice;
|
|
ep_choice = NULL;
|
|
}
|
|
if (blamer_bundle != NULL) blamer_bundle->ClearResults();
|
|
}
|
|
|
|
bool PAGE_RES_IT::operator ==(const PAGE_RES_IT &other) const {
|
|
return word_res == other.word_res &&
|
|
row_res == other.row_res &&
|
|
block_res == other.block_res;
|
|
}
|
|
|
|
int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const {
|
|
ASSERT_HOST(page_res == other.page_res);
|
|
if (other.block_res == NULL) {
|
|
// other points to the end of the page.
|
|
if (block_res == NULL)
|
|
return 0;
|
|
return -1;
|
|
}
|
|
if (block_res == NULL) {
|
|
return 1; // we point to the end of the page.
|
|
}
|
|
if (block_res == other.block_res) {
|
|
if (other.row_res == NULL || row_res == NULL) {
|
|
// this should only happen if we hit an image block.
|
|
return 0;
|
|
}
|
|
if (row_res == other.row_res) {
|
|
// we point to the same block and row.
|
|
ASSERT_HOST(other.word_res != NULL && word_res != NULL);
|
|
if (word_res == other.word_res) {
|
|
// we point to the same word!
|
|
return 0;
|
|
}
|
|
|
|
WERD_RES_IT word_res_it(&row_res->word_res_list);
|
|
for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
|
|
word_res_it.forward()) {
|
|
if (word_res_it.data() == word_res) {
|
|
return -1;
|
|
} else if (word_res_it.data() == other.word_res) {
|
|
return 1;
|
|
}
|
|
}
|
|
ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
|
|
}
|
|
|
|
// we both point to the same block, but different rows.
|
|
ROW_RES_IT row_res_it(&block_res->row_res_list);
|
|
for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
|
|
row_res_it.forward()) {
|
|
if (row_res_it.data() == row_res) {
|
|
return -1;
|
|
} else if (row_res_it.data() == other.row_res) {
|
|
return 1;
|
|
}
|
|
}
|
|
ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
|
|
}
|
|
|
|
// We point to different blocks.
|
|
BLOCK_RES_IT block_res_it(&page_res->block_res_list);
|
|
for (block_res_it.mark_cycle_pt();
|
|
!block_res_it.cycled_list(); block_res_it.forward()) {
|
|
if (block_res_it.data() == block_res) {
|
|
return -1;
|
|
} else if (block_res_it.data() == other.block_res) {
|
|
return 1;
|
|
}
|
|
}
|
|
// Shouldn't happen...
|
|
ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
|
|
return 0;
|
|
}
|
|
|
|
// Inserts the new_word and a corresponding WERD_RES before the current
|
|
// position. The simple fields of the WERD_RES are copied from clone_res and
|
|
// the resulting WERD_RES is returned for further setup with best_choice etc.
|
|
WERD_RES* PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES& clone_res,
|
|
WERD* new_word) {
|
|
// Insert new_word into the ROW.
|
|
WERD_IT w_it(row()->row->word_list());
|
|
for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
|
|
WERD* word = w_it.data();
|
|
if (word == word_res->word)
|
|
break;
|
|
}
|
|
ASSERT_HOST(!w_it.cycled_list());
|
|
w_it.add_before_then_move(new_word);
|
|
// Make a WERD_RES for the new_word.
|
|
WERD_RES* new_res = new WERD_RES(new_word);
|
|
new_res->CopySimpleFields(clone_res);
|
|
// Insert into the appropriate place in the ROW_RES.
|
|
WERD_RES_IT wr_it(&row()->word_res_list);
|
|
for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
|
|
WERD_RES* word = wr_it.data();
|
|
if (word == word_res)
|
|
break;
|
|
}
|
|
ASSERT_HOST(!wr_it.cycled_list());
|
|
wr_it.add_before_then_move(new_res);
|
|
if (wr_it.at_first()) {
|
|
// This is the new first word, so reset the member iterator so it
|
|
// detects the cycled_list state correctly.
|
|
ResetWordIterator();
|
|
}
|
|
return new_res;
|
|
}
|
|
|
|
// Deletes the current WERD_RES and its underlying WERD.
|
|
void PAGE_RES_IT::DeleteCurrentWord() {
|
|
// Check that this word is as we expect. part_of_combos are NEVER iterated
|
|
// by the normal iterator, so we should never be trying to delete them.
|
|
ASSERT_HOST(!word_res->part_of_combo);
|
|
if (!word_res->combination) {
|
|
// Combinations own their own word, so we won't find the word on the
|
|
// row's word_list, but it is legitimate to try to delete them.
|
|
// Delete word from the ROW when not a combination.
|
|
WERD_IT w_it(row()->row->word_list());
|
|
for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
|
|
if (w_it.data() == word_res->word) {
|
|
break;
|
|
}
|
|
}
|
|
ASSERT_HOST(!w_it.cycled_list());
|
|
delete w_it.extract();
|
|
}
|
|
// Remove the WERD_RES for the new_word.
|
|
// Remove the WORD_RES from the ROW_RES.
|
|
WERD_RES_IT wr_it(&row()->word_res_list);
|
|
for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
|
|
if (wr_it.data() == word_res) {
|
|
word_res = NULL;
|
|
break;
|
|
}
|
|
}
|
|
ASSERT_HOST(!wr_it.cycled_list());
|
|
delete wr_it.extract();
|
|
ResetWordIterator();
|
|
}
|
|
|
|
/*************************************************************************
|
|
* PAGE_RES_IT::restart_page
|
|
*
|
|
* Set things up at the start of the page
|
|
*************************************************************************/
|
|
|
|
WERD_RES *PAGE_RES_IT::start_page(bool empty_ok) {
|
|
block_res_it.set_to_list(&page_res->block_res_list);
|
|
block_res_it.mark_cycle_pt();
|
|
prev_block_res = NULL;
|
|
prev_row_res = NULL;
|
|
prev_word_res = NULL;
|
|
block_res = NULL;
|
|
row_res = NULL;
|
|
word_res = NULL;
|
|
next_block_res = NULL;
|
|
next_row_res = NULL;
|
|
next_word_res = NULL;
|
|
internal_forward(true, empty_ok);
|
|
return internal_forward(false, empty_ok);
|
|
}
|
|
|
|
// Recovers from operations on the current word, such as in InsertCloneWord
|
|
// and DeleteCurrentWord.
|
|
// Resets the word_res_it so that it is one past the next_word_res, as
|
|
// it should be after internal_forward. If next_row_res != row_res,
|
|
// then the next_word_res is in the next row, so there is no need to do
|
|
// anything, since operations on the current word will not have disturbed
|
|
// the word_res_it.
|
|
void PAGE_RES_IT::ResetWordIterator() {
|
|
if (row_res == next_row_res) {
|
|
// Reset the member iterator so it can move forward and detect the
|
|
// cycled_list state correctly.
|
|
word_res_it.move_to_first();
|
|
word_res_it.mark_cycle_pt();
|
|
while (!word_res_it.cycled_list() && word_res_it.data() != next_word_res)
|
|
word_res_it.forward();
|
|
ASSERT_HOST(!word_res_it.cycled_list());
|
|
word_res_it.forward();
|
|
}
|
|
}
|
|
|
|
/*************************************************************************
|
|
* PAGE_RES_IT::internal_forward
|
|
*
|
|
* Find the next word on the page. If empty_ok is true, then non-text blocks
|
|
* and text blocks with no text are visited as if they contain a single
|
|
* imaginary word in a single imaginary row. (word() and row() both return NULL
|
|
* in such a block and the return value is NULL.)
|
|
* If empty_ok is false, the old behaviour is maintained. Each real word
|
|
* is visited and empty and non-text blocks and rows are skipped.
|
|
* new_block is used to initialize the iterators for a new block.
|
|
* The iterator maintains pointers to block, row and word for the previous,
|
|
* current and next words. These are correct, regardless of block/row
|
|
* boundaries. NULL values denote start and end of the page.
|
|
*************************************************************************/
|
|
|
|
WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
|
|
bool new_row = false;
|
|
|
|
prev_block_res = block_res;
|
|
prev_row_res = row_res;
|
|
prev_word_res = word_res;
|
|
block_res = next_block_res;
|
|
row_res = next_row_res;
|
|
word_res = next_word_res;
|
|
next_block_res = NULL;
|
|
next_row_res = NULL;
|
|
next_word_res = NULL;
|
|
|
|
while (!block_res_it.cycled_list()) {
|
|
if (new_block) {
|
|
new_block = false;
|
|
row_res_it.set_to_list(&block_res_it.data()->row_res_list);
|
|
row_res_it.mark_cycle_pt();
|
|
if (row_res_it.empty() && empty_ok) {
|
|
next_block_res = block_res_it.data();
|
|
break;
|
|
}
|
|
new_row = true;
|
|
}
|
|
while (!row_res_it.cycled_list()) {
|
|
if (new_row) {
|
|
new_row = false;
|
|
word_res_it.set_to_list(&row_res_it.data()->word_res_list);
|
|
word_res_it.mark_cycle_pt();
|
|
}
|
|
// Skip any part_of_combo words.
|
|
while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo)
|
|
word_res_it.forward();
|
|
if (!word_res_it.cycled_list()) {
|
|
next_block_res = block_res_it.data();
|
|
next_row_res = row_res_it.data();
|
|
next_word_res = word_res_it.data();
|
|
word_res_it.forward();
|
|
goto foundword;
|
|
}
|
|
// end of row reached
|
|
row_res_it.forward();
|
|
new_row = true;
|
|
}
|
|
// end of block reached
|
|
block_res_it.forward();
|
|
new_block = true;
|
|
}
|
|
foundword:
|
|
// Update prev_word_best_choice pointer.
|
|
if (page_res != NULL && page_res->prev_word_best_choice != NULL) {
|
|
*page_res->prev_word_best_choice =
|
|
(new_block || prev_word_res == NULL) ? NULL : prev_word_res->best_choice;
|
|
}
|
|
return word_res;
|
|
}
|
|
|
|
/*************************************************************************
|
|
* PAGE_RES_IT::restart_row()
|
|
*
|
|
* Move to the beginning (leftmost word) of the current row.
|
|
*************************************************************************/
|
|
WERD_RES *PAGE_RES_IT::restart_row() {
|
|
ROW_RES *row = this->row();
|
|
if (!row) return NULL;
|
|
for (restart_page(); this->row() != row; forward()) {
|
|
// pass
|
|
}
|
|
return word();
|
|
}
|
|
|
|
/*************************************************************************
|
|
* PAGE_RES_IT::forward_paragraph
|
|
*
|
|
* Move to the beginning of the next paragraph, allowing empty blocks.
|
|
*************************************************************************/
|
|
|
|
WERD_RES *PAGE_RES_IT::forward_paragraph() {
|
|
while (block_res == next_block_res &&
|
|
(next_row_res != NULL && next_row_res->row != NULL &&
|
|
row_res->row->para() == next_row_res->row->para())) {
|
|
internal_forward(false, true);
|
|
}
|
|
return internal_forward(false, true);
|
|
}
|
|
|
|
/*************************************************************************
|
|
* PAGE_RES_IT::forward_block
|
|
*
|
|
* Move to the beginning of the next block, allowing empty blocks.
|
|
*************************************************************************/
|
|
|
|
WERD_RES *PAGE_RES_IT::forward_block() {
|
|
while (block_res == next_block_res) {
|
|
internal_forward(false, true);
|
|
}
|
|
return internal_forward(false, true);
|
|
}
|
|
|
|
void PAGE_RES_IT::rej_stat_word() {
|
|
inT16 chars_in_word;
|
|
inT16 rejects_in_word = 0;
|
|
|
|
chars_in_word = word_res->reject_map.length ();
|
|
page_res->char_count += chars_in_word;
|
|
block_res->char_count += chars_in_word;
|
|
row_res->char_count += chars_in_word;
|
|
|
|
rejects_in_word = word_res->reject_map.reject_count ();
|
|
|
|
page_res->rej_count += rejects_in_word;
|
|
block_res->rej_count += rejects_in_word;
|
|
row_res->rej_count += rejects_in_word;
|
|
if (chars_in_word == rejects_in_word)
|
|
row_res->whole_word_rej_count += rejects_in_word;
|
|
}
|