Major refactor of beam search, elimination of dead code, misc bug fixes, updates to Makefile.am, Changelog etc.

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@878 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
theraysmith@gmail.com 2013-09-23 15:26:50 +00:00
parent 2c909702c9
commit 4d514d5a60
187 changed files with 40713 additions and 14004 deletions

View File

@ -1,3 +1,15 @@
2013-09-20 - v3.03
* Added Renderer to API to allow document-level processing and output
of document formats, like hOCR, PDF.
* Major refactor of word-level recognition, beam search, eliminating dead code.
* Refactored classifier to make it easier to add new ones.
* Generalized feature extractor to allow feature extraction from greyscale.
* Improved sub/superscript treatment.
* Improved baseline fit.
* Added set_unicharset_properties to training tools.
* Many bug fixes.
2012-02-01 - v3.02
* Moved ResultIterator/PageIterator to ccmain.
* Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic.

View File

@ -9,7 +9,7 @@ if VISIBILITY
AM_CPPFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden
endif
include_HEADERS = apitypes.h baseapi.h capi.h
include_HEADERS = apitypes.h baseapi.h capi.h renderer.h
lib_LTLIBRARIES =
if !USING_MULTIPLELIBS
@ -35,7 +35,7 @@ libtesseract_api_la_CPPFLAGS = $(AM_CPPFLAGS)
if VISIBILITY
libtesseract_api_la_CPPFLAGS += -DTESS_EXPORTS
endif
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp
lib_LTLIBRARIES += libtesseract.la
libtesseract_la_LDFLAGS =

View File

@ -2,6 +2,8 @@
# define TESS_CAPI_INCLUDE_BASEAPI
#endif
#include "capi.h"
#include "genericvector.h"
#include "strngs.h"
TESS_API const char* TESS_CALL TessVersion()
{
@ -382,10 +384,10 @@ TESS_API BOOL TESS_CALL TessBaseAPIDetectOS(TessBaseAPI* handle, OSResults* resu
return handle->DetectOS(results) ? TRUE : FALSE;
}
TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, const DENORM* denorm, INT_FEATURE_ARRAY int_features,
TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, INT_FEATURE_STRUCT* int_features,
int* num_features, int* FeatureOutlineIndex)
{
handle->GetFeaturesForBlob(blob, *denorm, int_features, num_features, FeatureOutlineIndex);
handle->GetFeaturesForBlob(blob, int_features, num_features, FeatureOutlineIndex);
}
TESS_API ROW* TESS_CALL TessFindRowForBox(BLOCK_LIST* blocks, int left, int top, int right, int bottom)
@ -393,10 +395,10 @@ TESS_API ROW* TESS_CALL TessFindRowForBox(BLOCK_LIST* blocks, int left, int top,
return TessBaseAPI::FindRowForBox(blocks, left, top, right, bottom);
}
TESS_API void TESS_CALL TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob, const DENORM* denorm, int num_max_matches,
TESS_API void TESS_CALL TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob, int num_max_matches,
int* unichar_ids, float* ratings, int* num_matches_returned)
{
handle->RunAdaptiveClassifier(blob, *denorm, num_max_matches, unichar_ids, ratings, num_matches_returned);
handle->RunAdaptiveClassifier(blob, num_max_matches, unichar_ids, ratings, num_matches_returned);
}
TESS_API const char* TESS_CALL TessBaseAPIGetUnichar(TessBaseAPI* handle, int unichar_id)
@ -424,9 +426,9 @@ TESS_API TBLOB* TESS_CALL TessMakeTBLOB(struct Pix *pix)
return TessBaseAPI::MakeTBLOB(pix);
}
TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB *tblob, ROW *row, BOOL numeric_mode, DENORM *denorm)
TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB *tblob, ROW *row, BOOL numeric_mode)
{
TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode != FALSE, denorm);
TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode != FALSE);
}
TESS_API TessOcrEngineMode TESS_CALL TessBaseAPIOem(const TessBaseAPI* handle)

View File

@ -205,11 +205,11 @@ TESS_API void TESS_CALL TessBaseAPISetProbabilityInContextFunc(TessBaseAPI* han
TESS_API void TESS_CALL TessBaseAPISetFillLatticeFunc(TessBaseAPI* handle, TessFillLatticeFunc f);
TESS_API BOOL TESS_CALL TessBaseAPIDetectOS(TessBaseAPI* handle, OSResults* results);
TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, const DENORM* denorm, INT_FEATURE_ARRAY int_features,
TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, INT_FEATURE_STRUCT* int_features,
int* num_features, int* FeatureOutlineIndex);
TESS_API ROW* TESS_CALL TessFindRowForBox(BLOCK_LIST* blocks, int left, int top, int right, int bottom);
TESS_API void TESS_CALL TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob, const DENORM* denorm, int num_max_matches,
TESS_API void TESS_CALL TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob, int num_max_matches,
int* unichar_ids, float* ratings, int* num_matches_returned);
#endif
@ -226,7 +226,7 @@ TESS_API int TESS_CALL TessBaseAPINumDawgs(const TessBaseAPI* handle);
TESS_API ROW* TESS_CALL TessMakeTessOCRRow(float baseline, float xheight, float descender, float ascender);
TESS_API TBLOB*
TESS_CALL TessMakeTBLOB(Pix *pix);
TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB *tblob, ROW *row, BOOL numeric_mode, DENORM *denorm);
TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB *tblob, ROW *row, BOOL numeric_mode);
TESS_API TessOcrEngineMode
TESS_CALL TessBaseAPIOem(const TessBaseAPI* handle);

View File

@ -19,7 +19,7 @@ noinst_HEADERS = \
equationdetect.h fixspace.h imgscale.h mutableiterator.h osdetect.h \
output.h paragraphs.h paragraphs_internal.h paramsd.h pgedit.h \
reject.h scaleimg.h tessbox.h tessedit.h tesseractclass.h \
tesseract_cube_combiner.h tessvars.h tfacep.h tfacepp.h werdit.h
tesseract_cube_combiner.h tessvars.h werdit.h
if !USING_MULTIPLELIBS
noinst_LTLIBRARIES = libtesseract_main.la
@ -46,7 +46,7 @@ libtesseract_main_la_SOURCES = \
imgscale.cpp ltrresultiterator.cpp \
osdetect.cpp output.cpp pageiterator.cpp pagesegmain.cpp \
pagewalk.cpp paragraphs.cpp paramsd.cpp pgedit.cpp recogtraining.cpp \
reject.cpp resultiterator.cpp scaleimg.cpp \
reject.cpp resultiterator.cpp scaleimg.cpp superscript.cpp \
tesseract_cube_combiner.cpp \
tessbox.cpp tessedit.cpp tesseractclass.cpp tessvars.cpp \
tfacepp.cpp thresholder.cpp \

View File

@ -114,27 +114,12 @@ BOOL8 Tesseract::word_adaptable( //should we adapt?
return FALSE;
}
// if (flags.bit (CHECK_AMBIG_WERD) && test_ambig_word (word))
if (flags.bit (CHECK_AMBIG_WERD) &&
!getDict().NoDangerousAmbig(word->best_choice, NULL, false, NULL, NULL)) {
word->best_choice->dangerous_ambig_found()) {
if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
return FALSE;
}
// Do not adapt to words that are composed from fragments if
// tessedit_adapt_to_char_fragments is false.
if (!tessedit_adapt_to_char_fragments) {
const char *fragment_lengths = word->best_choice->fragment_lengths();
if (fragment_lengths != NULL && *fragment_lengths != '\0') {
for (int i = 0; i < word->best_choice->length(); ++i) {
if (fragment_lengths[i] > 1) {
if (tessedit_adaption_debug) tprintf("won't adapt to fragments\n");
return false; // found a character composed from fragments
}
}
}
}
if (tessedit_adaption_debug) {
tprintf("returning status %d\n", status);
}

View File

@ -235,21 +235,6 @@ PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
return page_res;
}
// Helper to make a WERD_CHOICE from the BLOB_CHOICE_LIST_VECTOR using only
// the top choices. Avoids problems with very long words.
static void MakeWordChoice(const BLOB_CHOICE_LIST_VECTOR& char_choices,
const UNICHARSET& unicharset,
WERD_CHOICE* word_choice) {
*word_choice = WERD_CHOICE(&unicharset); // clear the word choice.
word_choice->make_bad();
for (int i = 0; i < char_choices.size(); ++i) {
BLOB_CHOICE_IT it(char_choices[i]);
BLOB_CHOICE* bc = it.data();
word_choice->append_unichar_id(bc->unichar_id(), 1,
bc->rating(), bc->certainty());
}
}
// Tests the chopper by exhaustively running chop_one_blob.
// The word_res will contain filled chopped_word, seam_array, denorm,
// box_word and best_state for the maximally chopped word.
@ -257,7 +242,8 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
BLOCK* block, ROW* row,
WERD_RES* word_res) {
if (!word_res->SetupForTessRecognition(unicharset, this, BestPix(), false,
this->textord_use_cjk_fp_model,
textord_use_cjk_fp_model,
poly_allow_detailed_fx,
row, block)) {
word_res->CloneChoppedToRebuild();
return;
@ -266,13 +252,10 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
tprintf("Maximally chopping word at:");
word_res->word->bounding_box().print();
}
blob_match_table.init_match_table();
BLOB_CHOICE_LIST *match_result;
BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
ASSERT_HOST(word_res->chopped_word->blobs != NULL);
GenericVector<BLOB_CHOICE*> blob_choices;
ASSERT_HOST(!word_res->chopped_word->blobs.empty());
float rating = static_cast<float>(MAX_INT8);
for (TBLOB* blob = word_res->chopped_word->blobs; blob != NULL;
blob = blob->next) {
for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
// The rating and certainty are not quite arbitrary. Since
// select_blob_to_chop uses the worst certainty to choose, they all have
// to be different, so starting with MAX_INT8, subtract 1/8 for each blob
@ -281,32 +264,33 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
// produced, however much chopping is required. The chops are thus only
// limited by the ability of the chopper to find suitable chop points,
// and not by the value of the certainties.
match_result = fake_classify_blob(0, rating, -rating);
modify_blob_choice(match_result, 0);
ASSERT_HOST(!match_result->empty());
*char_choices += match_result;
BLOB_CHOICE* choice =
new BLOB_CHOICE(0, rating, -rating, -1, -1, 0, 0, 0, 0, BCC_FAKE);
blob_choices.push_back(choice);
rating -= 0.125f;
}
inT32 blob_number;
const double e = exp(1.0); // The base of natural logs.
int blob_number;
int right_chop_index = 0;
if (!assume_fixed_pitch_char_segment) {
// We only chop if the language is not fixed pitch like CJK.
if (prioritize_division) {
while (chop_one_blob2(boxes, word_res, &word_res->seam_array));
} else {
while (chop_one_blob(word_res->chopped_word, char_choices,
&blob_number, &word_res->seam_array,
&right_chop_index));
SEAM* seam = NULL;
while ((seam = chop_one_blob(boxes, blob_choices, word_res,
&blob_number)) != NULL) {
word_res->InsertSeam(blob_number, seam);
BLOB_CHOICE* left_choice = blob_choices[blob_number];
rating = left_choice->rating() / e;
left_choice->set_rating(rating);
left_choice->set_certainty(-rating);
// combine confidence w/ serial #
BLOB_CHOICE* right_choice = new BLOB_CHOICE(++right_chop_index,
rating - 0.125f, -rating,
-1, -1, 0, 0, 0, 0, BCC_FAKE);
blob_choices.insert(right_choice, blob_number + 1);
}
}
MakeWordChoice(*char_choices, unicharset, word_res->best_choice);
MakeWordChoice(*char_choices, unicharset, word_res->raw_choice);
word_res->CloneChoppedToRebuild();
blob_match_table.end_match_table();
if (char_choices != NULL) {
char_choices->delete_data_pointers();
delete char_choices;
}
word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
}
// Helper to compute the dispute resolution metric.
@ -558,7 +542,6 @@ bool Tesseract::ConvertStringToUnichars(const char* utf8,
// substitutions ARE used.
bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
WERD_RES* word_res) {
blob_match_table.init_match_table();
// Classify all required combinations of blobs and save results in choices.
int word_length = word_res->box_word->length();
GenericVector<BLOB_CHOICE_LIST*>* choices =
@ -566,8 +549,8 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
for (int i = 0; i < word_length; ++i) {
for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
BLOB_CHOICE_LIST* match_result = classify_piece(
word_res->chopped_word->blobs, word_res->denorm, word_res->seam_array,
i, i + j - 1, word_res->blamer_bundle);
word_res->seam_array, i, i + j - 1, "Applybox",
word_res->chopped_word, word_res->blamer_bundle);
if (applybox_debug > 2) {
tprintf("%d+%d:", i, j);
print_ratings_list("Segment:", match_result, unicharset);
@ -583,7 +566,6 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
float best_rating = 0.0f;
SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
&search_segmentation, &best_rating, &word_res->best_state);
blob_match_table.end_match_table();
for (int i = 0; i < word_length; ++i)
choices[i].delete_data_pointers();
delete [] choices;
@ -591,9 +573,8 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
// Build the original segmentation and if it is the same length as the
// truth, assume it will do.
int blob_count = 1;
for (int s = 0; s < array_count(word_res->seam_array); ++s) {
SEAM* seam =
reinterpret_cast<SEAM*>(array_value(word_res->seam_array, s));
for (int s = 0; s < word_res->seam_array.size(); ++s) {
SEAM* seam = word_res->seam_array[s];
if (seam->split1 == NULL) {
word_res->best_state.push_back(blob_count);
blob_count = 1;
@ -707,21 +688,25 @@ void Tesseract::TidyUp(PAGE_RES* page_res) {
WERD_RES* word_res;
for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
int ok_in_word = 0;
BLOB_CHOICE_LIST_VECTOR char_choices;
for (int i = word_res->correct_text.size() - 1; i >= 0; i--) {
if (word_res->correct_text[i].length() > 0) {
int blob_count = word_res->correct_text.size();
WERD_CHOICE* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
word_choice->set_permuter(TOP_CHOICE_PERM);
for (int c = 0; c < blob_count; ++c) {
if (word_res->correct_text[c].length() > 0) {
++ok_in_word;
}
// Since we only need a fake word_res->best_choice, the actual
// unichar_ids do not matter. Which is fortunate, since TidyUp()
// can be called while training Tesseract, at the stage where
// unicharset is not meaningful yet.
char_choices += fake_classify_blob(INVALID_UNICHAR_ID, 1.0, -1.0);
word_choice->append_unichar_id_space_allocated(
INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
}
if (ok_in_word > 0) {
ok_blob_count += ok_in_word;
bad_blob_count += word_res->correct_text.size() - ok_in_word;
MakeWordChoice(char_choices, unicharset, word_res->best_choice);
word_res->LogNewRawChoice(word_choice);
word_res->LogNewCookedChoice(1, false, word_choice);
} else {
++unlabelled_words;
if (applybox_debug > 0) {
@ -730,7 +715,6 @@ void Tesseract::TidyUp(PAGE_RES* page_res) {
}
pr_it.DeleteCurrentWord();
}
char_choices.delete_data_pointers();
}
pr_it.restart_page();
for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
@ -772,11 +756,13 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
GenericVector<STRING> tokens;
word_res->correct_text[i].split(' ', &tokens);
UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
choice->append_unichar_id_space_allocated(char_id, 1, 0.0f, 0.0f);
choice->append_unichar_id_space_allocated(char_id,
word_res->best_state[i],
0.0f, 0.0f);
}
if (word_res->best_choice != NULL)
delete word_res->best_choice;
word_res->best_choice = choice;
word_res->ClearWordChoices();
word_res->LogNewRawChoice(choice);
word_res->LogNewCookedChoice(1, false, choice);
}
}
@ -787,7 +773,7 @@ void Tesseract::ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res) {
int word_count = 0;
for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
word_res = pr_it.forward()) {
LearnWord(filename.string(), NULL, word_res);
LearnWord(filename.string(), word_res);
++word_count;
}
tprintf("Generated training data for %d words\n", word_count);

View File

@ -29,7 +29,6 @@
#include "ocrclass.h"
#include "werdit.h"
#include "drawfx.h"
#include "tfacep.h"
#include "tessbox.h"
#include "tessvars.h"
#include "pgedit.h"
@ -55,6 +54,9 @@
const char* const kBackUpConfigFile = "tempconfigdata.config";
// Multiple of x-height to make a repeated word have spaces in it.
const double kRepcharGapThreshold = 0.5;
// Min believable x-height for any text when refitting as a fraction of
// original x-height
const double kMinRefitXHeightFraction = 0.5;
/**
@ -293,9 +295,9 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
// Update misadaption log (we only need to do it on pass 1, since
// adaption only happens on this pass).
if (page_res_it.word()->blamer_bundle != NULL &&
page_res_it.word()->blamer_bundle->misadaption_debug.length() > 0) {
page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
page_res->misadaption_log.push_back(
page_res_it.word()->blamer_bundle->misadaption_debug);
page_res_it.word()->blamer_bundle->misadaption_debug());
}
page_res_it.forward();
@ -308,7 +310,8 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
page_res_it.restart_page();
word_index = 0;
most_recently_used_ = this;
while (!tessedit_test_adaption && page_res_it.word() != NULL) {
while (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption &&
page_res_it.word() != NULL) {
set_global_loc_code(LOC_PASS2);
word_index++;
if (monitor != NULL) {
@ -382,17 +385,6 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
blamer_pass(page_res);
}
if (!save_blob_choices) {
// We aren't saving the blob choices so get rid of them now.
// set_blob_choices() does a deep clear.
page_res_it.restart_page();
while (page_res_it.word() != NULL) {
WERD_RES* word = page_res_it.word();
word->best_choice->set_blob_choices(NULL);
page_res_it.forward();
}
}
// Write results pass.
set_global_loc_code(LOC_WRITE_RESULTS);
// This is now redundant, but retained commented to show how to obtain
@ -436,39 +428,21 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
continue;
}
// Two words sharing the same language model, excellent!
if (w->alt_choices.empty()) {
if (tessedit_bigram_debug) {
tprintf("Alt choices not set up for word choice: %s\n",
w->best_choice->unichar_string().string());
}
continue;
}
if (w_prev->alt_choices.empty()) {
if (tessedit_bigram_debug) {
tprintf("Alt choices not set up for word choice: %s\n",
w_prev->best_choice->unichar_string().string());
}
continue;
}
// We saved alternate choices, excellent!
GenericVector<WERD_CHOICE *> overrides_word1;
GenericVector<GenericVector<int> *> overrides_word1_state;
GenericVector<WERD_CHOICE *> overrides_word2;
GenericVector<GenericVector<int> *> overrides_word2_state;
STRING orig_w1_str = w_prev->best_choice->unichar_string();
STRING orig_w2_str = w->best_choice->unichar_string();
WERD_CHOICE prev_best(w->uch_set);
{
int w1start, w1end;
w_prev->WithoutFootnoteSpan(&w1start, &w1end);
w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
}
WERD_CHOICE this_best(w->uch_set);
{
int w2start, w2end;
w->WithoutFootnoteSpan(&w2start, &w2end);
w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
this_best = w->best_choice->shallow_copy(w2start, w2end);
}
@ -484,37 +458,36 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
orig_w1_str.string(), orig_w2_str.string());
}
if (tessedit_bigram_debug > 1) {
if (w_prev->alt_choices.size() > 1) {
print_word_alternates_list(w_prev->best_choice, &w_prev->alt_choices);
if (!w_prev->best_choices.singleton()) {
w_prev->PrintBestChoices();
}
if (w->alt_choices.size() > 1) {
print_word_alternates_list(w->best_choice, &w->alt_choices);
if (!w->best_choices.singleton()) {
w->PrintBestChoices();
}
}
float best_rating = 0.0;
int best_idx = 0;
for (int i = 0; i < w_prev->alt_choices.size(); i++) {
WERD_CHOICE *p1 = w_prev->alt_choices.get(i);
WERD_CHOICE_IT prev_it(&w_prev->best_choices);
for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
WERD_CHOICE *p1 = prev_it.data();
WERD_CHOICE strip1(w->uch_set);
{
int p1start, p1end;
w_prev->WithoutFootnoteSpan(*p1, w_prev->alt_states.get(i),
&p1start, &p1end);
p1->GetNonSuperscriptSpan(&p1start, &p1end);
strip1 = p1->shallow_copy(p1start, p1end);
}
for (int j = 0; j < w->alt_choices.size(); j++) {
WERD_CHOICE *p2 = w->alt_choices.get(j);
WERD_CHOICE_IT w_it(&w->best_choices);
for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
WERD_CHOICE *p2 = w_it.data();
WERD_CHOICE strip2(w->uch_set);
{
int p2start, p2end;
w->WithoutFootnoteSpan(*p2, w->alt_states.get(j), &p2start, &p2end);
p2->GetNonSuperscriptSpan(&p2start, &p2end);
strip2 = p2->shallow_copy(p2start, p2end);
}
if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
overrides_word1.push_back(p1);
overrides_word1_state.push_back(&w_prev->alt_states.get(i));
overrides_word2.push_back(p2);
overrides_word2_state.push_back(&w->alt_states.get(j));
if (overrides_word1.size() == 1 ||
p1->rating() + p2->rating() < best_rating) {
best_rating = p1->rating() + p2->rating();
@ -538,12 +511,10 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
if (new_w1_str != orig_w1_str) {
w_prev->ReplaceBestChoice(*overrides_word1[best_idx],
*overrides_word1_state[best_idx]);
w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
}
if (new_w2_str != orig_w2_str) {
w->ReplaceBestChoice(*overrides_word2[best_idx],
*overrides_word2_state[best_idx]);
w->ReplaceBestChoice(overrides_word2[best_idx]);
}
if (tessedit_bigram_debug > 0) {
STRING choices_description;
@ -684,34 +655,8 @@ void Tesseract::blamer_pass(PAGE_RES* page_res) {
for (page_res_it.restart_page(); page_res_it.word() != NULL;
page_res_it.forward()) {
WERD_RES *word = page_res_it.word();
if (word->blamer_bundle == NULL) {
word->blamer_bundle = new BlamerBundle();
word->blamer_bundle->incorrect_result_reason = IRR_PAGE_LAYOUT;
word->blamer_bundle->debug = word->blamer_bundle->IncorrectReason();
word->blamer_bundle->debug += " to blame";
} else if (word->blamer_bundle->incorrect_result_reason ==
IRR_NO_TRUTH) {
word->blamer_bundle->SetBlame(IRR_NO_TRUTH, "Rejected truth",
word->best_choice, wordrec_debug_blamer);
} else {
bool correct = ChoiceIsCorrect(*word->uch_set, word->best_choice,
word->blamer_bundle->truth_text);
IncorrectResultReason irr =
word->blamer_bundle->incorrect_result_reason;
if (irr == IRR_CORRECT && !correct) {
STRING debug = "Choice is incorrect after recognition";
word->blamer_bundle->SetBlame(IRR_UNKNOWN, debug,
word->best_choice,
wordrec_debug_blamer);
} else if (irr != IRR_CORRECT && correct) {
if (wordrec_debug_blamer) {
tprintf("Corrected %s\n", word->blamer_bundle->debug.string());
}
word->blamer_bundle->incorrect_result_reason = IRR_CORRECT;
word->blamer_bundle->debug = "";
}
}
page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason]++;
BlamerBundle::LastChanceBlame(wordrec_debug_blamer, word);
page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason()]++;
}
tprintf("Blame reasons:\n");
for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
@ -730,7 +675,9 @@ void Tesseract::blamer_pass(PAGE_RES* page_res) {
// Helper returns true if the new_word is better than the word, using a
// simple test of better certainty AND rating (to reduce false positives
// from cube) or a dictionary vs non-dictionary word.
static bool NewWordBetter(const WERD_RES& word, const WERD_RES& new_word) {
static bool NewWordBetter(const WERD_RES& word, const WERD_RES& new_word,
double rating_ratio,
double certainty_margin) {
if (new_word.best_choice == NULL) {
return false; // New one no good.
}
@ -742,7 +689,11 @@ static bool NewWordBetter(const WERD_RES& word, const WERD_RES& new_word) {
return true; // New word has better confidence.
}
if (!Dict::valid_word_permuter(word.best_choice->permuter(), false) &&
Dict::valid_word_permuter(new_word.best_choice->permuter(), false)) {
Dict::valid_word_permuter(new_word.best_choice->permuter(), false) &&
new_word.best_choice->rating() <
word.best_choice->rating() * rating_ratio &&
new_word.best_choice->certainty() >
word.best_choice->certainty() - certainty_margin) {
return true; // New word is from a dictionary.
}
return false; // New word is no better.
@ -764,7 +715,9 @@ bool Tesseract::RetryWithLanguage(WERD_RES *word, BLOCK* block, ROW *row,
// (to reduce false positives from cube) or a dictionary vs non-dictionary
// word.
(this->*recognizer)(block, row, &lang_word);
bool new_is_better = NewWordBetter(*word, lang_word);
bool new_is_better = NewWordBetter(*word, lang_word,
classify_max_rating_ratio,
classify_max_certainty_margin);
if (classify_debug_level || cube_debug_level) {
if (lang_word.best_choice == NULL) {
tprintf("New result %s better:%s\n",
@ -793,6 +746,7 @@ void Tesseract::classify_word_and_language(WordRecognizer recognizer,
BLOCK* block,
ROW *row,
WERD_RES *word) {
clock_t start_t = clock();
if (classify_debug_level || cube_debug_level) {
tprintf("Processing word with lang %s at:",
most_recently_used_->lang.string());
@ -811,12 +765,15 @@ void Tesseract::classify_word_and_language(WordRecognizer recognizer,
if (!word->tess_failed && word->tess_accepted)
result_type = "Accepted";
if (classify_debug_level || cube_debug_level) {
tprintf("%s result: %s r=%g, c=%g, accepted=%d, adaptable=%d\n",
tprintf("%s result: %s r=%.4g, c=%.4g, accepted=%d, adaptable=%d"
" xht=[%g,%g]\n",
result_type,
word->best_choice->unichar_string().string(),
word->best_choice->rating(),
word->best_choice->certainty(),
word->tess_accepted, word->tess_would_adapt);
word->tess_accepted, word->tess_would_adapt,
word->best_choice->min_x_height(),
word->best_choice->max_x_height());
}
if (word->tess_failed || !word->tess_accepted) {
// Try all the other languages to see if they are any better.
@ -846,6 +803,12 @@ void Tesseract::classify_word_and_language(WordRecognizer recognizer,
}
}
}
clock_t ocr_t = clock();
if (tessedit_timing_debug) {
tprintf("%s (ocr took %.2f sec)\n",
word->best_choice->unichar_string().string(),
static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
}
}
/**
@ -860,92 +823,25 @@ void Tesseract::classify_word_pass1(BLOCK* block, ROW *row, WERD_RES *word) {
cube_word_pass1(block, row, word);
return;
}
match_word_pass_n(1, word, row, block);
if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
word->tess_would_adapt = AdaptableWord(word);
bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST();
BOOL8 adapt_ok;
const char *rejmap;
inT16 index;
STRING mapstr = "";
check_debug_pt(word, 0);
if (word->SetupForTessRecognition(unicharset, this, BestPix(),
classify_bln_numeric_mode,
this->textord_use_cjk_fp_model,
row, block))
tess_segment_pass1(word, blob_choices);
if (!word->tess_failed) {
/*
The adaption step used to be here. It has been moved to after
make_reject_map so that we know whether the word will be accepted in the
first pass or not. This move will PREVENT adaption to words containing
double quotes because the word will not be identical to what tess thinks
its best choice is. (See CurrentBestChoiceIs in
stopper.cpp which is used by AdaptableWord in
adaptmatch.cpp)
*/
if (!word->word->flag(W_REP_CHAR)) {
// TODO(daria) delete these hacks when replaced by more generic code.
// Convert '' (double single) to " (single double).
word->fix_quotes(blob_choices);
if (tessedit_fix_hyphens) // turn -- to -
word->fix_hyphens(blob_choices);
word->tess_accepted = tess_acceptable_word(word->best_choice,
word->raw_choice);
word->tess_would_adapt = word->best_choice && word->raw_choice &&
AdaptableWord(word->rebuild_word,
*word->best_choice,
*word->raw_choice);
// Also sets word->done flag
make_reject_map(word, blob_choices, row, 1);
adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
if (adapt_ok || tessedit_tess_adapt_to_rejmap) {
if (!tessedit_tess_adapt_to_rejmap) {
rejmap = NULL;
} else {
ASSERT_HOST(word->reject_map.length() ==
word->best_choice->length());
for (index = 0; index < word->reject_map.length(); index++) {
if (adapt_ok || word->reject_map[index].accepted())
mapstr += '1';
else
mapstr += '0';
}
rejmap = mapstr.string();
}
// Send word to adaptive classifier for training.
word->BestChoiceToCorrectText();
set_word_fonts(word, blob_choices);
LearnWord(NULL, rejmap, word);
// Mark misadaptions if running blamer.
if (word->blamer_bundle != NULL &&
word->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH &&
!ChoiceIsCorrect(*word->uch_set, word->best_choice,
word->blamer_bundle->truth_text)) {
word->blamer_bundle->misadaption_debug ="misadapt to word (";
word->blamer_bundle->misadaption_debug +=
word->best_choice->permuter_name();
word->blamer_bundle->misadaption_debug += "): ";
word->blamer_bundle->FillDebugString(
"", word->best_choice, &(word->blamer_bundle->misadaption_debug));
if (wordrec_debug_blamer) {
tprintf("%s\n", word->blamer_bundle->misadaption_debug.string());
}
}
if (adapt_ok) {
// Send word to adaptive classifier for training.
word->BestChoiceToCorrectText();
LearnWord(NULL, word);
// Mark misadaptions if running blamer.
if (word->blamer_bundle != NULL) {
word->blamer_bundle->SetMisAdaptionDebug(word->best_choice,
wordrec_debug_blamer);
}
if (tessedit_enable_doc_dict)
tess_add_doc_word(word->best_choice);
}
}
// Save best choices in the WERD_CHOICE if needed
word->best_choice->set_blob_choices(blob_choices);
if (tessedit_enable_doc_dict && !word->IsAmbiguous())
tess_add_doc_word(word->best_choice);
}
}
// Helper to report the result of the xheight fix.
@ -976,7 +872,7 @@ bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
if (original_misfits == 0)
return false;
float new_x_ht = ComputeCompatibleXheight(word);
if (new_x_ht > 0.0f) {
if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
WERD_RES new_x_ht_word(word->word);
if (word->blamer_bundle != NULL) {
new_x_ht_word.blamer_bundle = new BlamerBundle();
@ -984,7 +880,7 @@ bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
}
new_x_ht_word.x_height = new_x_ht;
new_x_ht_word.caps_height = 0.0;
match_word_pass2(&new_x_ht_word, row, block);
match_word_pass_n(2, &new_x_ht_word, row, block);
if (!new_x_ht_word.tess_failed) {
int new_misfits = CountMisfitTops(&new_x_ht_word);
if (debug_x_ht_level >= 1) {
@ -1026,26 +922,24 @@ void Tesseract::classify_word_pass2(BLOCK* block, ROW *row, WERD_RES *word) {
tessedit_ocr_engine_mode != OEM_TESSERACT_CUBE_COMBINED)
return;
bool done_this_pass = false;
set_global_subloc_code(SUBLOC_NORM);
check_debug_pt(word, 30);
if (!word->done || tessedit_training_tess) {
word->caps_height = 0.0;
if (word->x_height == 0.0f)
word->x_height = row->x_height();
match_word_pass2(word, row, block);
done_this_pass = TRUE;
match_word_pass_n(2, word, row, block);
check_debug_pt(word, 40);
}
SubAndSuperscriptFix(word);
if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
bool accept_new_xht = false;
if (unicharset.top_bottom_useful() && unicharset.script_has_xheight()) {
if (unicharset.top_bottom_useful() && unicharset.script_has_xheight() &&
block->classify_rotation().y() == 0.0f) {
// Use the tops and bottoms since they are available.
accept_new_xht = TrainedXheightFix(word, block, row);
TrainedXheightFix(word, block, row);
}
if (accept_new_xht)
done_this_pass = true;
// Test for small caps. Word capheight must be close to block xheight,
// and word must contain no lower case letters, and at least one upper case.
double small_cap_xheight = block->x_height() * kXHeightCapRatio;
@ -1092,60 +986,38 @@ void Tesseract::classify_word_pass2(BLOCK* block, ROW *row, WERD_RES *word) {
* Baseline normalize the word and pass it to Tess.
*/
void Tesseract::match_word_pass2(WERD_RES *word, //word to do
ROW *row,
BLOCK* block) {
BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST();
void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word,
ROW *row, BLOCK* block) {
if (word->SetupForTessRecognition(unicharset, this, BestPix(),
classify_bln_numeric_mode,
this->textord_use_cjk_fp_model,
textord_use_cjk_fp_model,
poly_allow_detailed_fx,
row, block))
tess_segment_pass2(word, blob_choices);
tess_segment_pass_n(pass_n, word);
if (!word->tess_failed) {
if (!word->word->flag (W_REP_CHAR)) {
word->fix_quotes(blob_choices);
word->fix_quotes();
if (tessedit_fix_hyphens)
word->fix_hyphens(blob_choices);
word->fix_hyphens();
/* Don't trust fix_quotes! - though I think I've fixed the bug */
if (word->best_choice->length() != word->box_word->length() ||
word->best_choice->length() != blob_choices->length()) {
if (word->best_choice->length() != word->box_word->length()) {
tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
" #Blobs=%d; #Choices=%d\n",
" #Blobs=%d\n",
word->best_choice->debug_string().string(),
word->best_choice->length(),
word->box_word->length(), blob_choices->length());
word->box_word->length());
}
word->tess_accepted = tess_acceptable_word(word->best_choice,
word->raw_choice);
word->tess_accepted = tess_acceptable_word(word);
make_reject_map (word, blob_choices, row, 2);
// Also sets word->done flag
make_reject_map(word, row, pass_n);
}
}
set_word_fonts(word);
// Save best choices in the WERD_CHOICE if needed
word->best_choice->set_blob_choices(blob_choices);
set_word_fonts(word, blob_choices);
assert (word->raw_choice != NULL);
}
// Helper to find the BLOB_CHOICE in the bc_list that matches the given
// unichar_id, or NULL if there is no match.
static BLOB_CHOICE* FindMatchingChoice(UNICHAR_ID char_id,
BLOB_CHOICE_LIST* bc_list) {
// Find the corresponding best BLOB_CHOICE.
BLOB_CHOICE_IT choice_it(bc_list);
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
choice_it.forward()) {
BLOB_CHOICE* choice = choice_it.data();
if (choice->unichar_id() == char_id) {
return choice;
}
}
return NULL;
ASSERT_HOST(word->raw_choice != NULL);
}
// Helper to return the best rated BLOB_CHOICE in the whole word that matches
@ -1154,9 +1026,9 @@ static BLOB_CHOICE* FindBestMatchingChoice(UNICHAR_ID char_id,
WERD_RES* word_res) {
// Find the corresponding best BLOB_CHOICE from any position in the word_res.
BLOB_CHOICE* best_choice = NULL;
BLOB_CHOICE_LIST_C_IT bc_it(word_res->best_choice->blob_choices());
for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
BLOB_CHOICE* choice = FindMatchingChoice(char_id, bc_it.data());
for (int i = 0; i < word_res->best_choice->length(); ++i) {
BLOB_CHOICE* choice = FindMatchingChoice(char_id,
word_res->GetBlobChoices(i));
if (choice != NULL) {
if (best_choice == NULL || choice->rating() < best_choice->rating())
best_choice = choice;
@ -1171,12 +1043,11 @@ static BLOB_CHOICE* FindBestMatchingChoice(UNICHAR_ID char_id,
static void CorrectRepcharChoices(BLOB_CHOICE* blob_choice,
WERD_RES* word_res) {
WERD_CHOICE* word = word_res->best_choice;
BLOB_CHOICE_LIST_C_IT bc_it(word->blob_choices());
for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
for (int i = 0; i < word_res->best_choice->length(); ++i) {
BLOB_CHOICE* choice = FindMatchingChoice(blob_choice->unichar_id(),
bc_it.data());
word_res->GetBlobChoices(i));
if (choice == NULL) {
BLOB_CHOICE_IT choice_it(bc_it.data());
BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i));
choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice));
}
}
@ -1267,7 +1138,8 @@ void Tesseract::ExplodeRepeatedWord(BLOB_CHOICE* best_choice,
// Setup the single char WERD_RES
if (rep_word->SetupForTessRecognition(*word_res->uch_set, this, BestPix(),
false,
this->textord_use_cjk_fp_model,
textord_use_cjk_fp_model,
poly_allow_detailed_fx,
page_res_it->row()->row,
page_res_it->block()->block)) {
rep_word->CloneChoppedToRebuild();
@ -1494,16 +1366,14 @@ static void find_modal_font( //good chars in word
*
* Get the fonts for the word.
*/
void Tesseract::set_word_fonts(WERD_RES *word,
BLOB_CHOICE_LIST_CLIST *blob_choices) {
if (blob_choices == NULL) return;
void Tesseract::set_word_fonts(WERD_RES *word) {
// Don't try to set the word fonts for a cube word, as the configs
// will be meaningless.
if (word->chopped_word == NULL) return;
ASSERT_HOST(word->best_choice != NULL);
inT32 index; // char id index
// character iterator
BLOB_CHOICE_LIST_C_IT char_it = blob_choices;
BLOB_CHOICE_IT choice_it; // choice iterator
int fontinfo_size = get_fontinfo_table().size();
int fontset_size = get_fontset_table().size();
@ -1516,10 +1386,9 @@ void Tesseract::set_word_fonts(WERD_RES *word,
word->best_choice_fontinfo_ids.clear();
}
// Compute the modal font for the word
for (char_it.mark_cycle_pt(), index = 0;
!char_it.cycled_list(); ++index, char_it.forward()) {
for (index = 0; index < word->best_choice->length(); ++index) {
UNICHAR_ID word_ch_id = word->best_choice->unichar_id(index);
choice_it.set_to_list(char_it.data());
choice_it.set_to_list(word->GetBlobChoices(index));
if (tessedit_debug_fonts) {
tprintf("Examining fonts in %s\n",
word->best_choice->debug_string().string());

View File

@ -144,54 +144,6 @@ bool Tesseract::create_cube_box_word(Boxa *char_boxes,
return true;
}
/**********************************************************************
 * create_werd_choice
 *
 * Builds a WERD_CHOICE from cube's per-character samples. Every sample whose
 * label maps to a valid unichar id contributes one unichar (rating 0.0 and
 * the given certainty) to the word, and each unichar in the word gets its own
 * single-entry BLOB_CHOICE_LIST.
 *
 * char_samples:  cube character samples, num_chars entries.
 * num_chars:     number of entries in char_samples.
 * str:           cube's best string (not used by this function).
 * certainty:     certainty given to every unichar and to the whole word.
 * unicharset:    unicharset passed to the WERD_CHOICE constructor.
 * cube_char_set: converts each sample's string label to a unichar id.
 *
 * Returns a newly allocated WERD_CHOICE; the blob choice lists built here are
 * attached to it with set_blob_choices().
 **********************************************************************/
static WERD_CHOICE *create_werd_choice(
    CharSamp** char_samples,
    int num_chars,
    const char* str,
    float certainty,
    const UNICHARSET &unicharset,
    CharSet* cube_char_set
    ) {
  // Insert unichar ids into WERD_CHOICE
  WERD_CHOICE *werd_choice = new WERD_CHOICE(&unicharset, num_chars);
  // Validate the pointer before it is dereferenced for the first time.
  ASSERT_HOST(werd_choice != NULL);
  // within a word, cube recognizes the word in reading order.
  werd_choice->set_unichars_in_script_order(true);
  for (int i = 0; i < num_chars; ++i) {
    UNICHAR_ID uch_id =
        cube_char_set->UnicharID(char_samples[i]->StrLabel());
    // Labels that are unknown to the char set are skipped, so the word may
    // end up shorter than num_chars.
    if (uch_id != INVALID_UNICHAR_ID)
      werd_choice->append_unichar_id_space_allocated(
          uch_id, 1, 0.0, certainty);
  }

  BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST();
  BLOB_CHOICE_LIST_C_IT blob_choices_it;
  blob_choices_it.set_to_list(blob_choices);

  // One list per unichar actually appended (not per input sample).
  for (int i = 0; i < werd_choice->length(); ++i) {
    // Create new BLOB_CHOICE_LIST for this unichar
    BLOB_CHOICE_LIST *choices_list = new BLOB_CHOICE_LIST();
    BLOB_CHOICE_IT choices_list_it;
    choices_list_it.set_to_list(choices_list);
    // Add a single BLOB_CHOICE to the list
    BLOB_CHOICE *blob_choice = new BLOB_CHOICE(werd_choice->unichar_id(i),
                                               0.0, certainty, -1, -1, 0, 0, 0,
                                               false);
    choices_list_it.add_after_then_move(blob_choice);
    // Add list to the clist
    blob_choices_it.add_to_end(choices_list);
  }
  werd_choice->set_certainty(certainty);
  werd_choice->set_blob_choices(blob_choices);
  return werd_choice;
}
/**********************************************************************
* init_cube_objects
*
@ -419,29 +371,32 @@ bool Tesseract::cube_recognize(CubeObject *cube_obj, BLOCK* block,
return false;
}
// Create cube's best choice.
WERD_CHOICE* cube_werd_choice = create_werd_choice(
char_samples, num_chars, cube_best_str.c_str(), cube_certainty,
unicharset, cube_cntxt_->CharacterSet());
delete []char_samples;
// Fill tesseract result's fields with cube results
fill_werd_res(cube_box_word, cube_best_str.c_str(), word);
if (!cube_werd_choice) {
if (cube_debug_level > 0) {
tprintf("Cube WARNING (Tesseract::cube_recognize): Could not "
"create cube WERD_CHOICE\n");
}
word->SetupFake(unicharset);
return false;
// Create cube's best choice.
BLOB_CHOICE** choices = new BLOB_CHOICE*[num_chars];
for (int i = 0; i < num_chars; ++i) {
UNICHAR_ID uch_id =
cube_cntxt_->CharacterSet()->UnicharID(char_samples[i]->StrLabel());
choices[i] = new BLOB_CHOICE(uch_id, 0.0, cube_certainty, -1, -1,
0, 0, 0, 0, BCC_STATIC_CLASSIFIER);
}
word->FakeClassifyWord(num_chars, choices);
// within a word, cube recognizes the word in reading order.
word->best_choice->set_unichars_in_script_order(true);
delete [] choices;
delete [] char_samples;
// Some sanity checks
ASSERT_HOST(word->best_choice->length() == word->reject_map.length());
if (cube_debug_level || classify_debug_level) {
tprintf("Cube result: %s r=%g, c=%g\n",
cube_werd_choice->unichar_string().string(),
cube_werd_choice->rating(),
cube_werd_choice->certainty());
word->best_choice->unichar_string().string(),
word->best_choice->rating(),
word->best_choice->certainty());
}
// Fill tesseract result's fields with cube results
fill_werd_res(cube_box_word, cube_werd_choice, cube_best_str.c_str(), word);
return true;
}
@ -452,13 +407,8 @@ bool Tesseract::cube_recognize(CubeObject *cube_obj, BLOCK* block,
*
**********************************************************************/
void Tesseract::fill_werd_res(const BoxWord& cube_box_word,
WERD_CHOICE* cube_werd_choice,
const char* cube_best_str,
WERD_RES* tess_werd_res) {
// Replace tesseract results's best choice with cube's
tess_werd_res->best_choice = cube_werd_choice;
tess_werd_res->raw_choice = new WERD_CHOICE(*cube_werd_choice);
delete tess_werd_res->box_word;
tess_werd_res->box_word = new BoxWord(cube_box_word);
tess_werd_res->box_word->ClipToOriginalWord(tess_werd_res->denorm.block(),
@ -466,23 +416,13 @@ void Tesseract::fill_werd_res(const BoxWord& cube_box_word,
// Fill text and remaining fields
tess_werd_res->word->set_text(cube_best_str);
tess_werd_res->tess_failed = FALSE;
tess_werd_res->tess_accepted =
tess_acceptable_word(tess_werd_res->best_choice,
tess_werd_res->raw_choice);
tess_werd_res->tess_accepted = tess_acceptable_word(tess_werd_res);
// There is no output word, so we can't call AdaptableWord, but then I don't
// think we need to. Fudge the result with accepted.
tess_werd_res->tess_would_adapt = tess_werd_res->tess_accepted;
// Initialize the reject_map and set it to done, i.e., ignore all of
// tesseract's tests for rejection
tess_werd_res->reject_map.initialise(cube_werd_choice->length());
// Set word to done, i.e., ignore all of tesseract's tests for rejection
tess_werd_res->done = tess_werd_res->tess_accepted;
// Some sanity checks
ASSERT_HOST(tess_werd_res->best_choice->length() ==
tess_werd_res->best_choice->blob_choices()->length());
ASSERT_HOST(tess_werd_res->best_choice->length() ==
tess_werd_res->reject_map.length());
}
} // namespace tesseract

View File

@ -23,7 +23,6 @@
#include <ctype.h>
#include "docqual.h"
#include "tfacep.h"
#include "reject.h"
#include "tesscallback.h"
#include "tessvars.h"
@ -66,7 +65,7 @@ struct DocQualCallbacks {
*************************************************************************/
inT16 Tesseract::word_blob_quality(WERD_RES *word, ROW *row) {
if (word->bln_boxes == NULL ||
word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
return 0;
DocQualCallbacks cb(word);
@ -81,8 +80,8 @@ inT16 Tesseract::word_outline_errs(WERD_RES *word) {
inT16 err_count = 0;
if (word->rebuild_word != NULL) {
TBLOB* blob = word->rebuild_word->blobs;
for (; blob != NULL; blob = blob->next) {
for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
TBLOB* blob = word->rebuild_word->blobs[b];
err_count += count_outline_errs(word->best_choice->unichar_string()[i],
blob->NumOutlines());
i++;
@ -101,7 +100,7 @@ void Tesseract::word_char_quality(WERD_RES *word,
inT16 *match_count,
inT16 *accepted_match_count) {
if (word->bln_boxes == NULL ||
word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
return;
DocQualCallbacks cb(word);
@ -118,7 +117,7 @@ void Tesseract::word_char_quality(WERD_RES *word,
*************************************************************************/
void Tesseract::unrej_good_chs(WERD_RES *word, ROW *row) {
if (word->bln_boxes == NULL ||
word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
return;
DocQualCallbacks cb(word);
@ -990,7 +989,8 @@ BOOL8 Tesseract::noise_outlines(TWERD *word) {
inT16 max_dimension;
float small_limit = kBlnXHeight * crunch_small_outlines_size;
for (TBLOB* blob = word->blobs; blob != NULL; blob = blob->next) {
for (int b = 0; b < word->NumBlobs(); ++b) {
TBLOB* blob = word->blobs[b];
for (TESSLINE* ol = blob->outlines; ol != NULL; ol = ol->next) {
outline_count++;
box = ol->bounding_box();
@ -1002,6 +1002,7 @@ BOOL8 Tesseract::noise_outlines(TWERD *word) {
small_outline_count++;
}
}
return (small_outline_count >= outline_count);
return small_outline_count >= outline_count;
}
} // namespace tesseract

View File

@ -19,7 +19,7 @@
#ifdef _MSC_VER
#pragma warning(disable:4244) // Conversion warnings
#include "mathfix.h"
#include <mathfix.h>
#endif
#ifdef __MINGW32__
@ -173,21 +173,21 @@ void EquationDetect::IdentifySpecialText(
BLOB_CHOICE_LIST ratings_equ, ratings_lang;
C_BLOB* blob = blobnbox->cblob();
TBLOB* tblob = TBLOB::PolygonalCopy(blob);
// TODO(joeliu/rays) Fix this. We may have to normalize separately for
// each classifier here, as they may require different PolygonalCopy.
TBLOB* tblob = TBLOB::PolygonalCopy(false, blob);
const TBOX& box = tblob->bounding_box();
// Normalize the blob. Set the origin to the place we want to be the
// bottom-middle, and scaling is to make the height the x-height.
float scaling = static_cast<float>(kBlnXHeight) / box.height();
DENORM denorm;
float x_orig = (box.left() + box.right()) / 2.0f, y_orig = box.bottom();
denorm.SetupNormalization(NULL, NULL, NULL, NULL, NULL, 0,
x_orig, y_orig, scaling, scaling,
0.0f, static_cast<float>(kBlnBaselineOffset));
TBLOB* normed_blob = new TBLOB(*tblob);
normed_blob->Normalize(denorm);
equ_tesseract_->AdaptiveClassifier(normed_blob, denorm, &ratings_equ, NULL);
lang_tesseract_->AdaptiveClassifier(normed_blob, denorm, &ratings_lang, NULL);
normed_blob->Normalize(NULL, NULL, NULL, x_orig, y_orig, scaling, scaling,
0.0f, static_cast<float>(kBlnBaselineOffset),
false, NULL);
equ_tesseract_->AdaptiveClassifier(normed_blob, &ratings_equ, NULL);
lang_tesseract_->AdaptiveClassifier(normed_blob, &ratings_lang, NULL);
delete normed_blob;
delete tblob;

View File

@ -35,6 +35,7 @@
#define MAXSPACING 128 /*max expected spacing in pix */
namespace tesseract {
/**
* @name fix_fuzzy_spaces()
* Walk over the page finding sequences of words joined by fuzzy spaces. Extract
@ -183,7 +184,7 @@ void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
src_wd = src_it.data();
if (!src_wd->combination) {
new_wd = new WERD_RES(*src_wd);
new_wd = WERD_RES::deep_copy(src_wd);
new_wd->combination = FALSE;
new_wd->part_of_combo = FALSE;
new_it.add_after_then_move(new_wd);
@ -502,86 +503,6 @@ void Tesseract::dump_words(WERD_RES_LIST &perm, inT16 score,
}
}
/**
* @name uniformly_spaced()
* Decides whether the gaps between the blobs of a word look uniform.
* Return true if one of the following are true:
* - All inter-char gaps are the same width (or no gap was measured at all)
* - The largest gap is no larger than twice the mean/median of the others
*   (median when more than 2 other gaps were recorded, mean otherwise)
* - The largest gap is <= normalised_max_nonspace
* Gaps next to a character whose first byte is one of "`',.:; are ignored.
* **** REMEMBER - WE'RE NOW WORKING WITH A BLN WERD !!!
*
* @param word  word result; reads rebuild_word->blobs, best_choice and
*              denorm.row().
* @return TRUE if the spacing is accepted as uniform, FALSE otherwise.
*/
BOOL8 Tesseract::uniformly_spaced(WERD_RES *word) {
TBOX box;
// -MAX_INT16 is the "no previous blob yet" sentinel.
inT16 prev_right = -MAX_INT16;
inT16 gap;
// Largest gap seen so far and how many times that exact width occurred.
inT16 max_gap = -MAX_INT16;
inT16 max_gap_count = 0;
// Holds every measured gap that is smaller than the current max_gap.
STATS gap_stats(0, MAXSPACING);
BOOL8 result;
const ROW *row = word->denorm.row();
float max_non_space;
float normalised_max_nonspace;
// i counts blobs/unichars; offset is the position in unichar_string() of the
// current unichar (advanced by unichar_lengths()[i] after each blob).
inT16 i = 0;
inT16 offset = 0;
STRING punct_chars = "\"`',.:;";

for (TBLOB* blob = word->rebuild_word->blobs; blob != NULL;
blob = blob->next) {
box = blob->bounding_box();
// Measure a gap only when there is a previous blob and neither the previous
// character (first byte at offset - previous length) nor the current one
// (first byte at offset) is in punct_chars. The prev_right test comes first,
// so index i - 1 is never evaluated for the first blob.
if ((prev_right > -MAX_INT16) &&
(!punct_chars.contains(
word->best_choice->unichar_string()
[offset - word->best_choice->unichar_lengths()[i - 1]]) &&
!punct_chars.contains(
word->best_choice->unichar_string()[offset]))) {
gap = box.left() - prev_right;
if (gap < max_gap) {
gap_stats.add(gap, 1);
} else if (gap == max_gap) {
max_gap_count++;
} else {
// New maximum: the old maximum (if any) is demoted into the stats.
if (max_gap_count > 0)
gap_stats.add(max_gap, max_gap_count);
max_gap = gap;
max_gap_count = 1;
}
}
prev_right = box.right();
offset += word->best_choice->unichar_lengths()[i++];
}

// Threshold: a 1:3 blend of the row's space and kern values, rescaled by
// kBlnXHeight / row->x_height().
max_non_space = (row->space() + 3 * row->kern()) / 4;
normalised_max_nonspace = max_non_space * kBlnXHeight / row->x_height();

// gap_stats excludes the occurrences of max_gap itself, so a total of 0
// means every measured gap equalled max_gap (or nothing was measured).
result = (
gap_stats.get_total() == 0 ||
max_gap <= normalised_max_nonspace ||
(gap_stats.get_total() > 2 && max_gap <= 2 * gap_stats.median()) ||
(gap_stats.get_total() <= 2 && max_gap <= 2 * gap_stats.mean()));
#ifndef SECURE_NAMES
// Debug trace of the decision and the statistics it was based on.
if ((debug_fix_space_level > 1)) {
if (result) {
tprintf(
"ACCEPT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d "
"total=%d mean=%f median=%f\n",
word->best_choice->unichar_string().string(), normalised_max_nonspace,
max_gap, max_gap_count, gap_stats.get_total(), gap_stats.mean(),
gap_stats.median());
} else {
tprintf(
"REJECT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d "
"total=%d mean=%f median=%f\n",
word->best_choice->unichar_string().string(), normalised_max_nonspace,
max_gap, max_gap_count, gap_stats.get_total(), gap_stats.mean(),
gap_stats.median());
}
}
#endif

return result;
}
BOOL8 Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
if (word->done)
return TRUE;
@ -655,7 +576,6 @@ void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
WERD_RES_LIST current_perm;
WERD_RES_IT current_perm_it(&current_perm);
WERD_RES *old_word_res;
WERD_RES *new_word_res;
inT16 current_score;
BOOL8 improved = FALSE;
@ -663,12 +583,12 @@ void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
dump_words(best_perm, best_score, 1, improved);
new_word_res = new WERD_RES;
old_word_res = best_perm_it.data();
// Even deep_copy doesn't copy the underlying WERD unless its combination
// flag is true!
old_word_res->combination = TRUE; // Kludge to force deep copy
*new_word_res = *old_word_res; // deep copy
current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
old_word_res->combination = FALSE; // Undo kludge
current_perm_it.add_to_end(new_word_res);
break_noisiest_blob_word(current_perm);
@ -774,7 +694,6 @@ inT16 Tesseract::worst_noise_blob(WERD_RES *word_res,
if (word_res->rebuild_word == NULL)
return -1; // Can't handle cube words.
TBLOB* blob = word_res->rebuild_word->blobs;
// Normalised.
int blob_count = word_res->box_word->length();
ASSERT_HOST(blob_count <= 512);
@ -789,7 +708,8 @@ inT16 Tesseract::worst_noise_blob(WERD_RES *word_res,
word_res->best_choice->unichar_string().string());
#endif
for (i = 0; i < blob_count && blob != NULL; i++, blob = blob->next) {
for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
TBLOB* blob = word_res->rebuild_word->blobs[i];
if (word_res->reject_map[i].accepted())
noise_score[i] = non_noise_limit;
else
@ -929,10 +849,10 @@ inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
word->best_choice->permuter() == FREQ_DAWG_PERM ||
word->best_choice->permuter() == USER_DAWG_PERM ||
safe_dict_word(word) > 0) {
TBLOB* blob = word->rebuild_word->blobs;
int num_blobs = word->rebuild_word->NumBlobs();
UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
for (i = 0; i < word->best_choice->length() && blob != NULL;
++i, blob = blob->next) {
for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
TBLOB* blob = word->rebuild_word->blobs[i];
if (word->best_choice->unichar_id(i) == space ||
blob_noise_score(blob) < small_limit) {
score -= 1; // penalise possibly erroneous non-space

View File

@ -62,9 +62,9 @@ const int kMaxCharTopRange = 48;
// Returns the number of misfit blob tops in this word.
int Tesseract::CountMisfitTops(WERD_RES *word_res) {
int bad_blobs = 0;
TBLOB* blob = word_res->rebuild_word->blobs;
int blob_id = 0;
for (; blob != NULL; blob = blob->next, ++blob_id) {
int num_blobs = word_res->rebuild_word->NumBlobs();
for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
int top = blob->bounding_box().top();
@ -94,9 +94,9 @@ int Tesseract::CountMisfitTops(WERD_RES *word_res) {
// See comment above for overall algorithm.
float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res) {
STATS top_stats(0, MAX_UINT8);
TBLOB* blob = word_res->rebuild_word->blobs;
int blob_id = 0;
for (; blob != NULL; blob = blob->next, ++blob_id) {
int num_blobs = word_res->rebuild_word->NumBlobs();
for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
int top = blob->bounding_box().top();

View File

@ -33,7 +33,7 @@
#include <stdio.h>
#include <stdlib.h>
#include "errcode.h"
#include "globaloc.h" // For err_exit.
#define f(xc, yc) ((xc - factor*yc)*(xc - factor*yc))

View File

@ -132,23 +132,7 @@ float LTRResultIterator::Confidence(PageIteratorLevel level) const {
++certainty_count;
break;
case RIL_SYMBOL:
BLOB_CHOICE_LIST_CLIST* choices = best_choice->blob_choices();
if (choices != NULL) {
BLOB_CHOICE_LIST_C_IT blob_choices_it(choices);
for (int blob = 0; blob < blob_index_; ++blob)
blob_choices_it.forward();
BLOB_CHOICE_IT choice_it(blob_choices_it.data());
for (choice_it.mark_cycle_pt();
!choice_it.cycled_list();
choice_it.forward()) {
if (choice_it.data()->unichar_id() ==
best_choice->unichar_id(blob_index_))
break;
}
mean_certainty += choice_it.data()->certainty();
} else {
mean_certainty += best_choice->certainty();
}
mean_certainty += best_choice->certainty(blob_index_);
++certainty_count;
}
if (certainty_count > 0) {
@ -237,55 +221,83 @@ bool LTRResultIterator::WordIsNumeric() const {
// Returns true if the word contains blamer information.
bool LTRResultIterator::HasBlamerInfo() const {
return (it_->word() != NULL && it_->word()->blamer_bundle != NULL &&
(it_->word()->blamer_bundle->debug.length() > 0 ||
it_->word()->blamer_bundle->misadaption_debug.length() > 0));
return it_->word() != NULL && it_->word()->blamer_bundle != NULL &&
it_->word()->blamer_bundle->HasDebugInfo();
}
// Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
// of the current word.
void *LTRResultIterator::GetParamsTrainingBundle() const {
const void *LTRResultIterator::GetParamsTrainingBundle() const {
return (it_->word() != NULL && it_->word()->blamer_bundle != NULL) ?
&(it_->word()->blamer_bundle->params_training_bundle) : NULL;
&(it_->word()->blamer_bundle->params_training_bundle()) : NULL;
}
// Returns the pointer to the string with blamer information for this word.
// Assumes that the word's blamer_bundle is not NULL.
const char *LTRResultIterator::GetBlamerDebug() const {
return it_->word()->blamer_bundle->debug.string();
return it_->word()->blamer_bundle->debug().string();
}
// Returns the pointer to the string with misadaption information for this word.
// Assumes that the word's blamer_bundle is not NULL.
const char *LTRResultIterator::GetBlamerMisadaptionDebug() const {
return it_->word()->blamer_bundle->misadaption_debug.string();
return it_->word()->blamer_bundle->misadaption_debug().string();
}
// Reports whether the current word carries a recorded truth string.
// Returns false when the iterator has no current word, when the word has no
// blamer bundle, or when the bundle reports NoTruth().
bool LTRResultIterator::HasTruthString() const {
  // No current word: already at the end!
  if (it_->word() == NULL) return false;
  // Truth is available only with a blamer bundle that actually holds it.
  return it_->word()->blamer_bundle != NULL &&
         !it_->word()->blamer_bundle->NoTruth();
}
// Returns true if the given string is equivalent to the truth string for
// the current word.
bool LTRResultIterator::EquivalentToTruth(const char *str) const {
if (!HasTruthString()) return false;
ASSERT_HOST(it_->word()->uch_set != NULL);
WERD_CHOICE str_wd(str, *(it_->word()->uch_set));
return it_->word()->blamer_bundle->ChoiceIsCorrect(&str_wd);
}
// Returns the null terminated UTF-8 encoded truth string for the current word.
// Use delete [] to free after use.
char* LTRResultIterator::WordTruthUTF8Text() const {
if (it_->word() == NULL) return NULL; // Already at the end!
if (it_->word()->blamer_bundle == NULL ||
it_->word()->blamer_bundle->incorrect_result_reason == IRR_NO_TRUTH) {
return NULL; // no truth information for this word
}
const GenericVector<STRING> &truth_vec =
it_->word()->blamer_bundle->truth_text;
STRING truth_text;
for (int i = 0; i < truth_vec.size(); ++i) truth_text += truth_vec[i];
if (!HasTruthString()) return NULL;
STRING truth_text = it_->word()->blamer_bundle->TruthString();
int length = truth_text.length() + 1;
char* result = new char[length];
strncpy(result, truth_text.string(), length);
return result;
}
// Returns the null terminated UTF-8 encoded normalized OCR string for the
// current word. Use delete [] to free after use.
char* LTRResultIterator::WordNormedUTF8Text() const {
if (it_->word() == NULL) return NULL; // Already at the end!
STRING ocr_text;
WERD_CHOICE* best_choice = it_->word()->best_choice;
const UNICHARSET *unicharset = it_->word()->uch_set;
ASSERT_HOST(best_choice != NULL);
for (int i = 0; i < best_choice->length(); ++i) {
ocr_text += unicharset->get_normed_unichar(best_choice->unichar_id(i));
}
int length = ocr_text.length() + 1;
char* result = new char[length];
strncpy(result, ocr_text.string(), length);
return result;
}
// Returns a pointer to serialized choice lattice.
// Fills lattice_size with the number of bytes in lattice data.
const char *LTRResultIterator::WordLattice(int *lattice_size) const {
if (it_->word() == NULL) return NULL; // Already at the end!
if (it_->word()->blamer_bundle == NULL) return NULL;
*lattice_size = it_->word()->blamer_bundle->lattice_size;
return it_->word()->blamer_bundle->lattice_data;
*lattice_size = it_->word()->blamer_bundle->lattice_size();
return it_->word()->blamer_bundle->lattice_data();
}
// Returns true if the current symbol is a superscript.
@ -293,7 +305,8 @@ const char *LTRResultIterator::WordLattice(int *lattice_size) const {
// this will return the attributes of the first symbol in that word.
bool LTRResultIterator::SymbolIsSuperscript() const {
if (cblob_it_ == NULL && it_->word() != NULL)
return it_->word()->box_word->BlobPosition(blob_index_) == SP_SUPERSCRIPT;
return it_->word()->best_choice->BlobPosition(blob_index_) ==
SP_SUPERSCRIPT;
return false;
}
@ -302,7 +315,7 @@ bool LTRResultIterator::SymbolIsSuperscript() const {
// this will return the attributes of the first symbol in that word.
bool LTRResultIterator::SymbolIsSubscript() const {
if (cblob_it_ == NULL && it_->word() != NULL)
return it_->word()->box_word->BlobPosition(blob_index_) == SP_SUBSCRIPT;
return it_->word()->best_choice->BlobPosition(blob_index_) == SP_SUBSCRIPT;
return false;
}
@ -311,7 +324,7 @@ bool LTRResultIterator::SymbolIsSubscript() const {
// this will return the attributes of the first symbol in that word.
bool LTRResultIterator::SymbolIsDropcap() const {
if (cblob_it_ == NULL && it_->word() != NULL)
return it_->word()->box_word->BlobPosition(blob_index_) == SP_DROPCAP;
return it_->word()->best_choice->BlobPosition(blob_index_) == SP_DROPCAP;
return false;
}
@ -319,13 +332,11 @@ ChoiceIterator::ChoiceIterator(const LTRResultIterator& result_it) {
ASSERT_HOST(result_it.it_->word() != NULL);
word_res_ = result_it.it_->word();
PAGE_RES_IT res_it(*result_it.it_);
WERD_CHOICE* best_choice = word_res_->best_choice;
BLOB_CHOICE_LIST_CLIST* choices = best_choice->blob_choices();
BLOB_CHOICE_LIST* choices = NULL;
if (word_res_->ratings != NULL)
choices = word_res_->GetBlobChoices(result_it.blob_index_);
if (choices != NULL && !choices->empty()) {
BLOB_CHOICE_LIST_C_IT blob_choices_it(choices);
for (int blob = 0; blob < result_it.blob_index_; ++blob)
blob_choices_it.forward();
choice_it_ = new BLOB_CHOICE_IT(blob_choices_it.data());
choice_it_ = new BLOB_CHOICE_IT(choices);
choice_it_->mark_cycle_pt();
} else {
choice_it_ = NULL;

View File

@ -23,7 +23,7 @@
#include "platform.h"
#include "pageiterator.h"
#include "unicharset.h"
#include "unichar.h"
class BLOB_CHOICE_IT;
class WERD_RES;
@ -128,7 +128,7 @@ class TESS_API LTRResultIterator : public PageIterator {
// Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
// of the current word.
void *GetParamsTrainingBundle() const;
const void *GetParamsTrainingBundle() const;
// Returns a pointer to the string with blamer information for this word.
// Assumes that the word's blamer_bundle is not NULL.
@ -138,10 +138,21 @@ class TESS_API LTRResultIterator : public PageIterator {
// Assumes that the word's blamer_bundle is not NULL.
const char *GetBlamerMisadaptionDebug() const;
// Returns true if a truth string was recorded for the current word.
bool HasTruthString() const;
// Returns true if the given string is equivalent to the truth string for
// the current word.
bool EquivalentToTruth(const char *str) const;
// Returns a null terminated UTF-8 encoded truth string for the current word.
// Use delete [] to free after use.
char* WordTruthUTF8Text() const;
// Returns a null terminated UTF-8 encoded normalized OCR string for the
// current word. Use delete [] to free after use.
char* WordNormedUTF8Text() const;
// Returns a pointer to serialized choice lattice.
// Fills lattice_size with the number of bytes in lattice data.
const char *WordLattice(int *lattice_size) const;

View File

@ -29,14 +29,12 @@
#include <errno.h>
#endif
#include "helpers.h"
#include "tfacep.h"
#include "tessvars.h"
#include "control.h"
#include "secname.h"
#include "reject.h"
#include "docqual.h"
#include "output.h"
#include "bestfirst.h"
#include "globals.h"
#include "tesseractclass.h"
@ -242,13 +240,7 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it,
(word->best_choice->unichar_id(0) == space)) {
/* Prevent adjacent tilde across words - we know that adjacent tildes within
words have been removed */
word->best_choice->remove_unichar_id(0);
if (word->best_choice->blob_choices() != NULL) {
BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices());
if (!blob_choices_it.empty()) delete blob_choices_it.extract();
}
word->reject_map.remove_pos (0);
word->box_word->DeleteBox(0);
word->MergeAdjacentBlobs(0);
}
if (newline_type ||
(word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))

View File

@ -303,16 +303,22 @@ bool PageIterator::BoundingBoxInternal(PageIteratorLevel level,
bool PageIterator::BoundingBox(PageIteratorLevel level,
int* left, int* top,
int* right, int* bottom) const {
return BoundingBox(level, 0, left, top, right, bottom);
}
bool PageIterator::BoundingBox(PageIteratorLevel level, const int padding,
int* left, int* top,
int* right, int* bottom) const {
if (!BoundingBoxInternal(level, left, top, right, bottom))
return false;
// Convert to the coordinate system of the original image.
*left = ClipToRange(*left / scale_ + rect_left_,
*left = ClipToRange(*left / scale_ + rect_left_ - padding,
rect_left_, rect_left_ + rect_width_);
*top = ClipToRange(*top / scale_ + rect_top_,
*top = ClipToRange(*top / scale_ + rect_top_ - padding,
rect_top_, rect_top_ + rect_height_);
*right = ClipToRange((*right + scale_ - 1) / scale_ + rect_left_,
*right = ClipToRange((*right + scale_ - 1) / scale_ + rect_left_ + padding,
*left, rect_left_ + rect_width_);
*bottom = ClipToRange((*bottom + scale_ - 1) / scale_ + rect_top_,
*bottom = ClipToRange((*bottom + scale_ - 1) / scale_ + rect_top_ + padding,
*top, rect_top_ + rect_height_);
return true;
}
@ -546,14 +552,15 @@ void PageIterator::BeginWord(int offset) {
// Recognition has been done, so we are using the box_word, which
// is already baseline denormalized.
word_length_ = word_res->best_choice->length();
ASSERT_HOST(word_res->box_word != NULL);
if (word_res->box_word->length() != word_length_) {
tprintf("Corrupted word! best_choice[len=%d] = %s, box_word[len=%d]: ",
word_length_, word_res->best_choice->unichar_string().string(),
word_res->box_word->length());
word_res->box_word->bounding_box().print();
if (word_res->box_word != NULL) {
if (word_res->box_word->length() != word_length_) {
tprintf("Corrupted word! best_choice[len=%d] = %s, box_word[len=%d]: ",
word_length_, word_res->best_choice->unichar_string().string(),
word_res->box_word->length());
word_res->box_word->bounding_box().print();
}
ASSERT_HOST(word_res->box_word->length() == word_length_);
}
ASSERT_HOST(word_res->box_word->length() == word_length_);
word_ = NULL;
// We will be iterating the box_word.
if (cblob_it_ != NULL) {
@ -574,4 +581,13 @@ void PageIterator::BeginWord(int offset) {
}
}
bool PageIterator::SetWordBlamerBundle(BlamerBundle *blamer_bundle) {
  // Nothing to attach the bundle to when the iterator has no current word.
  if (it_->word() == NULL)
    return false;
  // Store the caller's bundle pointer on the current word and report success.
  it_->word()->blamer_bundle = blamer_bundle;
  return true;
}
} // namespace tesseract.

View File

@ -24,6 +24,7 @@
#include "publictypes.h"
#include "platform.h"
class BlamerBundle;
class C_BLOB_IT;
class PBLOB_IT;
class PAGE_RES;
@ -189,6 +190,8 @@ class TESS_API PageIterator {
*/
bool BoundingBox(PageIteratorLevel level,
int* left, int* top, int* right, int* bottom) const;
bool BoundingBox(PageIteratorLevel level, const int padding,
int* left, int* top, int* right, int* bottom) const;
/**
* Returns the bounding rectangle of the object in a coordinate system of the
* working image rectangle having its origin at (rect_left_, rect_top_) with
@ -282,6 +285,12 @@ class TESS_API PageIterator {
bool *is_crown,
int *first_line_indent) const;
// If the current WERD_RES (it_->word()) is not NULL, sets the BlamerBundle
// of the current word to the given pointer (takes ownership of the pointer)
// and returns true.
// Can only be used when iterating on the word level.
bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);
protected:
/**
* Sets up the internal data for iterating the blobs of a new word, then

View File

@ -16,8 +16,8 @@
** limitations under the License.
*
**********************************************************************/
#ifdef _MSC_VER
#define __func__ __FUNCTION__
#ifdef _MSC_VER
#define __func__ __FUNCTION__
#endif
#include <ctype.h>
@ -40,11 +40,6 @@
namespace tesseract {
// The tab vectors for a given line should be ignored if both its tab vectors
// are infrequent, specifically, if both tab vectors appear at most once per
// kStrayLinePer lines in a block.
const int kStrayLinePer = 6;
// Special "weak" ParagraphModels.
const ParagraphModel *kCrownLeft
= reinterpret_cast<ParagraphModel *>(0xDEAD111F);
@ -727,7 +722,15 @@ void CalculateTabStops(GenericVector<RowScratchRegisters> *rows,
// tab stop is frequent.
SimpleClusterer lefts(tolerance);
SimpleClusterer rights(tolerance);
int infrequent_enough_to_ignore = (row_end - row_start) / kStrayLinePer;
// Outlier elimination. We might want to switch this to test outlier-ness
// based on how strange a position an outlier is in instead of or in addition
// to how rare it is. These outliers get re-added if we end up having too
// few tab stops to work with, however.
int infrequent_enough_to_ignore = 0;
if (row_end - row_start >= 8) infrequent_enough_to_ignore = 1;
if (row_end - row_start >= 20) infrequent_enough_to_ignore = 2;
for (int i = row_start; i < row_end; i++) {
int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
@ -739,6 +742,54 @@ void CalculateTabStops(GenericVector<RowScratchRegisters> *rows,
}
lefts.GetClusters(left_tabs);
rights.GetClusters(right_tabs);
if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
(right_tabs->size() == 1 && left_tabs->size() >= 4)) {
// One side is really ragged, and the other only has one tab stop,
// so those "insignificant outliers" are probably important, actually.
// This often happens on a page of an index. Add back in the ones
// we omitted in the first pass.
for (int i = row_start; i < row_end; i++) {
int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {
lefts.Add((*rows)[i].lindent_);
rights.Add((*rows)[i].rindent_);
}
}
}
lefts.GetClusters(left_tabs);
rights.GetClusters(right_tabs);
// If one side is almost a two-indent aligned side, and the other clearly
// isn't, try to prune out the least frequent tab stop from that side.
if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
int to_prune = -1;
for (int i = left_tabs->size() - 1; i >= 0; i--) {
if (to_prune < 0 ||
(*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
to_prune = i;
}
}
if (to_prune >= 0 &&
(*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
left_tabs->remove(to_prune);
}
}
// Mirror image of the left-side pruning above: the right side is almost a
// two-indent aligned side (exactly 3 tab stops) while the left side is
// clearly ragged (4 or more tab stops). The second operand must test
// left_tabs; testing right_tabs against both == 3 and >= 4 could never be
// true, which made this whole pruning step unreachable.
if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
  // Find the least frequent right tab stop. Scanning from the back with a
  // strict '<' means that on ties the highest index is the one selected.
  int to_prune = -1;
  for (int i = right_tabs->size() - 1; i >= 0; i--) {
    if (to_prune < 0 ||
        (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
      to_prune = i;
    }
  }
  // Only drop it if it is rare enough to count as an outlier.
  if (to_prune >= 0 &&
      (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
    right_tabs->remove(to_prune);
  }
}
}
// Given a paragraph model mark rows[row_start, row_end) as said model
@ -816,6 +867,11 @@ struct GeometricClassifierState {
tolerance = InterwordSpace(*r, r_start, r_end);
CalculateTabStops(r, r_start, r_end, tolerance,
&left_tabs, &right_tabs);
if (debug_level >= 3) {
tprintf("Geometry: TabStop cluster tolerance = %d; "
"%d left tabs; %d right tabs\n",
tolerance, left_tabs.size(), right_tabs.size());
}
ltr = (*r)[r_start].ri_->ltr;
}
@ -1079,16 +1135,18 @@ void GeometricClassify(int debug_level,
firsts[s.AlignsideTabIndex(s.row_start)]++;
// For each line, if the first word would have fit on the previous
// line count it as a likely paragraph start line.
bool jam_packed = true;
for (int i = s.row_start + 1; i < s.row_end; i++) {
if (s.FirstWordWouldHaveFit(i - 1, i)) {
firsts[s.AlignsideTabIndex(i)]++;
jam_packed = false;
}
}
// Make an extra accounting for the last line of the paragraph just
// in case it's the only short line in the block. That is, take its
// first word as typical and see if this looks like the *last* line
// of a paragraph. If so, mark the *other* indent as probably a first.
if (s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
}
@ -1543,24 +1601,26 @@ void RecomputeMarginsAndClearHypotheses(
}
}
// Return the minimum inter-word space in rows[row_start, row_end).
// Return the median inter-word space in rows[row_start, row_end).
int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
int row_start, int row_end) {
if (row_end < row_start + 1) return 1;
bool legit = false;
int natural_space = rows[row_start].ri_->average_interword_space;
int word_height = (rows[row_start].ri_->lword_box.height() +
rows[row_end - 1].ri_->lword_box.height()) / 2;
int word_width = (rows[row_start].ri_->lword_box.width() +
rows[row_end - 1].ri_->lword_box.width()) / 2;
STATS spacing_widths(0, 5 + word_width);
for (int i = row_start; i < row_end; i++) {
if (rows[i].ri_->num_words > 1) {
if (!legit) {
natural_space = rows[i].ri_->average_interword_space;
legit = true;
} else {
if (rows[i].ri_->average_interword_space < natural_space)
natural_space = rows[i].ri_->average_interword_space;
}
spacing_widths.add(rows[i].ri_->average_interword_space, 1);
}
}
return natural_space;
int minimum_reasonable_space = word_height / 3;
if (minimum_reasonable_space < 2)
minimum_reasonable_space = 2;
int median = spacing_widths.median();
return (median > minimum_reasonable_space)
? median : minimum_reasonable_space;
}
// Return whether the first word on the after line can fit in the space at
@ -2274,6 +2334,7 @@ void DetectParagraphs(int debug_level,
GeometricClassify(debug_level, &rows,
leftovers[i].begin, leftovers[i].end, &theory);
}
// Undo any flush models for which there's little evidence.
DowngradeWeakestToCrowns(debug_level, &theory, &rows);

View File

@ -23,7 +23,6 @@
#include "control.h"
#include "cutil.h"
#include "host.h"
#include "permute.h"
#include "ratngs.h"
#include "reject.h"
#include "stopper.h"
@ -38,10 +37,6 @@ FILE *Tesseract::init_recog_training(const STRING &fname) {
if (tessedit_ambigs_training) {
tessedit_tess_adaption_mode.set_value(0); // turn off adaption
tessedit_enable_doc_dict.set_value(0); // turn off document dictionary
save_blob_choices.set_value(1); // save individual char choices
getDict().save_raw_choices.set_value(1); // save raw choices
getDict().permute_only_top.set_value(true); // use only top choice permuter
tessedit_ok_mode.set_value(0); // turn off context checking
// Explore all segmentations.
getDict().stopper_no_acceptable_choices.set_value(1);
}
@ -156,6 +151,47 @@ void Tesseract::recog_training_segmented(const STRING &fname,
examined_words, total_words);
}
// Helper prints the given set of blob choices.
static void PrintPath(int length, const BLOB_CHOICE** blob_choices,
const UNICHARSET& unicharset,
const char *label, FILE *output_file) {
float rating = 0.0f;
float certainty = 0.0f;
for (int i = 0; i < length; ++i) {
const BLOB_CHOICE* blob_choice = blob_choices[i];
fprintf(output_file, "%s",
unicharset.id_to_unichar(blob_choice->unichar_id()));
rating += blob_choice->rating();
if (certainty > blob_choice->certainty())
certainty = blob_choice->certainty();
}
fprintf(output_file, "\t%s\t%.4f\t%.4f\n",
label, rating, certainty);
}
// Helper recursively prints all paths through the ratings matrix, starting
// at column col.
static void PrintMatrixPaths(int col, int dim,
const MATRIX& ratings,
int length, const BLOB_CHOICE** blob_choices,
const UNICHARSET& unicharset,
const char *label, FILE *output_file) {
for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) {
if (ratings.get(col, row) != NOT_CLASSIFIED) {
BLOB_CHOICE_IT bc_it(ratings.get(col, row));
for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
blob_choices[length] = bc_it.data();
if (row + 1 < dim) {
PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices,
unicharset, label, output_file);
} else {
PrintPath(length + 1, blob_choices, unicharset, label, output_file);
}
}
}
}
}
// Runs classify_word_pass1() on the current word. Outputs Tesseract's
// raw choice as a result of the classification. For words labeled with a
// single unichar also outputs all alternatives from blob_choices of the
@ -165,44 +201,25 @@ void Tesseract::ambigs_classify_and_output(WERD_RES *werd_res,
BLOCK_RES *block_res,
const char *label,
FILE *output_file) {
int offset;
// Classify word.
fflush(stdout);
classify_word_pass1(block_res->block, row_res->row, werd_res);
WERD_CHOICE *best_choice = werd_res->best_choice;
ASSERT_HOST(best_choice != NULL);
ASSERT_HOST(best_choice->blob_choices() != NULL);
// Compute the number of unichars in the label.
int label_num_unichars = 0;
int step = 1; // should be non-zero on the first iteration
for (offset = 0; label[offset] != '\0' && step > 0;
step = werd_res->uch_set->step(label + offset),
offset += step, ++label_num_unichars);
if (step == 0) {
GenericVector<UNICHAR_ID> encoding;
if (!unicharset.encode_string(label, true, &encoding, NULL, NULL)) {
tprintf("Not outputting illegal unichar %s\n", label);
return;
}
// Output all classifier choices for the unigrams (1->1 classifications).
if (label_num_unichars == 1 && best_choice->blob_choices()->length() == 1) {
BLOB_CHOICE_LIST_C_IT outer_blob_choice_it;
outer_blob_choice_it.set_to_list(best_choice->blob_choices());
BLOB_CHOICE_IT blob_choice_it;
blob_choice_it.set_to_list(outer_blob_choice_it.data());
for (blob_choice_it.mark_cycle_pt();
!blob_choice_it.cycled_list();
blob_choice_it.forward()) {
BLOB_CHOICE *blob_choice = blob_choice_it.data();
if (blob_choice->unichar_id() != INVALID_UNICHAR_ID) {
fprintf(output_file, "%s\t%s\t%.4f\t%.4f\n",
unicharset.id_to_unichar(blob_choice->unichar_id()),
label, blob_choice->rating(), blob_choice->certainty());
}
}
}
// Output raw choices for many->many and 1->many classifications.
getDict().PrintAmbigAlternatives(output_file, label, label_num_unichars);
// Dump all paths through the ratings matrix (which is normally small).
int dim = werd_res->ratings->dimension();
const BLOB_CHOICE** blob_choices = new const BLOB_CHOICE*[dim];
PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices,
unicharset, label, output_file);
delete [] blob_choices;
}
} // namespace tesseract

View File

@ -30,13 +30,13 @@
#include "scanutils.h"
#include <ctype.h>
#include <string.h>
#include "memry.h"
#include "genericvector.h"
#include "reject.h"
#include "tfacep.h"
#include "imgs.h"
#include "control.h"
#include "docqual.h"
#include "secname.h"
#include "globaloc.h" // For err_exit.
#include "globals.h"
#include "helpers.h"
@ -58,126 +58,26 @@ CLISTIZEH (STRING) CLISTIZE (STRING)
*************************************************************************/
namespace tesseract {
void Tesseract::set_done( //set done flag
WERD_RES *word,
inT16 pass) {
/*
0: Original heuristic used in Tesseract and Ray's prototype Resaljet
*/
if (tessedit_ok_mode == 0) {
/* NOTE - done even if word contains some or all spaces !!! */
word->done = word->tess_accepted;
void Tesseract::set_done(WERD_RES *word, inT16 pass) {
word->done = word->tess_accepted &&
(strchr(word->best_choice->unichar_string().string(), ' ') == NULL);
bool word_is_ambig = word->best_choice->dangerous_ambig_found();
bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
word->best_choice->permuter() == FREQ_DAWG_PERM ||
word->best_choice->permuter() == USER_DAWG_PERM;
if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
one_ell_conflict(word, FALSE)) {
if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
word->done = FALSE;
}
/*
1: Reject words containing blanks and on pass 1 reject I/l/1 conflicts
*/
else if (tessedit_ok_mode == 1) {
word->done = word->tess_accepted &&
(strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
if (word->done && ((!word_from_dict &&
word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
word->done = FALSE;
}
/*
2: as 1 + only accept dict words or numerics in pass 1
*/
else if (tessedit_ok_mode == 2) {
word->done = word->tess_accepted &&
(strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
word->done = FALSE;
if (word->done &&
(pass == 1) &&
(word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
(word->best_choice->permuter () != FREQ_DAWG_PERM) &&
(word->best_choice->permuter () != USER_DAWG_PERM) &&
(word->best_choice->permuter () != NUMBER_PERM)) {
#ifndef SECURE_NAMES
if (tessedit_rejection_debug)
tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
word->best_choice->unichar_string().string ());
#endif
word->done = FALSE;
}
}
/*
3: as 2 + only accept dict words or numerics in pass 2 as well
*/
else if (tessedit_ok_mode == 3) {
word->done = word->tess_accepted &&
(strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
word->done = FALSE;
if (word->done &&
(word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
(word->best_choice->permuter () != FREQ_DAWG_PERM) &&
(word->best_choice->permuter () != USER_DAWG_PERM) &&
(word->best_choice->permuter () != NUMBER_PERM)) {
#ifndef SECURE_NAMES
if (tessedit_rejection_debug)
tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
word->best_choice->unichar_string().string ());
#endif
word->done = FALSE;
}
}
/*
4: as 2 + reject dict ambigs in pass 1
*/
else if (tessedit_ok_mode == 4) {
word->done = word->tess_accepted &&
(strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
word->done = FALSE;
if (word->done &&
(pass == 1) &&
(((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
(word->best_choice->permuter () != FREQ_DAWG_PERM) &&
(word->best_choice->permuter () != USER_DAWG_PERM) &&
(word->best_choice->permuter () != NUMBER_PERM)) ||
(test_ambig_word (word)))) {
#ifndef SECURE_NAMES
if (tessedit_rejection_debug)
tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
word->best_choice->unichar_string().string ());
#endif
word->done = FALSE;
}
}
/*
5: as 3 + reject dict ambigs in both passes
*/
else if (tessedit_ok_mode == 5) {
word->done = word->tess_accepted &&
(strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
word->done = FALSE;
if (word->done &&
(((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
(word->best_choice->permuter () != FREQ_DAWG_PERM) &&
(word->best_choice->permuter () != USER_DAWG_PERM) &&
(word->best_choice->permuter () != NUMBER_PERM)) ||
(test_ambig_word (word)))) {
#ifndef SECURE_NAMES
if (tessedit_rejection_debug)
tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
word->best_choice->unichar_string().string ());
#endif
word->done = FALSE;
}
}
else {
tprintf ("BAD tessedit_ok_mode\n");
err_exit();
if (tessedit_rejection_debug) {
tprintf("set_done(): done=%d\n", word->done);
word->best_choice->print("");
}
}
@ -189,12 +89,7 @@ void Tesseract::set_done( //set done flag
*
* Sets a reject map for the word.
*************************************************************************/
void Tesseract::make_reject_map( //make rej map for wd //detailed results
WERD_RES *word,
BLOB_CHOICE_LIST_CLIST *blob_choices,
ROW *row,
inT16 pass //1st or 2nd?
) {
void Tesseract::make_reject_map(WERD_RES *word, ROW *row, inT16 pass) {
int i;
int offset;
@ -208,7 +103,7 @@ void Tesseract::make_reject_map( //make rej map for wd //detailed results
*/
if (tessedit_reject_mode == 0) {
if (!word->done)
reject_poor_matches(word, blob_choices);
reject_poor_matches(word);
} else if (tessedit_reject_mode == 5) {
/*
5: Reject I/1/l from words where there is no strong contextual confirmation;
@ -313,45 +208,13 @@ void Tesseract::reject_I_1_L(WERD_RES *word) {
} // namespace tesseract
void reject_poor_matches( //detailed results
WERD_RES *word,
BLOB_CHOICE_LIST_CLIST *blob_choices) {
float threshold;
inT16 i = 0;
inT16 offset = 0;
//super iterator
BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
BLOB_CHOICE_IT choice_it; //real iterator
#ifndef SECURE_NAMES
if (strlen(word->best_choice->unichar_lengths().string()) !=
list_it.length()) {
tprintf
("ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n",
word->best_choice->unichar_string().string(),
strlen (word->best_choice->unichar_lengths().string()), list_it.length(),
word->box_word->length());
}
#endif
ASSERT_HOST (strlen (word->best_choice->unichar_lengths().string ()) ==
list_it.length ());
ASSERT_HOST(word->box_word->length() == list_it.length());
threshold = compute_reject_threshold (blob_choices);
for (list_it.mark_cycle_pt ();
!list_it.cycled_list (); list_it.forward (), i++,
offset += word->best_choice->unichar_lengths()[i]) {
/* NB - only compares the threshold against the TOP choice char in the
choices list for a blob !! - the selected one may be below the threshold
*/
choice_it.set_to_list (list_it.data ());
if ((word->best_choice->unichar_string()[offset] == ' ') ||
(choice_it.length () == 0))
//rej unrecognised blobs
word->reject_map[i].setrej_tess_failure ();
else if (choice_it.data ()->certainty () < threshold)
//rej poor score blob
word->reject_map[i].setrej_poor_match ();
void reject_poor_matches(WERD_RES *word) {
  // Marks reject_map entries for the word's best choice: spaces are flagged
  // as tess failures, and any other character whose certainty falls below
  // the computed threshold is flagged as a poor match.
  WERD_CHOICE* choice = word->best_choice;
  float threshold = compute_reject_threshold(choice);
  for (int pos = 0; pos < choice->length(); ++pos) {
    if (choice->unichar_id(pos) == UNICHAR_SPACE) {
      word->reject_map[pos].setrej_tess_failure();
      continue;
    }
    if (choice->certainty(pos) < threshold) {
      word->reject_map[pos].setrej_poor_match();
    }
  }
}
@ -364,52 +227,32 @@ void reject_poor_matches( //detailed results
* gap in the certainty value.
**********************************************************************/
float compute_reject_threshold( //compute threshold //detailed results
BLOB_CHOICE_LIST_CLIST *blob_choices) {
inT16 index; //to ratings
inT16 blob_count; //no of blobs in word
inT16 ok_blob_count = 0; //non TESS rej blobs in word
float *ratings; //array of confidences
float threshold; //rejection threshold
float bestgap; //biggest gap
float gapstart; //bottom of gap
//super iterator
BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
BLOB_CHOICE_IT choice_it; //real iterator
float compute_reject_threshold(WERD_CHOICE* word) {
float threshold; // rejection threshold
float bestgap = 0.0f; // biggest gap
float gapstart; // bottom of gap
// super iterator
BLOB_CHOICE_IT choice_it; // real iterator
blob_count = blob_choices->length ();
ratings = (float *) alloc_mem (blob_count * sizeof (float));
for (list_it.mark_cycle_pt (), index = 0;
!list_it.cycled_list (); list_it.forward (), index++) {
choice_it.set_to_list (list_it.data ());
if (choice_it.length () > 0) {
ratings[ok_blob_count] = choice_it.data ()->certainty ();
//get in an array
// tprintf("Rating[%d]=%c %g %g\n",
// index,choice_it.data()->char_class(),
// choice_it.data()->rating(),choice_it.data()->certainty());
ok_blob_count++;
}
int blob_count = word->length();
GenericVector<float> ratings;
ratings.init_to_size(blob_count, 0.0f);
for (int i = 0; i < blob_count; ++i) {
ratings[i] = word->certainty(i);
}
ASSERT_HOST (index == blob_count);
qsort (ratings, ok_blob_count, sizeof (float), sort_floats);
//sort them
bestgap = 0;
gapstart = ratings[0] - 1; //all reject if none better
if (ok_blob_count >= 3) {
for (index = 0; index < ok_blob_count - 1; index++) {
ratings.sort();
gapstart = ratings[0] - 1; // all reject if none better
if (blob_count >= 3) {
for (int index = 0; index < blob_count - 1; index++) {
if (ratings[index + 1] - ratings[index] > bestgap) {
bestgap = ratings[index + 1] - ratings[index];
//find biggest
// find biggest
gapstart = ratings[index];
}
}
}
threshold = gapstart + bestgap / 2;
// tprintf("First=%g, last=%g, gap=%g, threshold=%g\n",
// ratings[0],ratings[index],bestgap,threshold);
free_mem(ratings);
return threshold;
}
@ -680,21 +523,6 @@ BOOL8 Tesseract::word_contains_non_1_digit(const char *word,
return FALSE;
}
BOOL8 Tesseract::test_ambig_word( //test for ambiguity
WERD_RES *word) {
BOOL8 ambig = FALSE;
if ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
(word->best_choice->permuter () == FREQ_DAWG_PERM) ||
(word->best_choice->permuter () == USER_DAWG_PERM)) {
ambig = !getDict().NoDangerousAmbig(
word->best_choice, NULL, false, NULL, NULL);
}
return ambig;
}
/*************************************************************************
* dont_allow_1Il()
* Dont unreject LONE accepted 1Il conflict set chars
@ -786,10 +614,9 @@ inT16 Tesseract::safe_dict_word(const WERD_RES *werd_res) {
return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
}
// Note: After running this function word_res->best_choice->blob_choices()
// might not contain the right BLOB_CHOICE coresponding to each character
// in word_res->best_choice. However, the length of blob_choices and
// word_res->best_choice will remain the same.
// Note: After running this function word_res->ratings
// might not contain the right BLOB_CHOICE corresponding to each character
// in word_res->best_choice.
void Tesseract::flip_hyphens(WERD_RES *word_res) {
WERD_CHOICE *best_choice = word_res->best_choice;
int i;
@ -801,16 +628,16 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) {
if (tessedit_lower_flip_hyphen <= 1)
return;
TBLOB* blob = word_res->rebuild_word->blobs;
int num_blobs = word_res->rebuild_word->NumBlobs();
UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
bool modified = false;
for (i = 0; i < best_choice->length() && blob != NULL; ++i,
blob = blob->next) {
for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
TBLOB* blob = word_res->rebuild_word->blobs[i];
out_box = blob->bounding_box();
if (blob->next == NULL)
if (i + 1 == num_blobs)
next_left = 9999;
else
next_left = blob->next->bounding_box().left();
next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
// Dont touch small or touching blobs - it is too dangerous.
if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
(out_box.left() > prev_right) && (out_box.right() < next_left)) {
@ -846,10 +673,9 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) {
}
}
// Note: After running this function word_res->best_choice->blob_choices()
// might not contain the right BLOB_CHOICE coresponding to each character
// in word_res->best_choice. However, the length of blob_choices and
// word_res->best_choice will remain the same.
// Note: After running this function word_res->ratings
// might not contain the right BLOB_CHOICE corresponding to each character
// in word_res->best_choice.
void Tesseract::flip_0O(WERD_RES *word_res) {
WERD_CHOICE *best_choice = word_res->best_choice;
int i;
@ -858,9 +684,9 @@ void Tesseract::flip_0O(WERD_RES *word_res) {
if (!tessedit_flip_0O)
return;
TBLOB* blob = word_res->rebuild_word->blobs;
for (i = 0; i < best_choice->length() && blob != NULL; ++i,
blob = blob->next) {
int num_blobs = word_res->rebuild_word->NumBlobs();
for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
TBLOB* blob = word_res->rebuild_word->blobs[i];
if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
out_box = blob->bounding_box();

View File

@ -24,8 +24,8 @@
#include "pageres.h"
void reject_blanks(WERD_RES *word);
void reject_poor_matches(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices);
float compute_reject_threshold(BLOB_CHOICE_LIST_CLIST *blob_choices);
void reject_poor_matches(WERD_RES *word);
float compute_reject_threshold(WERD_CHOICE* word);
BOOL8 word_contains_non_1_digit(const char *word, const char *word_lengths);
void dont_allow_1Il(WERD_RES *word);
void flip_hyphens(WERD_RES *word);

View File

@ -24,8 +24,9 @@
#include "platform.h"
#include "ltrresultiterator.h"
#include "genericvector.h"
template <typename T> class GenericVector;
template <typename T> class GenericVectorEqEq;
class BLOB_CHOICE_IT;
class WERD_RES;
class STRING;

View File

@ -31,6 +31,7 @@
#include <stdlib.h>
#include <string.h>
#include "fileerr.h"
#include "globaloc.h" // For err_exit.
#include "tprintf.h"
#include "img.h"
#include "imgscale.h"

View File

@ -21,25 +21,22 @@
#pragma warning(disable:4244) // Conversion warnings
#endif
#include "tfacep.h"
#include "tfacepp.h"
#include "tessbox.h"
#include "mfoutline.h"
#include "tessbox.h"
#include "tesseractclass.h"
#define EXTERN
/**
* @name tess_segment_pass1
* @name tess_segment_pass_n
*
* Segment a word using the pass1 conditions of the tess segmenter.
* Segment a word using the pass_n conditions of the tess segmenter.
* @param pass_n pass number
* @param word word to do
* @param blob_choices list of blob lists
*/
namespace tesseract {
void Tesseract::tess_segment_pass1(WERD_RES *word,
BLOB_CHOICE_LIST_CLIST *blob_choices) {
void Tesseract::tess_segment_pass_n(int pass_n, WERD_RES *word) {
int saved_enable_assoc = 0;
int saved_chop_enable = 0;
@ -48,46 +45,17 @@ void Tesseract::tess_segment_pass1(WERD_RES *word,
saved_chop_enable = chop_enable;
wordrec_enable_assoc.set_value(0);
chop_enable.set_value(0);
if (word->word->flag(W_REP_CHAR))
getDict().permute_only_top.set_value(true);
}
set_pass1();
recog_word(word, blob_choices);
if (pass_n == 1)
set_pass1();
else
set_pass2();
recog_word(word);
if (word->best_choice == NULL)
word->SetupFake(*word->uch_set);
if (word->word->flag(W_DONT_CHOP)) {
wordrec_enable_assoc.set_value(saved_enable_assoc);
chop_enable.set_value(saved_chop_enable);
getDict().permute_only_top.set_value(false);
}
}
/**
* @name tess_segment_pass2
*
* Segment a word using the pass2 conditions of the tess segmenter.
* @param word word to do
* @param blob_choices list of blob lists
*/
void Tesseract::tess_segment_pass2(WERD_RES *word,
BLOB_CHOICE_LIST_CLIST *blob_choices) {
int saved_enable_assoc = 0;
int saved_chop_enable = 0;
if (word->word->flag(W_DONT_CHOP)) {
saved_enable_assoc = wordrec_enable_assoc;
saved_chop_enable = chop_enable;
wordrec_enable_assoc.set_value(0);
chop_enable.set_value(0);
if (word->word->flag(W_REP_CHAR))
getDict().permute_only_top.set_value(true);
}
set_pass2();
recog_word(word, blob_choices);
if (word->word->flag(W_DONT_CHOP)) {
wordrec_enable_assoc.set_value(saved_enable_assoc);
chop_enable.set_value(saved_chop_enable);
getDict().permute_only_top.set_value(false);
}
}
@ -98,10 +66,8 @@ void Tesseract::tess_segment_pass2(WERD_RES *word,
* @param word word result that is tested for acceptability
*/
BOOL8 Tesseract::tess_acceptable_word(
WERD_CHOICE *word_choice, // after context
WERD_CHOICE *raw_choice) { // before context
return getDict().AcceptableResult(*word_choice);
bool Tesseract::tess_acceptable_word(WERD_RES* word) {
  // The decision is delegated entirely to the dictionary's acceptability test.
  const bool acceptable = getDict().AcceptableResult(word);
  return acceptable;
}

View File

@ -17,30 +17,17 @@
*
**********************************************************************/
//#include <osfcn.h>
//#include <signal.h>
//#include <time.h>
//#include <unistd.h>
#include "tfacep.h" //must be before main.h
//#include "fileerr.h"
#include "stderr.h"
#include "basedir.h"
#include "tessvars.h"
//#include "debgwin.h"
//#include "epapdest.h"
#include "control.h"
#include "imgs.h"
#include "reject.h"
#include "pageres.h"
//#include "gpapdest.h"
#include "nwmain.h"
#include "pgedit.h"
#include "tprintf.h"
//#include "ipeerr.h"
//#include "restart.h"
#include "tessedit.h"
//#include "fontfind.h"
#include "permute.h"
#include "stopper.h"
#include "intmatcher.h"
#include "chop.h"
@ -190,9 +177,16 @@ bool Tesseract::init_tesseract_lang_data(
if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n");
right_to_left_ = unicharset.major_right_to_left();
// Setup initial unichar ambigs table and read universal ambigs.
UNICHARSET encoder_unicharset;
encoder_unicharset.CopyFrom(unicharset);
unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
if (!tessedit_ambigs_training &&
tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) {
unichar_ambigs.LoadUnicharAmbigs(
encoder_unicharset,
tessdata_manager.GetDataFilePtr(),
tessdata_manager.GetEndOffset(TESSDATA_AMBIGS),
ambigs_debug_level, use_ambigs_for_adaption, &unicharset);
@ -210,6 +204,23 @@ bool Tesseract::init_tesseract_lang_data(
tprintf("Loaded Cube with combiner\n");
}
// Init ParamsModel.
// Load pass1 and pass2 weights (for now these two sets are the same, but in
// the future separate sets of weights can be generated).
for (int p = ParamsModel::PTRAIN_PASS1;
p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {
language_model_->getParamsModel().SetPass(
static_cast<ParamsModel::PassEnum>(p));
if (tessdata_manager.SeekToStart(TESSDATA_PARAMS_MODEL)) {
if (!language_model_->getParamsModel().LoadFromFp(
lang.string(), tessdata_manager.GetDataFilePtr(),
tessdata_manager.GetEndOffset(TESSDATA_PARAMS_MODEL))) {
return false;
}
}
}
if (tessdata_manager_debug_level) language_model_->getParamsModel().Print();
return true;
}
@ -323,6 +334,30 @@ int Tesseract::init_tesseract(
tprintf("Tesseract couldn't load any languages!\n");
return -1; // Couldn't load any language!
}
if (!sub_langs_.empty()) {
// In multilingual mode word ratings have to be directly comparable,
// so use the same language model weights for all languages:
// use the primary language's params model if
// tessedit_use_primary_params_model is set,
// otherwise use default language model weights.
if (tessedit_use_primary_params_model) {
for (int s = 0; s < sub_langs_.size(); ++s) {
sub_langs_[s]->language_model_->getParamsModel().Copy(
this->language_model_->getParamsModel());
}
tprintf("Using params model of the primary language\n");
if (tessdata_manager_debug_level) {
this->language_model_->getParamsModel().Print();
}
} else {
this->language_model_->getParamsModel().Clear();
for (int s = 0; s < sub_langs_.size(); ++s) {
sub_langs_[s]->language_model_->getParamsModel().Clear();
}
tprintf("Using default language params\n");
}
}
SetupUniversalFontIds();
return 0;
}
@ -420,7 +455,7 @@ int Tesseract::init_tesseract_lm(const char *arg0,
if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
NULL, 0, NULL, NULL, false))
return -1;
getDict().Load();
getDict().Load(Dict::GlobalDawgCache());
tessdata_manager.End();
return 0;
}

View File

@ -221,16 +221,16 @@ bool TesseractCubeCombiner::ComputeCombinerFeatures(const string &tess_str,
features->push_back(cube_best_bigram_cost);
}
// case-insensitive string comparison, including punctuation
int compare_nocase_punc = CompareStrings(cube_best_str.c_str(),
tess_str.c_str(), false, true);
int compare_nocase_punc = CompareStrings(cube_best_str,
tess_str, false, true);
features->push_back(compare_nocase_punc == 0);
// case-sensitive string comparison, ignoring punctuation
int compare_case_nopunc = CompareStrings(cube_best_str.c_str(),
tess_str.c_str(), true, false);
int compare_case_nopunc = CompareStrings(cube_best_str,
tess_str, true, false);
features->push_back(compare_case_nopunc == 0);
// case-insensitive string comparison, ignoring punctuation
int compare_nocase_nopunc = CompareStrings(cube_best_str.c_str(),
tess_str.c_str(), true, true);
int compare_nocase_nopunc = CompareStrings(cube_best_str,
tess_str, true, true);
features->push_back(compare_nocase_nopunc == 0);
return true;
}

View File

@ -1,37 +0,0 @@
/**********************************************************************
* File: tfacep.h (Formerly tfacep.h)
* Description: Declarations of C functions and C owned data.
* Author: Ray Smith
* Created: Mon Apr 27 12:51:28 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef TFACEP_H
#define TFACEP_H
#include "host.h"
#include "blobs.h"
#include "tessarray.h"
#include "oldlist.h"
#include "permute.h"
#include "blobclass.h"
#include "stopper.h"
#include "associate.h"
#include "chop.h"
#include "structures.h"
typedef void (*TESS_TESTER) (TBLOB *, BOOL8, char *, inT32, LIST);
typedef LIST (*TESS_MATCHER) (TBLOB *, TBLOB *, TBLOB *);
#endif

View File

@ -25,19 +25,12 @@
#include <math.h>
#ifdef __UNIX__
#include <assert.h>
#endif
#include "errcode.h"
#include "ratngs.h"
#include "reject.h"
#include "werd.h"
#include "tfacep.h"
#include "tfacepp.h"
#include "tessvars.h"
#include "globals.h"
#include "reject.h"
#include "tesseractclass.h"
#include "blamer.h"
#include "errcode.h"
#include "ratngs.h"
#include "reject.h"
#include "tesseractclass.h"
#include "werd.h"
#define MAX_UNDIVIDED_LENGTH 24
@ -50,21 +43,30 @@
* Convert the output back to editor form.
**********************************************************************/
namespace tesseract {
void Tesseract::recog_word(WERD_RES *word,
BLOB_CHOICE_LIST_CLIST *blob_choices) {
ASSERT_HOST(word->chopped_word->blobs != NULL);
recog_word_recursive(word, blob_choices);
void Tesseract::recog_word(WERD_RES *word) {
if (wordrec_skip_no_truth_words && (word->blamer_bundle == NULL ||
word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH)) {
if (classify_debug_level) tprintf("No truth for word - skipping\n");
word->tess_failed = true;
return;
}
ASSERT_HOST(!word->chopped_word->blobs.empty());
recog_word_recursive(word);
word->SetupBoxWord();
if ((word->best_choice->length() != word->box_word->length()) ||
(word->best_choice->length() != blob_choices->length())) {
if (word->best_choice->length() != word->box_word->length()) {
tprintf("recog_word ASSERT FAIL String:\"%s\"; "
"Strlen=%d; #Blobs=%d; #Choices=%d\n",
"Strlen=%d; #Blobs=%d\n",
word->best_choice->debug_string().string(),
word->best_choice->length(), word->box_word->length(),
blob_choices->length());
word->best_choice->length(), word->box_word->length());
}
ASSERT_HOST(word->best_choice->length() == word->box_word->length());
ASSERT_HOST(word->best_choice->length() == blob_choices->length());
// Check that the ratings matrix size matches the sum of all the
// segmentation states.
if (!word->StatesAllValid()) {
tprintf("Not all words have valid states relative to ratings matrix!!");
word->DebugWordChoices(true, NULL);
ASSERT_HOST(word->StatesAllValid());
}
if (tessedit_override_permuter) {
/* Override the permuter type if a straight dictionary check disagrees. */
uinT8 perm_type = word->best_choice->permuter();
@ -105,31 +107,13 @@ void Tesseract::recog_word(WERD_RES *word,
* Convert the word to tess form and pass it to the tess segmenter.
* Convert the output back to editor form.
**********************************************************************/
void Tesseract::recog_word_recursive(WERD_RES *word,
BLOB_CHOICE_LIST_CLIST *blob_choices) {
void Tesseract::recog_word_recursive(WERD_RES *word) {
int word_length = word->chopped_word->NumBlobs(); // no of blobs
if (word_length > MAX_UNDIVIDED_LENGTH) {
return split_and_recog_word(word, blob_choices);
return split_and_recog_word(word);
}
int initial_blob_choice_len = blob_choices->length();
BLOB_CHOICE_LIST_VECTOR* tess_ratings = cc_recog(word);
// Put BLOB_CHOICE_LISTs from tess_ratings into blob_choices.
BLOB_CHOICE_LIST_C_IT blob_choices_it(blob_choices);
for (int i = 0; i < tess_ratings->length(); ++i) {
blob_choices_it.add_to_end(tess_ratings->get(i));
}
delete tess_ratings;
cc_recog(word);
word_length = word->rebuild_word->NumBlobs(); // No of blobs in output.
// Pad raw_choice with spaces if needed.
if (word->raw_choice->length() < word_length) {
UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
while (word->raw_choice->length() < word_length) {
word->raw_choice->append_unichar_id(space_id, 1, 0.0,
word->raw_choice->certainty());
}
}
// Do sanity checks and minor fixes on best_choice.
if (word->best_choice->length() > word_length) {
@ -141,21 +125,6 @@ void Tesseract::recog_word_recursive(WERD_RES *word,
tprintf("Word is at:");
word->word->bounding_box().print();
}
if (blob_choices->length() - initial_blob_choice_len != word_length) {
word->best_choice->make_bad(); // force rejection
tprintf("recog_word: Choices list len:%d; blob lists len:%d\n",
blob_choices->length(), word_length);
blob_choices_it.set_to_list(blob_choices); // list of lists
while (blob_choices->length() - initial_blob_choice_len < word_length) {
blob_choices_it.add_to_end(new BLOB_CHOICE_LIST()); // add a fake one
tprintf("recog_word: Added dummy choice list\n");
}
while (blob_choices->length() - initial_blob_choice_len > word_length) {
blob_choices_it.move_to_last(); // should never happen
delete blob_choices_it.extract();
tprintf("recog_word: Deleted choice list\n");
}
}
if (word->best_choice->length() < word_length) {
UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
while (word->best_choice->length() < word_length) {
@ -172,133 +141,134 @@ void Tesseract::recog_word_recursive(WERD_RES *word,
* Split the word into 2 smaller pieces at the largest gap.
* Recognize the pieces and stick the results back together.
**********************************************************************/
void Tesseract::split_and_recog_word(WERD_RES *word,
BLOB_CHOICE_LIST_CLIST *blob_choices) {
void Tesseract::split_and_recog_word(WERD_RES *word) {
// Find the biggest blob gap in the chopped_word.
int bestgap = -MAX_INT32;
TPOINT best_split_pt;
int split_index = 0;
TBLOB* best_end = NULL;
TBLOB* prev_blob = NULL;
for (TBLOB* blob = word->chopped_word->blobs; blob != NULL;
blob = blob->next) {
if (prev_blob != NULL) {
TBOX prev_box = prev_blob->bounding_box();
TBOX blob_box = blob->bounding_box();
int gap = blob_box.left() - prev_box.right();
if (gap > bestgap) {
bestgap = gap;
best_end = prev_blob;
best_split_pt.x = (prev_box.right() + blob_box.left()) / 2;
best_split_pt.y = (prev_box.top() + prev_box.bottom() +
blob_box.top() + blob_box.bottom()) / 4;
}
for (int b = 1; b < word->chopped_word->NumBlobs(); ++b) {
TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box();
TBOX blob_box = word->chopped_word->blobs[b]->bounding_box();
int gap = blob_box.left() - prev_box.right();
if (gap > bestgap) {
bestgap = gap;
split_index = b;
}
prev_blob = blob;
}
ASSERT_HOST(best_end != NULL);
ASSERT_HOST(best_end->next != NULL);
ASSERT_HOST(split_index > 0);
// Make a copy of the word to put the 2nd half in.
WERD_RES* word2 = new WERD_RES(*word);
// Blow away the copied chopped_word, as we want to work with the blobs
// from the input chopped_word so the seam_arrays can be merged.
delete word2->chopped_word;
word2->chopped_word = new TWERD;
word2->chopped_word->blobs = best_end->next;
best_end->next = NULL;
// Make a new seamarray on both words.
free_seam_list(word->seam_array);
word->seam_array = start_seam_list(word->chopped_word->blobs);
word2->seam_array = start_seam_list(word2->chopped_word->blobs);
BlamerBundle *orig_bb = word->blamer_bundle;
STRING blamer_debug;
// Try to adjust truth information.
if (orig_bb != NULL) {
// Find truth boxes that correspond to the split in the blobs.
int b;
int begin2_truth_index = -1;
if (orig_bb->incorrect_result_reason != IRR_NO_TRUTH &&
orig_bb->truth_has_char_boxes) {
int end1_x = best_end->bounding_box().right();
int begin2_x = word2->chopped_word->blobs->bounding_box().left();
blamer_debug = "Looking for truth split at";
blamer_debug.add_str_int(" end1_x ", end1_x);
blamer_debug.add_str_int(" begin2_x ", begin2_x);
blamer_debug += "\nnorm_truth_word boxes:\n";
if (orig_bb->norm_truth_word.length() > 1) {
orig_bb->norm_truth_word.BlobBox(0).append_debug(&blamer_debug);
for (b = 1; b < orig_bb->norm_truth_word.length(); ++b) {
orig_bb->norm_truth_word.BlobBox(b).append_debug(&blamer_debug);
if ((abs(end1_x - orig_bb->norm_truth_word.BlobBox(b-1).right()) <
orig_bb->norm_box_tolerance) &&
(abs(begin2_x - orig_bb->norm_truth_word.BlobBox(b).left()) <
orig_bb->norm_box_tolerance)) {
begin2_truth_index = b;
blamer_debug += "Split found\n";
break;
}
}
}
}
// Populate truth information in word and word2 with the first and second
// part of the original truth.
word->blamer_bundle = new BlamerBundle();
word2->blamer_bundle = new BlamerBundle();
if (begin2_truth_index > 0) {
word->blamer_bundle->truth_has_char_boxes = true;
word->blamer_bundle->norm_box_tolerance = orig_bb->norm_box_tolerance;
word2->blamer_bundle->truth_has_char_boxes = true;
word2->blamer_bundle->norm_box_tolerance = orig_bb->norm_box_tolerance;
BlamerBundle *curr_bb = word->blamer_bundle;
for (b = 0; b < orig_bb->norm_truth_word.length(); ++b) {
if (b == begin2_truth_index) curr_bb = word2->blamer_bundle;
curr_bb->norm_truth_word.InsertBox(
b, orig_bb->norm_truth_word.BlobBox(b));
curr_bb->truth_word.InsertBox(b, orig_bb->truth_word.BlobBox(b));
curr_bb->truth_text.push_back(orig_bb->truth_text[b]);
}
} else if (orig_bb->incorrect_result_reason == IRR_NO_TRUTH) {
word->blamer_bundle->incorrect_result_reason = IRR_NO_TRUTH;
word2->blamer_bundle->incorrect_result_reason = IRR_NO_TRUTH;
} else {
blamer_debug += "Truth split not found";
blamer_debug += orig_bb->truth_has_char_boxes ?
"\n" : " (no truth char boxes)\n";
word->blamer_bundle->SetBlame(IRR_NO_TRUTH_SPLIT, blamer_debug,
NULL, wordrec_debug_blamer);
word2->blamer_bundle->SetBlame(IRR_NO_TRUTH_SPLIT, blamer_debug,
NULL, wordrec_debug_blamer);
}
}
WERD_RES *word2 = NULL;
BlamerBundle *orig_bb = NULL;
split_word(word, split_index, &word2, &orig_bb);
// Recognize the first part of the word.
recog_word_recursive(word, blob_choices);
recog_word_recursive(word);
// Recognize the second part of the word.
recog_word_recursive(word2, blob_choices);
recog_word_recursive(word2);
join_words(word, word2, orig_bb);
}
/**********************************************************************
 * split_word
 *
 * Split a given WERD_RES in place into two smaller words for recognition.
 * split_pt is the index of the first blob to go in the second word, and
 * must satisfy 0 < split_pt < number of chopped blobs (asserted).
 * The underlying word is left alone, only the TWERD (and subsequent data)
 * are split up.
 *
 * On return:
 *   word                 holds blobs [0, split_pt) of the chopped word.
 *   *right_piece         is a newly allocated WERD_RES (copy-constructed
 *                        from word) holding blobs [split_pt, end).
 *   *orig_blamer_bundle  is a newly allocated COPY of word's blamer bundle
 *                        as it was on entry, or NULL if word had none. It
 *                        is handed to the caller.
 * If word had a blamer bundle, fresh bundles are created for both pieces
 * and filled in by BlamerBundle::SplitBundle.
 **********************************************************************/
void Tesseract::split_word(WERD_RES *word,
int split_pt,
WERD_RES **right_piece,
BlamerBundle **orig_blamer_bundle) const {
// Both pieces must end up with at least one blob.
ASSERT_HOST(split_pt >0 && split_pt < word->chopped_word->NumBlobs());
// Save a copy of the blamer bundle so we can try to reconstruct it below.
BlamerBundle *orig_bb =
word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : NULL;
WERD_RES *word2 = new WERD_RES(*word);
// blow away the copied chopped_word, as we want to work with
// the blobs from the input chopped_word so seam_arrays can be merged.
TWERD *chopped = word->chopped_word;
TWERD *chopped2 = new TWERD;
// Move the blob pointers from split_pt onwards into the new TWERD, then
// shorten the original vector so each pointer lives in exactly one word.
// NOTE(review): assumes truncate() only drops the entries and does not
// delete the pointed-to blobs - confirm against the blobs container.
chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
for (int i = split_pt; i < chopped->NumBlobs(); ++i) {
chopped2->blobs.push_back(chopped->blobs[i]);
}
chopped->blobs.truncate(split_pt);
// Detach the chopped words before ClearResults() and reattach afterwards;
// presumably this stops ClearResults() from destroying the blobs we are
// keeping - TODO confirm against WERD_RES::ClearResults.
word->chopped_word = NULL;
delete word2->chopped_word;
word2->chopped_word = NULL;
// Take the unicharset reference before the results are cleared.
const UNICHARSET &unicharset = *word->uch_set;
word->ClearResults();
word2->ClearResults();
word->chopped_word = chopped;
word2->chopped_word = chopped2;
word->SetupBasicsFromChoppedWord(unicharset);
word2->SetupBasicsFromChoppedWord(unicharset);
// Try to adjust the blamer bundle.
if (orig_bb != NULL) {
// TODO(rays) Looks like a leak to me.
// orig_bb should take, rather than copy.
word->blamer_bundle = new BlamerBundle();
word2->blamer_bundle = new BlamerBundle();
// The split position is described by the right edge of the last blob of
// the left piece and the left edge of the first blob of the right piece.
orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
word2->chopped_word->blobs[0]->bounding_box().left(),
wordrec_debug_blamer,
word->blamer_bundle, word2->blamer_bundle);
}
*right_piece = word2;
*orig_blamer_bundle = orig_bb;
}
/**********************************************************************
* join_words
*
* The opposite of split_word():
* join word2 (including any recognized data / seam array / etc)
* onto the right of word and then delete word2.
* Also, if orig_bb is provided, stitch it back into word.
**********************************************************************/
void Tesseract::join_words(WERD_RES *word,
WERD_RES *word2,
BlamerBundle *orig_bb) const {
TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
// Tack the word2 outputs onto the end of the word outputs.
// New blobs might have appeared on the end of word1.
for (best_end = word->chopped_word->blobs; best_end->next != NULL;
best_end = best_end->next);
best_end->next = word2->chopped_word->blobs;
TBLOB* blob;
for (blob = word->rebuild_word->blobs; blob->next != NULL; blob = blob->next);
blob->next = word2->rebuild_word->blobs;
word2->chopped_word->blobs = NULL;
word2->rebuild_word->blobs = NULL;
// Copy the seams onto the end of the word1 seam_array.
word->chopped_word->blobs += word2->chopped_word->blobs;
word->rebuild_word->blobs += word2->rebuild_word->blobs;
word2->chopped_word->blobs.clear();
word2->rebuild_word->blobs.clear();
TPOINT split_pt;
split_pt.x = (prev_box.right() + blob_box.left()) / 2;
split_pt.y = (prev_box.top() + prev_box.bottom() +
blob_box.top() + blob_box.bottom()) / 4;
// Move the word2 seams onto the end of the word1 seam_array.
// Since the seam list is one element short, an empty seam marking the
// end of the last blob in the first word is needed first.
word->seam_array = add_seam(word->seam_array,
new_seam(0.0, best_split_pt, NULL, NULL, NULL));
for (int i = 0; i < array_count(word2->seam_array); ++i) {
SEAM* seam = reinterpret_cast<SEAM*>(array_value(word2->seam_array, i));
array_value(word2->seam_array, i) = NULL;
word->seam_array = add_seam(word->seam_array, seam);
}
word->seam_array.push_back(new SEAM(0.0f, split_pt, NULL, NULL, NULL));
word->seam_array += word2->seam_array;
word2->seam_array.truncate(0);
// Fix widths and gaps.
word->blob_widths += word2->blob_widths;
word->blob_gaps += word2->blob_gaps;
// Fix the ratings matrix.
int rat1 = word->ratings->dimension();
int rat2 = word2->ratings->dimension();
word->ratings->AttachOnCorner(word2->ratings);
ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
word->best_state += word2->best_state;
// Append the word choices.
*word->best_choice += *word2->best_choice;
*word->raw_choice += *word2->raw_choice;
// How many alt choices from each should we try to get?
@ -306,70 +276,56 @@ void Tesseract::split_and_recog_word(WERD_RES *word,
// When do we start throwing away extra alt choices?
const int kTooManyAltChoices = 100;
if (word->alt_choices.size() > 0 && word2->alt_choices.size() > 0) {
// Construct the cartesian product of the alt choices of word(1) and word2.
int num_first_alt_choices = word->alt_choices.size();
// Nota Bene: For the main loop here, we leave in place word1-only
// alt_choices in
// word->alt_choices[0] .. word_alt_choices[num_first_alt_choices - 1]
// These will get fused with the best choices for word2 below.
for (int j = 1; j < word2->alt_choices.size() &&
(j <= kAltsPerPiece || word->alt_choices.size() < kTooManyAltChoices);
j++) {
for (int i = 0; i < num_first_alt_choices &&
(i <= kAltsPerPiece ||
word->alt_choices.size() < kTooManyAltChoices);
i++) {
WERD_CHOICE *wc = new WERD_CHOICE(*word->alt_choices[i]);
*wc += *word2->alt_choices[j];
word->alt_choices.push_back(wc);
word->alt_states.push_back(GenericVector<int>());
GenericVector<int> &alt_state = word->alt_states.back();
alt_state += word->alt_states[i];
alt_state += word2->alt_states[j];
}
}
// Now that we've filled in as many alternates as we want, paste the best
// choice for word2 onto the original word alt_choices.
for (int i = 0; i < num_first_alt_choices; i++) {
*word->alt_choices[i] += *word2->alt_choices[0];
word->alt_states[i] += word2->alt_states[0];
// Construct the cartesian product of the best_choices of word(1) and word2.
WERD_CHOICE_LIST joined_choices;
WERD_CHOICE_IT jc_it(&joined_choices);
WERD_CHOICE_IT bc1_it(&word->best_choices);
WERD_CHOICE_IT bc2_it(&word2->best_choices);
int num_word1_choices = word->best_choices.length();
int total_joined_choices = num_word1_choices;
// Nota Bene: For the main loop here, we operate only on the 2nd and greater
// word2 choices, and put them in the joined_choices list. The 1st word2
// choice gets added to the original word1 choices in-place after we have
// finished with them.
int bc2_index = 1;
for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
if (total_joined_choices >= kTooManyAltChoices &&
bc2_index > kAltsPerPiece)
break;
int bc1_index = 0;
for (bc1_it.move_to_first(); bc1_index < num_word1_choices;
++bc1_index, bc1_it.forward()) {
if (total_joined_choices >= kTooManyAltChoices &&
bc1_index > kAltsPerPiece)
break;
WERD_CHOICE *wc = new WERD_CHOICE(*bc1_it.data());
*wc += *bc2_it.data();
jc_it.add_after_then_move(wc);
++total_joined_choices;
}
}
// Now that we've filled in as many alternates as we want, paste the best
// choice for word2 onto the original word alt_choices.
bc1_it.move_to_first();
bc2_it.move_to_first();
for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
*bc1_it.data() += *bc2_it.data();
}
bc1_it.move_to_last();
bc1_it.add_list_after(&joined_choices);
// Restore the pointer to original blamer bundle and combine blamer
// information recorded in the splits.
if (orig_bb != NULL) {
IncorrectResultReason irr = orig_bb->incorrect_result_reason;
if (irr != IRR_NO_TRUTH_SPLIT) blamer_debug = "";
if (word->blamer_bundle->incorrect_result_reason != IRR_CORRECT &&
word->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH &&
word->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH_SPLIT) {
blamer_debug += "Blame from part 1: ";
blamer_debug += word->blamer_bundle->debug;
irr = word->blamer_bundle->incorrect_result_reason;
}
if (word2->blamer_bundle->incorrect_result_reason != IRR_CORRECT &&
word2->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH &&
word2->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH_SPLIT) {
blamer_debug += "Blame from part 2: ";
blamer_debug += word2->blamer_bundle->debug;
if (irr == IRR_CORRECT) {
irr = word2->blamer_bundle->incorrect_result_reason;
} else if (irr != word2->blamer_bundle->incorrect_result_reason) {
irr = IRR_UNKNOWN;
}
}
orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle,
wordrec_debug_blamer);
delete word->blamer_bundle;
word->blamer_bundle = orig_bb;
word->blamer_bundle->incorrect_result_reason = irr;
if (irr != IRR_CORRECT && irr != IRR_NO_TRUTH) {
word->blamer_bundle->SetBlame(irr, blamer_debug, NULL,
wordrec_debug_blamer);
}
}
word->SetupBoxWord();
word->reject_map.initialise(word->box_word->length());
delete word2;
}
} // namespace tesseract

View File

@ -1,41 +0,0 @@
/**********************************************************************
* File: tfacepp.h (Formerly tface++.h)
* Description: C++ side of the C/C++ Tess/Editor interface.
* Author: Ray Smith
* Created: Thu Apr 23 15:39:23 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef TFACEPP_H
#define TFACEPP_H
#include "ratngs.h"
#include "blobs.h"
#include "tesseractclass.h"
void call_tester( //call a tester
TBLOB *tessblob, //blob to test
BOOL8 correct_blob, //true if good
char *text, //source text
inT32 count, //chars in text
LIST result //output of matcher
);
void call_train_tester( //call a tester
TBLOB *tessblob, //blob to test
BOOL8 correct_blob, //true if good
char *text, //source text
inT32 count, //chars in text
LIST result //output of matcher
);
#endif

View File

@ -27,7 +27,7 @@
**********************************************************************/
WERD *make_pseudo_word(PAGE_RES* page_res, // Blocks to check.
TBOX &selection_box,
const TBOX &selection_box,
BLOCK *&pseudo_block,
ROW *&pseudo_row) { // Row of selection.
PAGE_RES_IT pr_it(page_res);

View File

@ -23,7 +23,7 @@
#include "pageres.h"
WERD *make_pseudo_word(PAGE_RES* page_res, // blocks to check
TBOX &selection_box,
const TBOX &selection_box,
BLOCK *&pseudo_block,
ROW *&pseudo_row);

View File

@ -9,7 +9,7 @@ endif
include_HEADERS = publictypes.h
noinst_HEADERS = \
blckerr.h blobbox.h blobs.h blread.h boxread.h boxword.h ccstruct.h coutln.h crakedge.h \
blamer.h blckerr.h blobbox.h blobs.h blread.h boxread.h boxword.h ccstruct.h coutln.h crakedge.h \
detlinefit.h dppoint.h fontinfo.h genblob.h hpdsizes.h ipoints.h \
linlsq.h matrix.h mod128.h normalis.h \
ocrblock.h ocrpara.h ocrrow.h otsuthr.h \
@ -31,12 +31,12 @@ libtesseract_ccstruct_la_LIBADD = \
endif
libtesseract_ccstruct_la_SOURCES = \
blobbox.cpp blobs.cpp blread.cpp boxread.cpp boxword.cpp ccstruct.cpp coutln.cpp \
blamer.cpp blobbox.cpp blobs.cpp blread.cpp boxread.cpp boxword.cpp ccstruct.cpp coutln.cpp \
detlinefit.cpp dppoint.cpp fontinfo.cpp genblob.cpp \
linlsq.cpp matrix.cpp mod128.cpp normalis.cpp \
ocrblock.cpp ocrpara.cpp ocrrow.cpp otsuthr.cpp \
pageres.cpp pdblock.cpp points.cpp polyaprx.cpp polyblk.cpp \
publictypes.cpp \
params_training_featdef.cpp publictypes.cpp \
quadlsq.cpp quadratc.cpp quspline.cpp ratngs.cpp rect.cpp rejctmap.cpp \
seam.cpp split.cpp statistc.cpp stepblob.cpp \
vecfuncs.cpp werd.cpp

587
ccstruct/blamer.cpp Normal file
View File

@ -0,0 +1,587 @@
///////////////////////////////////////////////////////////////////////
// File: blamer.cpp
// Description: Module allowing precise error causes to be allocated.
// Author: Rike Antonova
// Refactored: Ray Smith
// Created: Mon Feb 04 14:37:01 PST 2013
//
// (C) Copyright 2013, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "blamer.h"
#include "blobs.h"
#include "matrix.h"
#include "normalis.h"
#include "pageres.h"
// Names for each value of IncorrectResultReason enum. Keep in sync.
// Each constant is a short tag for one blame reason; the table below is
// indexed directly by the enum value (see IncorrectReasonName and
// IncorrectReason), so the order of entries must match the order of the
// enum's values.
const char kBlameCorrect[] = "corr";
const char kBlameClassifier[] = "cl";
const char kBlameChopper[] = "chop";
const char kBlameClassLMTradeoff[] = "cl/LM";
const char kBlamePageLayout[] = "pglt";
const char kBlameSegsearchHeur[] = "ss_heur";
const char kBlameSegsearchPP[] = "ss_pp";
const char kBlameClassOldLMTradeoff[] = "cl/old_LM";
const char kBlameAdaption[] = "adapt";
const char kBlameNoTruthSplit[] = "no_tr_spl";
const char kBlameNoTruth[] = "no_tr";
const char kBlameUnknown[] = "unkn";
// Lookup table from IncorrectResultReason value to its short name.
const char * const kIncorrectResultReasonNames[] = {
kBlameCorrect,
kBlameClassifier,
kBlameChopper,
kBlameClassLMTradeoff,
kBlamePageLayout,
kBlameSegsearchHeur,
kBlameSegsearchPP,
kBlameClassOldLMTradeoff,
kBlameAdaption,
kBlameNoTruthSplit,
kBlameNoTruth,
kBlameUnknown
};
// Returns the short name for the given blame reason, looked up in
// kIncorrectResultReasonNames using the enum value as the index.
// No range check is made on irr.
const char *BlamerBundle::IncorrectReasonName(IncorrectResultReason irr) {
  const char *const reason_name = kIncorrectResultReasonNames[irr];
  return reason_name;
}
// Returns the short name of the blame reason currently recorded in this
// bundle (incorrect_result_reason_).
const char *BlamerBundle::IncorrectReason() const {
  const char *const reason_name =
      kIncorrectResultReasonNames[incorrect_result_reason_];
  return reason_name;
}
// Functions to setup the blamer.
// Whole word string, whole word bounding box.
void BlamerBundle::SetWordTruth(const UNICHARSET& unicharset,
const char* truth_str, const TBOX& word_box) {
truth_word_.InsertBox(0, word_box);
truth_has_char_boxes_ = false;
// Encode the string as UNICHAR_IDs.
GenericVector<UNICHAR_ID> encoding;
GenericVector<char> lengths;
unicharset.encode_string(truth_str, false, &encoding, &lengths, NULL);
int total_length = 0;
for (int i = 0; i < encoding.size(); total_length += lengths[i++]) {
STRING uch(truth_str + total_length);
uch.truncate_at(lengths[i] - total_length);
UNICHAR_ID id = encoding[i];
if (id != INVALID_UNICHAR_ID) uch = unicharset.get_normed_unichar(id);
truth_text_.push_back(uch);
}
}
// Single "character" string, "character" bounding box.
// May be called multiple times to indicate the characters in a word.
void BlamerBundle::SetSymbolTruth(const UNICHARSET& unicharset,
const char* char_str, const TBOX& char_box) {
STRING symbol_str(char_str);
UNICHAR_ID id = unicharset.unichar_to_id(char_str);
if (id != INVALID_UNICHAR_ID) {
STRING normed_uch(unicharset.get_normed_unichar(id));
if (normed_uch.length() > 0) symbol_str = normed_uch;
}
int length = truth_word_.length();
truth_text_.push_back(symbol_str);
truth_word_.InsertBox(length, char_box);
if (length == 0)
truth_has_char_boxes_ = true;
else if (truth_word_.BlobBox(length - 1) == char_box)
truth_has_char_boxes_ = false;
}
// Flags the truth as unusable (for example because the truth text contains
// reject characters): the per-character boxes are marked unavailable and
// the blame reason is set to IRR_NO_TRUTH.
void BlamerBundle::SetRejectedTruth() {
  truth_has_char_boxes_ = false;
  incorrect_result_reason_ = IRR_NO_TRUTH;
}
// Returns true if the provided word_choice is correct.
bool BlamerBundle::ChoiceIsCorrect(const WERD_CHOICE* word_choice) const {
if (word_choice == NULL) return false;
const UNICHARSET* uni_set = word_choice->unicharset();
STRING normed_choice_str;
for (int i = 0; i < word_choice->length(); ++i) {
normed_choice_str +=
uni_set->get_normed_unichar(word_choice->unichar_id(i));
}
STRING truth_str = TruthString();
return truth_str == normed_choice_str;
}
void BlamerBundle::FillDebugString(const STRING &msg,
const WERD_CHOICE *choice,
STRING *debug) {
(*debug) += "Truth ";
for (int i = 0; i < this->truth_text_.length(); ++i) {
(*debug) += this->truth_text_[i];
}
if (!this->truth_has_char_boxes_) (*debug) += " (no char boxes)";
if (choice != NULL) {
(*debug) += " Choice ";
STRING choice_str;
choice->string_and_lengths(&choice_str, NULL);
(*debug) += choice_str;
}
if (msg.length() > 0) {
(*debug) += "\n";
(*debug) += msg;
}
(*debug) += "\n";
}
// Sets up the norm_truth_word from truth_word using the given DENORM.
void BlamerBundle::SetupNormTruthWord(const DENORM& denorm) {
// TODO(rays) Is this the last use of denorm in WERD_RES and can it go?
norm_box_tolerance_ = kBlamerBoxTolerance * denorm.x_scale();
TPOINT topleft;
TPOINT botright;
TPOINT norm_topleft;
TPOINT norm_botright;
for (int b = 0; b < truth_word_.length(); ++b) {
const TBOX &box = truth_word_.BlobBox(b);
topleft.x = box.left();
topleft.y = box.top();
botright.x = box.right();
botright.y = box.bottom();
denorm.NormTransform(NULL, topleft, &norm_topleft);
denorm.NormTransform(NULL, botright, &norm_botright);
TBOX norm_box(norm_topleft.x, norm_botright.y,
norm_botright.x, norm_topleft.y);
norm_truth_word_.InsertBox(b, norm_box);
}
}
// Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
// bundles) where the right edge/ of the left-hand word is word1_right,
// and the left edge of the right-hand word is word2_left.
void BlamerBundle::SplitBundle(int word1_right, int word2_left, bool debug,
BlamerBundle* bundle1,
BlamerBundle* bundle2) const {
STRING debug_str;
// Find truth boxes that correspond to the split in the blobs.
int b;
int begin2_truth_index = -1;
if (incorrect_result_reason_ != IRR_NO_TRUTH &&
truth_has_char_boxes_) {
debug_str = "Looking for truth split at";
debug_str.add_str_int(" end1_x ", word1_right);
debug_str.add_str_int(" begin2_x ", word2_left);
debug_str += "\nnorm_truth_word boxes:\n";
if (norm_truth_word_.length() > 1) {
norm_truth_word_.BlobBox(0).print_to_str(&debug_str);
for (b = 1; b < norm_truth_word_.length(); ++b) {
norm_truth_word_.BlobBox(b).print_to_str(&debug_str);
if ((abs(word1_right - norm_truth_word_.BlobBox(b - 1).right()) <
norm_box_tolerance_) &&
(abs(word2_left - norm_truth_word_.BlobBox(b).left()) <
norm_box_tolerance_)) {
begin2_truth_index = b;
debug_str += "Split found";
break;
}
}
debug_str += '\n';
}
}
// Populate truth information in word and word2 with the first and second
// part of the original truth.
if (begin2_truth_index > 0) {
bundle1->truth_has_char_boxes_ = true;
bundle1->norm_box_tolerance_ = norm_box_tolerance_;
bundle2->truth_has_char_boxes_ = true;
bundle2->norm_box_tolerance_ = norm_box_tolerance_;
BlamerBundle *curr_bb = bundle1;
for (b = 0; b < norm_truth_word_.length(); ++b) {
if (b == begin2_truth_index) curr_bb = bundle2;
curr_bb->norm_truth_word_.InsertBox(b, norm_truth_word_.BlobBox(b));
curr_bb->truth_word_.InsertBox(b, truth_word_.BlobBox(b));
curr_bb->truth_text_.push_back(truth_text_[b]);
}
} else if (incorrect_result_reason_ == IRR_NO_TRUTH) {
bundle1->incorrect_result_reason_ = IRR_NO_TRUTH;
bundle2->incorrect_result_reason_ = IRR_NO_TRUTH;
} else {
debug_str += "Truth split not found";
debug_str += truth_has_char_boxes_ ?
"\n" : " (no truth char boxes)\n";
bundle1->SetBlame(IRR_NO_TRUTH_SPLIT, debug_str, NULL, debug);
bundle2->SetBlame(IRR_NO_TRUTH_SPLIT, debug_str, NULL, debug);
}
}
// "Joins" the blames from bundle1 and bundle2 into *this.
void BlamerBundle::JoinBlames(const BlamerBundle& bundle1,
const BlamerBundle& bundle2, bool debug) {
STRING debug_str;
IncorrectResultReason irr = incorrect_result_reason_;
if (irr != IRR_NO_TRUTH_SPLIT) debug_str = "";
if (bundle1.incorrect_result_reason_ != IRR_CORRECT &&
bundle1.incorrect_result_reason_ != IRR_NO_TRUTH &&
bundle1.incorrect_result_reason_ != IRR_NO_TRUTH_SPLIT) {
debug_str += "Blame from part 1: ";
debug_str += bundle1.debug_;
irr = bundle1.incorrect_result_reason_;
}
if (bundle2.incorrect_result_reason_ != IRR_CORRECT &&
bundle2.incorrect_result_reason_ != IRR_NO_TRUTH &&
bundle2.incorrect_result_reason_ != IRR_NO_TRUTH_SPLIT) {
debug_str += "Blame from part 2: ";
debug_str += bundle2.debug_;
if (irr == IRR_CORRECT) {
irr = bundle2.incorrect_result_reason_;
} else if (irr != bundle2.incorrect_result_reason_) {
irr = IRR_UNKNOWN;
}
}
incorrect_result_reason_ = irr;
if (irr != IRR_CORRECT && irr != IRR_NO_TRUTH) {
SetBlame(irr, debug_str, NULL, debug);
}
}
// If a blob with the same bounding box as one of the truth character
// bounding boxes is not classified as the corresponding truth character
// blames character classifier for incorrect answer.
void BlamerBundle::BlameClassifier(const UNICHARSET& unicharset,
const TBOX& blob_box,
const BLOB_CHOICE_LIST& choices,
bool debug) {
if (!truth_has_char_boxes_ ||
incorrect_result_reason_ != IRR_CORRECT)
return; // Nothing to do here.
for (int b = 0; b < norm_truth_word_.length(); ++b) {
const TBOX &truth_box = norm_truth_word_.BlobBox(b);
// Note that we are more strict on the bounding box boundaries here
// than in other places (chopper, segmentation search), since we do
// not have the ability to check the previous and next bounding box.
if (blob_box.x_almost_equal(truth_box, norm_box_tolerance_/2)) {
bool found = false;
bool incorrect_adapted = false;
UNICHAR_ID incorrect_adapted_id = INVALID_UNICHAR_ID;
const char *truth_str = truth_text_[b].string();
// We promise not to modify the list or its contents, using a
// const BLOB_CHOICE* below.
BLOB_CHOICE_IT choices_it(const_cast<BLOB_CHOICE_LIST*>(&choices));
for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
choices_it.forward()) {
const BLOB_CHOICE* choice = choices_it.data();
if (strcmp(truth_str, unicharset.get_normed_unichar(
choice->unichar_id())) == 0) {
found = true;
break;
} else if (choice->IsAdapted()) {
incorrect_adapted = true;
incorrect_adapted_id = choice->unichar_id();
}
} // end choices_it for loop
if (!found) {
STRING debug_str = "unichar ";
debug_str += truth_str;
debug_str += " not found in classification list";
SetBlame(IRR_CLASSIFIER, debug_str, NULL, debug);
} else if (incorrect_adapted) {
STRING debug_str = "better rating for adapted ";
debug_str += unicharset.id_to_unichar(incorrect_adapted_id);
debug_str += " than for correct ";
debug_str += truth_str;
SetBlame(IRR_ADAPTION, debug_str, NULL, debug);
}
break;
}
} // end iterating over blamer_bundle->norm_truth_word
}
// Checks whether chops were made at all the character bounding box
// boundaries in the normalized truth word. If not - blames the chopper
// (IRR_CHOPPER) for an incorrect answer.
//
// Blobs of word->chopped_word and truth boxes are walked left to right,
// comparing right edges with a tolerance of norm_box_tolerance_:
//  - blob edge left of the truth edge: an extra chop, skip the blob;
//  - blob edge right of the truth edge: the chop for this truth box is
//    missing, stop;
//  - otherwise the chop matches this truth box, so move on to the next blob
//    AND the next truth box.
// Does nothing if there is no truth, no truth character boxes, or the
// chopped word has no blobs.
void BlamerBundle::SetChopperBlame(const WERD_RES* word, bool debug) {
  if (NoTruth() || !truth_has_char_boxes_ ||
      word->chopped_word->blobs.empty()) {
    return;
  }
  bool missing_chop = false;
  const int num_blobs = word->chopped_word->blobs.size();
  // Bound the walk by the vector that is actually indexed below.
  const int num_truth_boxes = norm_truth_word_.length();
  int box_index = 0;
  int blob_index = 0;
  inT16 truth_x = -1;
  while (box_index < num_truth_boxes && blob_index < num_blobs) {
    truth_x = norm_truth_word_.BlobBox(box_index).right();
    TBLOB * curr_blob = word->chopped_word->blobs[blob_index];
    const int blob_right = curr_blob->bounding_box().right();
    if (blob_right < truth_x - norm_box_tolerance_) {
      ++blob_index;  // encountered an extra chop, keep looking
    } else if (blob_right > truth_x + norm_box_tolerance_) {
      missing_chop = true;
      break;
    } else {
      // The chop for this truth box was found. Advancing box_index here is
      // essential: without it the test after the loop always sees unmatched
      // truth boxes and the chopper is blamed for every word.
      ++blob_index;
      ++box_index;
    }
  }
  if (missing_chop || box_index < num_truth_boxes) {
    STRING debug_str;
    if (missing_chop) {
      // blob_index is still in range here: the loop broke out on this blob.
      debug_str.add_str_int("Detected missing chop (tolerance=",
                            norm_box_tolerance_);
      debug_str += ") at Bounding Box=";
      TBLOB * curr_blob = word->chopped_word->blobs[blob_index];
      curr_blob->bounding_box().print_to_str(&debug_str);
      debug_str.add_str_int("\nNo chop for truth at x=", truth_x);
    } else {
      // Ran out of blobs before every truth box was matched.
      debug_str.add_str_int("Missing chops for last ",
                            num_truth_boxes - box_index);
      debug_str += " truth box(es)";
    }
    debug_str += "\nMaximally chopped word boxes:\n";
    for (blob_index = 0; blob_index < num_blobs; ++blob_index) {
      TBLOB * curr_blob = word->chopped_word->blobs[blob_index];
      curr_blob->bounding_box().print_to_str(&debug_str);
      debug_str += '\n';
    }
    debug_str += "Truth bounding boxes:\n";
    for (box_index = 0; box_index < num_truth_boxes; ++box_index) {
      norm_truth_word_.BlobBox(box_index).print_to_str(&debug_str);
      debug_str += '\n';
    }
    SetBlame(IRR_CHOPPER, debug_str, word->best_choice, debug);
  }
}
// Blames the classifier or the language model if, after running only the
// chopper, best_choice is incorrect and no blame has been yet set.
// Blames the classifier if best_choice is classifier's top choice and is a
// dictionary word (i.e. language model could not have helped).
// Otherwise, blames the language model (formerly permuter word adjustment).
void BlamerBundle::BlameClassifierOrLangModel(
const WERD_RES* word,
const UNICHARSET& unicharset, bool valid_permuter, bool debug) {
if (valid_permuter) {
// Find out whether best choice is a top choice.
best_choice_is_dict_and_top_choice_ = true;
for (int i = 0; i < word->best_choice->length(); ++i) {
BLOB_CHOICE_IT blob_choice_it(word->GetBlobChoices(i));
ASSERT_HOST(!blob_choice_it.empty());
BLOB_CHOICE *first_choice = NULL;
for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
blob_choice_it.forward()) { // find first non-fragment choice
if (!(unicharset.get_fragment(blob_choice_it.data()->unichar_id()))) {
first_choice = blob_choice_it.data();
break;
}
}
ASSERT_HOST(first_choice != NULL);
if (first_choice->unichar_id() != word->best_choice->unichar_id(i)) {
best_choice_is_dict_and_top_choice_ = false;
break;
}
}
}
STRING debug_str;
if (best_choice_is_dict_and_top_choice_) {
debug_str = "Best choice is: incorrect, top choice, dictionary word";
debug_str += " with permuter ";
debug_str += word->best_choice->permuter_name();
} else {
debug_str = "Classifier/Old LM tradeoff is to blame";
}
SetBlame(best_choice_is_dict_and_top_choice_ ? IRR_CLASSIFIER
: IRR_CLASS_OLD_LM_TRADEOFF,
debug_str, word->best_choice, debug);
}
// Sets up the correct_segmentation_* to mark the correct bounding boxes.
// Walks the blobs of word from left to right and groups consecutive blobs so
// that the right edge of each group lies within norm_box_tolerance_ of the
// right edge of the corresponding box in norm_truth_word_. For every truth
// box matched this way a (col, row) pair is appended to
// correct_segmentation_cols_ / correct_segmentation_rows_.
// If the blobs cannot be matched to all truth boxes, the blame is set to
// IRR_UNKNOWN and both vectors are cleared.
// debug is only forwarded to SetBlame.
void BlamerBundle::SetupCorrectSegmentation(const TWERD* word, bool debug) {
// Note: the hypothesis list is started even if we return early below.
params_training_bundle_.StartHypothesisList();
if (incorrect_result_reason_ != IRR_CORRECT || !truth_has_char_boxes_)
return; // Nothing to do here.
// Accumulates a trace of the matching; only used if the matching fails.
STRING debug_str;
debug_str += "Blamer computing correct_segmentation_cols\n";
// Index of the first blob of the group currently being assembled.
int curr_box_col = 0;
// Number of blobs consumed so far, including the current one.
int next_box_col = 0;
int num_blobs = word->NumBlobs();
if (num_blobs == 0) return; // No blobs to play with.
int blob_index = 0;
// Right edge of the blob after the current one; primed with blob 0 so that
// it becomes curr_box_x on the first iteration.
inT16 next_box_x = word->blobs[blob_index]->bounding_box().right();
for (int truth_idx = 0; blob_index < num_blobs &&
truth_idx < norm_truth_word_.length();
++blob_index) {
++next_box_col;
inT16 curr_box_x = next_box_x;
// On the last blob next_box_x keeps its previous value; the test below
// does not read it in that case.
if (blob_index + 1 < num_blobs)
next_box_x = word->blobs[blob_index + 1]->bounding_box().right();
inT16 truth_x = norm_truth_word_.BlobBox(truth_idx).right();
debug_str.add_str_int("Box x coord vs. truth: ", curr_box_x);
debug_str.add_str_int(" ", truth_x);
debug_str += "\n";
if (curr_box_x > (truth_x + norm_box_tolerance_)) {
break; // failed to find a matching box
} else if (curr_box_x >= truth_x - norm_box_tolerance_ && // matched
(blob_index + 1 >= num_blobs || // next box can't be included
next_box_x > truth_x + norm_box_tolerance_)) {
// The group [curr_box_col, next_box_col-1] ends at this truth box.
correct_segmentation_cols_.push_back(curr_box_col);
correct_segmentation_rows_.push_back(next_box_col-1);
++truth_idx;
debug_str.add_str_int("col=", curr_box_col);
debug_str.add_str_int(" row=", next_box_col-1);
debug_str += "\n";
curr_box_col = next_box_col;
}
// Otherwise the current blob is absorbed into the group and the search
// for the right edge of this truth box continues with the next blob.
}
// Failure: either some blobs were not consumed, or fewer groups were found
// than there are truth boxes.
if (blob_index < num_blobs || // trailing blobs
correct_segmentation_cols_.length() != norm_truth_word_.length()) {
debug_str.add_str_int("Blamer failed to find correct segmentation"
" (tolerance=", norm_box_tolerance_);
// Appended when all blobs were consumed, i.e. the blobs ran out first.
if (blob_index >= num_blobs) debug_str += " blob == NULL";
debug_str += ")\n";
debug_str.add_str_int(" path length ", correct_segmentation_cols_.length());
debug_str.add_str_int(" vs. truth ", norm_truth_word_.length());
debug_str += "\n";
SetBlame(IRR_UNKNOWN, debug_str, NULL, debug);
correct_segmentation_cols_.clear();
correct_segmentation_rows_.clear();
}
}
// Tells whether a guided segmentation search should be run. That is the case
// only when no blame has been assigned yet, no guided search is already in
// progress, truth character boxes are available, and best_choice does not
// match the truth.
bool BlamerBundle::GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const {
  if (incorrect_result_reason_ != IRR_CORRECT) return false;
  if (segsearch_is_looking_for_blame_) return false;
  if (!truth_has_char_boxes_) return false;
  return !ChoiceIsCorrect(best_choice);
}
// Setup ready to guide the segmentation search to the correct segmentation.
// The callback pp_cb is used to avoid a cyclic dependency.
// It calls into LMPainPoints::GenerateForBlamer by pre-binding the
// WERD_RES, and the LMPainPoints itself.
// pp_cb must be a permanent callback, and should be deleted by the caller.
void BlamerBundle::InitForSegSearch(const WERD_CHOICE *best_choice,
MATRIX* ratings, UNICHAR_ID wildcard_id,
bool debug, STRING *debug_str,
TessResultCallback2<bool, int, int>* cb) {
segsearch_is_looking_for_blame_ = true;
if (debug) {
tprintf("segsearch starting to look for blame\n");
}
// Fill pain points for any unclassifed blob corresponding to the
// correct segmentation state.
*debug_str += "Correct segmentation:\n";
for (int idx = 0; idx < correct_segmentation_cols_.length(); ++idx) {
debug_str->add_str_int("col=", correct_segmentation_cols_[idx]);
debug_str->add_str_int(" row=", correct_segmentation_rows_[idx]);
*debug_str += "\n";
if (!ratings->Classified(correct_segmentation_cols_[idx],
correct_segmentation_rows_[idx],
wildcard_id) &&
!cb->Run(correct_segmentation_cols_[idx],
correct_segmentation_rows_[idx])) {
segsearch_is_looking_for_blame_ = false;
*debug_str += "\nFailed to insert pain point\n";
SetBlame(IRR_SEGSEARCH_HEUR, *debug_str, best_choice, debug);
break;
}
} // end for blamer_bundle->correct_segmentation_cols/rows
}
// Returns true if the guided segsearch is in progress, i.e. the
// segsearch_is_looking_for_blame_ flag is currently set.
bool BlamerBundle::GuidedSegsearchStillGoing() const {
return segsearch_is_looking_for_blame_;
}
// Called when the segmentation search has finished; assigns the blame if the
// guided search was still looking for one, and does nothing otherwise.
// The cases, checked in this order:
//  1. best_choice is an (incorrect) dictionary word and the classifier's top
//     choice: blame the classifier. Note that *debug_str is overwritten, not
//     appended to, in this case.
//  2. The best rating of a correctly segmented path is better (lower) than
//     the rating of best_choice: the correct segmentation was never explored
//     because of pain point prioritization, so blame IRR_SEGSEARCH_PP.
//  3. Otherwise blame the classifier/language model tradeoff: even with the
//     correct segmentation available, the incorrect best_choice would have
//     been chosen.
void BlamerBundle::FinishSegSearch(const WERD_CHOICE *best_choice,
                                   bool debug, STRING *debug_str) {
  if (!segsearch_is_looking_for_blame_) return;
  segsearch_is_looking_for_blame_ = false;

  if (best_choice_is_dict_and_top_choice_) {
    *debug_str = "Best choice is: incorrect, top choice, dictionary word";
    *debug_str += " with permuter ";
    *debug_str += best_choice->permuter_name();
    SetBlame(IRR_CLASSIFIER, *debug_str, best_choice, debug);
    return;
  }

  if (best_correctly_segmented_rating_ < best_choice->rating()) {
    *debug_str += "Correct segmentation state was not explored";
    SetBlame(IRR_SEGSEARCH_PP, *debug_str, best_choice, debug);
    return;
  }

  const bool correct_paths_pruned =
      best_correctly_segmented_rating_ >= WERD_CHOICE::kBadRating;
  if (correct_paths_pruned) {
    *debug_str += "Correct segmentation paths were pruned by LM\n";
  } else {
    debug_str->add_str_double("Best correct segmentation rating ",
                              best_correctly_segmented_rating_);
    debug_str->add_str_double(" vs. best choice rating ",
                              best_choice->rating());
  }
  SetBlame(IRR_CLASS_LM_TRADEOFF, *debug_str, best_choice, debug);
}
// If the bundle is null or still does not indicate the correct result,
// fix it and use some backup reason for the blame.
void BlamerBundle::LastChanceBlame(bool debug, WERD_RES* word) {
if (word->blamer_bundle == NULL) {
word->blamer_bundle = new BlamerBundle();
word->blamer_bundle->SetBlame(IRR_PAGE_LAYOUT, "LastChanceBlame",
word->best_choice, debug);
} else if (word->blamer_bundle->incorrect_result_reason_ == IRR_NO_TRUTH) {
word->blamer_bundle->SetBlame(IRR_NO_TRUTH, "Rejected truth",
word->best_choice, debug);
} else {
bool correct = word->blamer_bundle->ChoiceIsCorrect(word->best_choice);
IncorrectResultReason irr = word->blamer_bundle->incorrect_result_reason_;
if (irr == IRR_CORRECT && !correct) {
STRING debug_str = "Choice is incorrect after recognition";
word->blamer_bundle->SetBlame(IRR_UNKNOWN, debug_str, word->best_choice,
debug);
} else if (irr != IRR_CORRECT && correct) {
if (debug) {
tprintf("Corrected %s\n", word->blamer_bundle->debug_.string());
}
word->blamer_bundle->incorrect_result_reason_ = IRR_CORRECT;
word->blamer_bundle->debug_ = "";
}
}
}
// Records misadaption debug text for a word that is about to be adapted to.
// Nothing is recorded when there is no truth for the word, or when
// best_choice matches the truth.
void BlamerBundle::SetMisAdaptionDebug(const WERD_CHOICE *best_choice,
                                       bool debug) {
  if (incorrect_result_reason_ == IRR_NO_TRUTH) return;
  if (ChoiceIsCorrect(best_choice)) return;

  misadaption_debug_ = "misadapt to word (";
  misadaption_debug_ += best_choice->permuter_name();
  misadaption_debug_ += "): ";
  FillDebugString("", best_choice, &misadaption_debug_);
  if (!debug) return;
  tprintf("%s\n", misadaption_debug_.string());
}

330
ccstruct/blamer.h Normal file
View File

@ -0,0 +1,330 @@
///////////////////////////////////////////////////////////////////////
// File: blamer.h
// Description: Module allowing precise error causes to be allocated.
// Author: Rike Antonova
// Refactored: Ray Smith
// Created: Mon Feb 04 14:37:01 PST 2013
//
// (C) Copyright 2013, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCSTRUCT_BLAMER_H_
#define TESSERACT_CCSTRUCT_BLAMER_H_
#include <stdio.h>
#include "boxword.h"
#include "genericvector.h"
#include "matrix.h"
#include "params_training_featdef.h"
#include "ratngs.h"
#include "strngs.h"
#include "tesscallback.h"
// Base tolerance used by the blamer when comparing bounding boxes.
// BlamerBundle::SetupNormTruthWord multiplies it by the DENORM's x scale to
// obtain the tolerance that is applied in normalized space.
static const inT16 kBlamerBoxTolerance = 5;
// Enum for expressing the source of error.
// Note: Please update kIncorrectResultReasonNames when modifying this enum.
enum IncorrectResultReason {
// The text recorded in best choice == truth text.
IRR_CORRECT,
// Either: Top choice is incorrect and is a dictionary word (language model
// is unlikely to help correct such errors, so blame the classifier).
// Or: the correct unichar was not included in the shortlist produced by the
// classifier at all.
IRR_CLASSIFIER,
// The chopper has not found one or more splits that correspond to the
// correct character bounding boxes recorded in BlamerBundle::truth_word.
IRR_CHOPPER,
// The classifier did include correct unichars for each blob in the correct
// segmentation, however its rating could have been too bad to allow the
// language model to pull out the correct choice. On the other hand the
// strength of the language model might have been too weak to favor the
// correct answer, thus we call this case a classifier-language model
// tradeoff error.
IRR_CLASS_LM_TRADEOFF,
// Page layout failed to produce the correct bounding box. Blame page layout
// if the truth was not found for the word, which implies that the bounding
// box of the word was incorrect (no truth word had a similar bounding box).
IRR_PAGE_LAYOUT,
// A SegSearch heuristic prevented one or more blobs of the correct
// segmentation state from being classified (e.g. the blob was too wide).
IRR_SEGSEARCH_HEUR,
// The correct segmentation state was not explored because of poor SegSearch
// pain point prioritization. We blame SegSearch pain point prioritization
// if the best rating of a choice constructed from correct segmentation is
// better than that of the best choice (i.e. if we got to explore the correct
// segmentation state, language model would have picked the correct choice).
IRR_SEGSEARCH_PP,
// Same as IRR_CLASS_LM_TRADEOFF, but used when we only run chopper on a word,
// and thus use the old language model (permuters).
// TODO(antonova): integrate the new language model with chopper
IRR_CLASS_OLD_LM_TRADEOFF,
// If there is an incorrect adaptive template match with a better score than
// a correct one (either pre-trained or adapted), mark this as adaption error.
IRR_ADAPTION,
// split_and_recog_word() failed to find a suitable split in truth.
IRR_NO_TRUTH_SPLIT,
// Truth is not available for this word (e.g. when words in the corrected
// content file are turned into ~~~~ because an appropriate alignment was not
// found).
IRR_NO_TRUTH,
// The text recorded in best choice != truth text, but none of the above
// reasons are set.
IRR_UNKNOWN,
// Number of reasons above; not a reason itself.
IRR_NUM_REASONS
};
// Blamer-related information to determine the source of errors.
struct BlamerBundle {
static const char *IncorrectReasonName(IncorrectResultReason irr);
BlamerBundle() : truth_has_char_boxes_(false),
incorrect_result_reason_(IRR_CORRECT),
lattice_data_(NULL) { ClearResults(); }
BlamerBundle(const BlamerBundle &other) {
this->CopyTruth(other);
this->CopyResults(other);
}
~BlamerBundle() { delete[] lattice_data_; }
// Accessors.
STRING TruthString() const {
STRING truth_str;
for (int i = 0; i < truth_text_.length(); ++i)
truth_str += truth_text_[i];
return truth_str;
}
IncorrectResultReason incorrect_result_reason() const {
return incorrect_result_reason_;
}
bool NoTruth() const {
return incorrect_result_reason_ == IRR_NO_TRUTH ||
incorrect_result_reason_ == IRR_PAGE_LAYOUT;
}
bool HasDebugInfo() const {
return debug_.length() > 0 || misadaption_debug_.length() > 0;
}
const STRING& debug() const {
return debug_;
}
const STRING& misadaption_debug() const {
return misadaption_debug_;
}
void UpdateBestRating(float rating) {
if (rating < best_correctly_segmented_rating_)
best_correctly_segmented_rating_ = rating;
}
int correct_segmentation_length() const {
return correct_segmentation_cols_.length();
}
// Returns true if the given ratings matrix col,row position is included
// in the correct segmentation path at the given index.
bool MatrixPositionCorrect(int index, const MATRIX_COORD& coord) {
return correct_segmentation_cols_[index] == coord.col &&
correct_segmentation_rows_[index] == coord.row;
}
void set_best_choice_is_dict_and_top_choice(bool value) {
best_choice_is_dict_and_top_choice_ = value;
}
const char* lattice_data() const {
return lattice_data_;
}
int lattice_size() const {
return lattice_size_; // size of lattice_data in bytes
}
void set_lattice_data(const char* data, int size) {
lattice_size_ = size;
delete [] lattice_data_;
lattice_data_ = new char[lattice_size_];
memcpy(lattice_data_, data, lattice_size_);
}
const tesseract::ParamsTrainingBundle& params_training_bundle() const {
return params_training_bundle_;
}
// Adds a new ParamsTrainingHypothesis to the current hypothesis list.
void AddHypothesis(const tesseract::ParamsTrainingHypothesis& hypo) {
params_training_bundle_.AddHypothesis(hypo);
}
// Functions to setup the blamer.
// Whole word string, whole word bounding box.
void SetWordTruth(const UNICHARSET& unicharset,
const char* truth_str, const TBOX& word_box);
// Single "character" string, "character" bounding box.
// May be called multiple times to indicate the characters in a word.
void SetSymbolTruth(const UNICHARSET& unicharset,
const char* char_str, const TBOX& char_box);
// Marks that there is something wrong with the truth text, like it contains
// reject characters.
void SetRejectedTruth();
// Returns true if the provided word_choice is correct.
bool ChoiceIsCorrect(const WERD_CHOICE* word_choice) const;
void ClearResults() {
norm_truth_word_.DeleteAllBoxes();
norm_box_tolerance_ = 0;
if (!NoTruth()) incorrect_result_reason_ = IRR_CORRECT;
debug_ = "";
segsearch_is_looking_for_blame_ = false;
best_correctly_segmented_rating_ = WERD_CHOICE::kBadRating;
correct_segmentation_cols_.clear();
correct_segmentation_rows_.clear();
best_choice_is_dict_and_top_choice_ = false;
delete[] lattice_data_;
lattice_data_ = NULL;
lattice_size_ = 0;
}
void CopyTruth(const BlamerBundle &other) {
truth_has_char_boxes_ = other.truth_has_char_boxes_;
truth_word_ = other.truth_word_;
truth_text_ = other.truth_text_;
incorrect_result_reason_ =
(other.NoTruth() ? other.incorrect_result_reason_ : IRR_CORRECT);
}
void CopyResults(const BlamerBundle &other) {
norm_truth_word_ = other.norm_truth_word_;
norm_box_tolerance_ = other.norm_box_tolerance_;
incorrect_result_reason_ = other.incorrect_result_reason_;
segsearch_is_looking_for_blame_ = other.segsearch_is_looking_for_blame_;
best_correctly_segmented_rating_ = other.best_correctly_segmented_rating_;
correct_segmentation_cols_ = other.correct_segmentation_cols_;
correct_segmentation_rows_ = other.correct_segmentation_rows_;
best_choice_is_dict_and_top_choice_ =
other.best_choice_is_dict_and_top_choice_;
if (other.lattice_data_ != NULL) {
lattice_data_ = new char[other.lattice_size_];
memcpy(lattice_data_, other.lattice_data_, other.lattice_size_);
lattice_size_ = other.lattice_size_;
} else {
lattice_data_ = NULL;
}
}
const char *IncorrectReason() const;
// Appends choice and truth details to the given debug string.
void FillDebugString(const STRING &msg, const WERD_CHOICE *choice,
STRING *debug);
// Sets up the norm_truth_word from truth_word using the given DENORM.
void SetupNormTruthWord(const DENORM& denorm);
// Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
// bundles) where the right edge/ of the left-hand word is word1_right,
// and the left edge of the right-hand word is word2_left.
void SplitBundle(int word1_right, int word2_left, bool debug,
BlamerBundle* bundle1, BlamerBundle* bundle2) const;
// "Joins" the blames from bundle1 and bundle2 into *this.
void JoinBlames(const BlamerBundle& bundle1, const BlamerBundle& bundle2,
bool debug);
// If a blob with the same bounding box as one of the truth character
// bounding boxes is not classified as the corresponding truth character
// blames character classifier for incorrect answer.
void BlameClassifier(const UNICHARSET& unicharset,
const TBOX& blob_box,
const BLOB_CHOICE_LIST& choices,
bool debug);
// Checks whether chops were made at all the character bounding box
// boundaries in word->truth_word. If not - blames the chopper for an
// incorrect answer.
void SetChopperBlame(const WERD_RES* word, bool debug);
// Blames the classifier or the language model if, after running only the
// chopper, best_choice is incorrect and no blame has been yet set.
// Blames the classifier if best_choice is classifier's top choice and is a
// dictionary word (i.e. language model could not have helped).
// Otherwise, blames the language model (formerly permuter word adjustment).
void BlameClassifierOrLangModel(
const WERD_RES* word,
const UNICHARSET& unicharset, bool valid_permuter, bool debug);
// Sets up the correct_segmentation_* to mark the correct bounding boxes.
void SetupCorrectSegmentation(const TWERD* word, bool debug);
// Returns true if a guided segmentation search is needed.
bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const;
// Setup ready to guide the segmentation search to the correct segmentation.
// The callback pp_cb is used to avoid a cyclic dependency.
// It calls into LMPainPoints::GenerateForBlamer by pre-binding the
// WERD_RES, and the LMPainPoints itself.
// pp_cb must be a permanent callback, and should be deleted by the caller.
void InitForSegSearch(const WERD_CHOICE *best_choice,
MATRIX* ratings, UNICHAR_ID wildcard_id,
bool debug, STRING *debug_str,
TessResultCallback2<bool, int, int>* pp_cb);
// Returns true if the guided segsearch is in progress.
bool GuidedSegsearchStillGoing() const;
// The segmentation search has ended. Sets the blame appropriately.
void FinishSegSearch(const WERD_CHOICE *best_choice,
bool debug, STRING *debug_str);
// If the bundle is null or still does not indicate the correct result,
// fix it and use some backup reason for the blame.
static void LastChanceBlame(bool debug, WERD_RES* word);
// Sets the misadaption debug if this word is incorrect, as this word is
// being adapted to.
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug);
private:
void SetBlame(IncorrectResultReason irr, const STRING &msg,
const WERD_CHOICE *choice, bool debug) {
incorrect_result_reason_ = irr;
debug_ = IncorrectReason();
debug_ += " to blame: ";
FillDebugString(msg, choice, &debug_);
if (debug) tprintf("SetBlame(): %s", debug_.string());
}
private:
// Set to true when bounding boxes for individual unichars are recorded.
bool truth_has_char_boxes_;
// The true_word (in the original image coordinate space) contains ground
// truth bounding boxes for this WERD_RES.
tesseract::BoxWord truth_word_;
// Same as above, but in normalized coordinates
// (filled in by WERD_RES::SetupForRecognition()).
tesseract::BoxWord norm_truth_word_;
// Tolerance for bounding box comparisons in normalized space.
int norm_box_tolerance_;
// Contains ground truth unichar for each of the bounding boxes in truth_word.
GenericVector<STRING> truth_text_;
// The reason for incorrect OCR result.
IncorrectResultReason incorrect_result_reason_;
// Debug text associated with the blame.
STRING debug_;
// Misadaption debug information (filled in if this word was misadapted to).
STRING misadaption_debug_;
// Variables used by the segmentation search when looking for the blame.
// Set to true while segmentation search is continued after the usual
// termination condition in order to look for the blame.
bool segsearch_is_looking_for_blame_;
// Best rating for correctly segmented path
// (set and used by SegSearch when looking for blame).
float best_correctly_segmented_rating_;
// Vectors populated by SegSearch to indicate column and row indices that
// correspond to blobs with correct bounding boxes.
GenericVector<int> correct_segmentation_cols_;
GenericVector<int> correct_segmentation_rows_;
// Set to true if best choice is a dictionary word and
// classifier's top choice.
bool best_choice_is_dict_and_top_choice_;
// Serialized segmentation search lattice.
char *lattice_data_;
int lattice_size_; // size of lattice_data in bytes
// Information about hypotheses (paths) explored by the segmentation search.
tesseract::ParamsTrainingBundle params_training_bundle_;
};
#endif // TESSERACT_CCSTRUCT_BLAMER_H_

View File

@ -29,12 +29,6 @@ namespace tesseract {
// tolerance. Otherwise, the blob may be chopped and we have to just use
// the word bounding box.
const int kBoxClipTolerance = 2;
// Min offset in baseline-normalized coords to make a character a subscript.
const int kMinSubscriptOffset = 20;
// Min offset in baseline-normalized coords to make a character a superscript.
const int kMinSuperscriptOffset = 20;
// Max y of bottom of a drop-cap blob.
const int kMaxDropCapBottom = -128;
BoxWord::BoxWord() : length_(0) {
}
@ -60,21 +54,17 @@ void BoxWord::CopyFrom(const BoxWord& src) {
boxes_.push_back(src.boxes_[i]);
}
// Factory to build a BoxWord from a TWERD and the DENORM to switch
// back to original image coordinates.
// If the denorm is not NULL, then the output is denormalized and rotated
// back to the original image coordinates.
BoxWord* BoxWord::CopyFromNormalized(const DENORM* denorm,
TWERD* tessword) {
// Factory to build a BoxWord from a TWERD using the DENORMs on each blob to
// switch back to original image coordinates.
BoxWord* BoxWord::CopyFromNormalized(TWERD* tessword) {
BoxWord* boxword = new BoxWord();
// Count the blobs.
boxword->length_ = 0;
for (TBLOB* tblob = tessword->blobs; tblob != NULL; tblob = tblob->next)
++boxword->length_;
boxword->length_ = tessword->NumBlobs();
// Allocate memory.
boxword->boxes_.reserve(boxword->length_);
for (TBLOB* tblob = tessword->blobs; tblob != NULL; tblob = tblob->next) {
for (int b = 0; b < boxword->length_; ++b) {
TBLOB* tblob = tessword->blobs[b];
TBOX blob_box;
for (TESSLINE* outline = tblob->outlines; outline != NULL;
outline = outline->next) {
@ -83,12 +73,10 @@ BoxWord* BoxWord::CopyFromNormalized(const DENORM* denorm,
do {
if (!edgept->IsHidden() || !edgept->prev->IsHidden()) {
ICOORD pos(edgept->pos.x, edgept->pos.y);
if (denorm != NULL) {
TPOINT denormed;
denorm->DenormTransform(edgept->pos, &denormed);
pos.set_x(denormed.x);
pos.set_y(denormed.y);
}
TPOINT denormed;
tblob->denorm().DenormTransform(NULL, edgept->pos, &denormed);
pos.set_x(denormed.x);
pos.set_y(denormed.y);
TBOX pt_box(pos, pos);
blob_box += pt_box;
}
@ -101,37 +89,6 @@ BoxWord* BoxWord::CopyFromNormalized(const DENORM* denorm,
return boxword;
}
// Sets up the script_pos_ member using the tessword to get the bln
// bounding boxes, the best_choice to get the unichars, and the unicharset
// to get the target positions. If small_caps is true, sub/super are not
// considered, but dropcaps are.
void BoxWord::SetScriptPositions(const UNICHARSET& unicharset, bool small_caps,
TWERD* tessword, WERD_CHOICE* best_choice) {
// Allocate memory.
script_pos_.init_to_size(length_, SP_NORMAL);
int blob_index = 0;
for (TBLOB* tblob = tessword->blobs; tblob != NULL; tblob = tblob->next,
++blob_index) {
int class_id = best_choice->unichar_id(blob_index);
TBOX blob_box = tblob->bounding_box();
int top = blob_box.top();
int bottom = blob_box.bottom();
int min_bottom, max_bottom, min_top, max_top;
unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
&min_top, &max_top);
if (bottom <= kMaxDropCapBottom) {
script_pos_[blob_index] = SP_DROPCAP;
} else if (!small_caps) {
if (top + kMinSubscriptOffset < min_top) {
script_pos_[blob_index] = SP_SUBSCRIPT;
} else if (bottom - kMinSuperscriptOffset > max_bottom) {
script_pos_[blob_index] = SP_SUPERSCRIPT;
}
}
}
}
// Clean up the bounding boxes from the polygonal approximation by
// expanding slightly, then clipping to the blobs from the original_word
// that overlap. If not null, the block provides the inverse rotation.
@ -228,9 +185,8 @@ void BoxWord::ComputeBoundingBox() {
// The callback is deleted on completion.
void BoxWord::ProcessMatchedBlobs(const TWERD& other,
TessCallback1<int>* cb) const {
TBLOB* blob = other.blobs;
for (int i = 0; i < length_ && blob != NULL; ++i, blob = blob->next) {
TBOX blob_box = blob->bounding_box();
for (int i = 0; i < length_ && i < other.NumBlobs(); ++i) {
TBOX blob_box = other.blobs[i]->bounding_box();
if (blob_box == boxes_[i])
cb->Run(i);
}
@ -238,5 +194,3 @@ void BoxWord::ProcessMatchedBlobs(const TWERD& other,
}
} // namespace tesseract.

View File

@ -22,6 +22,7 @@
#include "genericvector.h"
#include "rect.h"
#include "unichar.h"
class BLOCK;
class DENORM;
@ -34,14 +35,6 @@ class WERD_RES;
namespace tesseract {
// ScriptPos tells whether a character is subscript, superscript or normal.
enum ScriptPos {
SP_NORMAL,
SP_SUBSCRIPT,
SP_SUPERSCRIPT,
SP_DROPCAP
};
// Class to hold an array of bounding boxes for an output word and
// the bounding box of the whole word.
class BoxWord {
@ -54,19 +47,9 @@ class BoxWord {
void CopyFrom(const BoxWord& src);
// Factory to build a BoxWord from a TWERD and the DENORM to switch
// back to original image coordinates.
// If the denorm is not NULL, then the output is denormalized and rotated
// back to the original image coordinates.
static BoxWord* CopyFromNormalized(const DENORM* denorm,
TWERD* tessword);
// Sets up the script_pos_ member using the tessword to get the bln
// bounding boxes, the best_choice to get the unichars, and the unicharset
// to get the target positions. If small_caps is true, sub/super are not
// considered, but dropcaps are.
void SetScriptPositions(const UNICHARSET& unicharset, bool small_caps,
TWERD* tessword, WERD_CHOICE* best_choice);
// Factory to build a BoxWord from a TWERD using the DENORMs on each blob to
// switch back to original image coordinates.
static BoxWord* CopyFromNormalized(TWERD* tessword);
// Clean up the bounding boxes from the polygonal approximation by
// expanding slightly, then clipping to the blobs from the original_word
@ -102,11 +85,6 @@ class BoxWord {
const TBOX& BlobBox(int index) const {
return boxes_[index];
}
ScriptPos BlobPosition(int index) const {
if (index < 0 || index >= script_pos_.size())
return SP_NORMAL;
return script_pos_[index];
}
private:
void ComputeBoundingBox();
@ -114,7 +92,6 @@ class BoxWord {
TBOX bbox_;
int length_;
GenericVector<TBOX> boxes_;
GenericVector<ScriptPos> script_pos_;
};
} // namespace tesseract.

View File

@ -32,21 +32,120 @@
#include "tprintf.h"
#include "unicharset.h"
// Returns true if there are any real classification results.
bool MATRIX::Classified(int col, int row, int wildcard_id) const {
if (get(col, row) == NOT_CLASSIFIED) return false;
BLOB_CHOICE_IT b_it(get(col, row));
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
BLOB_CHOICE* choice = b_it.data();
if (choice->IsClassified())
return true;
}
return false;
}
// Expands the existing matrix in-place to make the band wider, without
// losing any existing data.
// The dimension is left unchanged; only the number of bands is replaced by
// the given bandwidth (which counts the diagonal). The work is delegated to
// ResizeWithCopy, so a call with the current bandwidth does nothing.
void MATRIX::IncreaseBandSize(int bandwidth) {
ResizeWithCopy(dimension(), bandwidth);
}
// Returns a bigger MATRIX with a new column and row in the matrix in order
// to split the blob at the given (ind,ind) diagonal location.
// Entries are relocated to the new MATRIX using the transformation defined
// by MATRIX_COORD::MapForSplit.
// Transfers the pointer data to the new MATRIX and deletes *this, so the
// object this was called on must not be used after the call; use the
// returned MATRIX instead.
MATRIX* MATRIX::ConsumeAndMakeBigger(int ind) {
int dim = dimension();
int band_width = bandwidth();
// Check to see if bandwidth needs expanding.
// Only columns in (ind - band_width, ind] are examined (clipped at 0): their
// last band slot holds a row >= ind, which MapForSplit moves one row further
// from the diagonal. If any such slot is occupied, the result gets one
// extra band so the moved entry still fits.
for (int col = ind; col >= 0 && col > ind - band_width; --col) {
if (array_[col * band_width + band_width - 1] != empty_) {
++band_width;
break;
}
}
// The result has one more column/row and the (possibly widened) band.
MATRIX* result = new MATRIX(dim + 1, band_width);
for (int col = 0; col < dim; ++col) {
// Iterate over the OLD band using bandwidth(), as band_width may have been
// increased above and would index past the old band.
for (int row = col; row < dim && row < col + bandwidth(); ++row) {
MATRIX_COORD coord(col, row);
coord.MapForSplit(ind);
BLOB_CHOICE_LIST* choices = get(col, row);
if (choices != NULL) {
// Correct matrix location on each choice.
BLOB_CHOICE_IT bc_it(choices);
for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
BLOB_CHOICE* choice = bc_it.data();
choice->set_matrix_cell(coord.col, coord.row);
}
ASSERT_HOST(coord.Valid(*result));
// The list pointer itself is moved, not copied, into the result.
result->put(coord.col, coord.row, choices);
}
}
}
delete this;
return result;
}
// Makes and returns a deep copy of *this, including all the BLOB_CHOICEs
// on the lists, but not any LanguageModelState that may be attached to the
// BLOB_CHOICEs.
// The result has the same dimension and bandwidth as *this. The caller
// receives a freshly allocated MATRIX together with freshly allocated
// BLOB_CHOICE_LISTs in every non-empty cell.
MATRIX* MATRIX::DeepCopy() const {
  int dim = dimension();
  int band_width = bandwidth();
  MATRIX* result = new MATRIX(dim, band_width);
  for (int col = 0; col < dim; ++col) {
    // Stop at the edge of the matrix as well as the edge of the band, the
    // same way ConsumeAndMakeBigger and print walk the cells. Near the last
    // columns the band storage has slots for rows >= dim that are not part
    // of the matrix and must not be read as cells.
    for (int row = col; row < dim && row < col + band_width; ++row) {
      BLOB_CHOICE_LIST* choices = get(col, row);
      if (choices != NULL) {
        BLOB_CHOICE_LIST* copy_choices = new BLOB_CHOICE_LIST;
        choices->deep_copy(copy_choices, &BLOB_CHOICE::deep_copy);
        result->put(col, row, copy_choices);
      }
    }
  }
  return result;
}
// Print the best guesses out of the match rating matrix.
void MATRIX::print(const UNICHARSET &unicharset) const {
tprintf("Ratings Matrix (top choices)\n");
tprintf("Ratings Matrix (top 3 choices)\n");
int dim = dimension();
int band_width = bandwidth();
int row, col;
for (col = 0; col < this->dimension(); ++col) tprintf("\t%d", col);
for (col = 0; col < dim; ++col) {
for (row = col; row < dim && row < col + band_width; ++row) {
BLOB_CHOICE_LIST *rating = this->get(col, row);
if (rating == NOT_CLASSIFIED) continue;
BLOB_CHOICE_IT b_it(rating);
tprintf("col=%d row=%d ", col, row);
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
tprintf("%s rat=%g cert=%g " ,
unicharset.id_to_unichar(b_it.data()->unichar_id()),
b_it.data()->rating(), b_it.data()->certainty());
}
tprintf("\n");
}
tprintf("\n");
}
tprintf("\n");
for (row = 0; row < this->dimension(); ++row) {
for (col = 0; col < dim; ++col) tprintf("\t%d", col);
tprintf("\n");
for (row = 0; row < dim; ++row) {
for (col = 0; col <= row; ++col) {
if (col == 0) tprintf("%d\t", row);
if (row >= col + band_width) {
tprintf(" \t");
continue;
}
BLOB_CHOICE_LIST *rating = this->get(col, row);
if (rating != NOT_CLASSIFIED) {
BLOB_CHOICE_IT b_it(rating);
int counter = 0;
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
tprintf("%s ", unicharset.id_to_unichar(b_it.data()->unichar_id()));
tprintf("%s ",
unicharset.id_to_unichar(b_it.data()->unichar_id()));
++counter;
if (counter == 3) break;
}

View File

@ -1,5 +1,5 @@
/* -*-C-*-
********************************************************************************
******************************************************************************
*
* File: matrix.h (Formerly matrix.h)
* Description: Ratings matrix code. (Used by associator)
@ -25,18 +25,28 @@
#ifndef TESSERACT_CCSTRUCT_MATRIX_H__
#define TESSERACT_CCSTRUCT_MATRIX_H__
#include "ratngs.h"
#include "kdpair.h"
#include "unicharset.h"
class BLOB_CHOICE_LIST;
#define NOT_CLASSIFIED reinterpret_cast<BLOB_CHOICE_LIST*>(NULL)
// A generic class to store a matrix with entries of type T.
// A generic class to hold a 2-D matrix with entries of type T, but can also
// act as a base class for other implementations, such as a triangular or
// banded matrix.
template <class T>
class GENERIC_2D_ARRAY {
public:
// Allocate a piece of memory to hold a 2d-array of the given dimension.
// Initialize all the elements of the array to empty instead of assuming
// that a default constructor can be used.
// Initializes the array size, and empty element, but cannot allocate memory
// for the subclasses or initialize because calls to the num_elements
// member will be routed to the base class implementation. Subclasses can
// either pass the memory in, or allocate after by calling Resize().
GENERIC_2D_ARRAY(int dim1, int dim2, const T& empty, T* array)
: empty_(empty), dim1_(dim1), dim2_(dim2), array_(array) {
}
// Original constructor for a full rectangular matrix DOES allocate memory
// and initialize it to empty.
GENERIC_2D_ARRAY(int dim1, int dim2, const T& empty)
: empty_(empty), dim1_(dim1), dim2_(dim2) {
array_ = new T[dim1_ * dim2_];
@ -44,26 +54,67 @@ class GENERIC_2D_ARRAY {
for (int y = 0; y < dim2_; y++)
this->put(x, y, empty_);
}
~GENERIC_2D_ARRAY() { delete[] array_; }
virtual ~GENERIC_2D_ARRAY() { delete[] array_; }
// Reallocate the array to the given size. Does not keep old data.
void Resize(int size1, int size2, const T& empty) {
empty_ = empty;
if (size1 != dim1_ || size2 != dim2_) {
dim1_ = size1;
dim2_ = size2;
delete [] array_;
array_ = new T[dim1_ * dim2_];
}
Clear();
}
// Reallocate the array to the given size, keeping old data.
void ResizeWithCopy(int size1, int size2) {
if (size1 != dim1_ || size2 != dim2_) {
T* new_array = new T[size1 * size2];
for (int col = 0; col < size1; ++col) {
for (int row = 0; row < size2; ++row) {
int old_index = col * dim2() + row;
int new_index = col * size2 + row;
if (col < dim1_ && row < dim2_) {
new_array[new_index] = array_[old_index];
} else {
new_array[new_index] = empty_;
}
}
}
delete[] array_;
array_ = new_array;
dim1_ = size1;
dim2_ = size2;
}
}
// Sets all the elements of the array to the empty value.
// The element count is taken from num_elements(), which is virtual, so a
// subclass that overrides it determines how many slots are reset here.
void Clear() {
int total_size = num_elements();
for (int i = 0; i < total_size; ++i)
array_[i] = empty_;
}
// Writes to the given file. Returns false in case of error.
// Only works with bitwise-serializeable types!
bool Serialize(FILE* fp) const {
if (!SerializeSize(fp)) return false;
if (fwrite(&empty_, sizeof(empty_), 1, fp) != 1) return false;
int size = dim1_ * dim2_;
int size = num_elements();
if (fwrite(array_, sizeof(*array_), size, fp) != size) return false;
return true;
}
// Reads from the given file. Returns false in case of error.
// Only works with bitwise-serializeable types!
// Only works with bitwise-serializeable types!
// If swap is true, assumes a big/little-endian swap is needed.
bool DeSerialize(bool swap, FILE* fp) {
if (!DeSerializeSize(swap, fp)) return false;
if (fread(&empty_, sizeof(empty_), 1, fp) != 1) return false;
if (swap) ReverseN(&empty_, sizeof(empty_));
int size = dim1_ * dim2_;
int size = num_elements();
if (fread(array_, sizeof(*array_), size, fp) != size) return false;
if (swap) {
for (int i = 0; i < size; ++i)
@ -77,7 +128,7 @@ class GENERIC_2D_ARRAY {
bool SerializeClasses(FILE* fp) const {
if (!SerializeSize(fp)) return false;
if (!empty_.Serialize(fp)) return false;
int size = dim1_ * dim2_;
int size = num_elements();
for (int i = 0; i < size; ++i) {
if (!array_[i].Serialize(fp)) return false;
}
@ -90,7 +141,7 @@ class GENERIC_2D_ARRAY {
bool DeSerializeClasses(bool swap, FILE* fp) {
if (!DeSerializeSize(swap, fp)) return false;
if (!empty_.DeSerialize(swap, fp)) return false;
int size = dim1_ * dim2_;
int size = num_elements();
for (int i = 0; i < size; ++i) {
if (!array_[i].DeSerialize(swap, fp)) return false;
}
@ -100,11 +151,14 @@ class GENERIC_2D_ARRAY {
// Provide the dimensions of this rectangular matrix.
int dim1() const { return dim1_; }
int dim2() const { return dim2_; }
// Returns the number of elements in the array.
// Banded/triangular matrices may override.
virtual int num_elements() const { return dim1_ * dim2_; }
// Expression to select a specific location in the matrix. The matrix is
// stored COLUMN-major, so the left-most index is the most significant.
// This allows [][] access to use indices in the same order as (,).
int index(int column, int row) const {
virtual int index(int column, int row) const {
return (column * dim2_ + row);
}
@ -129,19 +183,21 @@ class GENERIC_2D_ARRAY {
T* operator[](int column) {
return &array_[this->index(column, 0)];
}
const T* operator[](int column) const {
return &array_[this->index(column, 0)];
}
// Delete objects pointed to by array_[i].
void delete_matrix_pointers() {
for (int x = 0; x < dim1_; x++) {
for (int y = 0; y < dim2_; y++) {
T matrix_cell = this->get(x, y);
if (matrix_cell != empty_)
delete matrix_cell;
}
int size = num_elements();
for (int i = 0; i < size; ++i) {
T matrix_cell = array_[i];
if (matrix_cell != empty_)
delete matrix_cell;
}
}
private:
protected:
// Factored helper to serialize the size.
bool SerializeSize(FILE* fp) const {
inT32 size = dim1_;
@ -160,12 +216,7 @@ class GENERIC_2D_ARRAY {
ReverseN(&size1, sizeof(size1));
ReverseN(&size2, sizeof(size2));
}
if (size1 != dim1_ || size2 != dim2_) {
dim1_ = size1;
dim2_ = size2;
delete [] array_;
array_ = new T[dim1_ * dim2_];
}
Resize(size1, size2, empty_);
return true;
}
@ -175,25 +226,90 @@ class GENERIC_2D_ARRAY {
int dim2_; // Size of the 2nd dimension in indexing functions.
};
// A generic class to store a square matrix with entries of type T.
// A generic class to store a banded triangular matrix with entries of type T.
// In this array, the nominally square matrix is dim1_ x dim1_, and dim2_ is
// the number of bands, INCLUDING the diagonal. The storage is thus of size
// dim1_ * dim2_ and index(col, row) = col * dim2_ + row - col, and an
// assert will fail if row < col or row - col >= dim2.
template <class T>
class GENERIC_MATRIX : public GENERIC_2D_ARRAY<T> {
class BandTriMatrix : public GENERIC_2D_ARRAY<T> {
public:
// Allocate a piece of memory to hold a matrix of the given dimension.
// Initialize all the elements of the matrix to empty instead of assuming
// Allocate a piece of memory to hold a 2d-array of the given dimension.
// Initialize all the elements of the array to empty instead of assuming
// that a default constructor can be used.
GENERIC_MATRIX(int dimension, const T& empty)
: GENERIC_2D_ARRAY<T>(dimension, dimension, empty) {
BandTriMatrix(int dim1, int dim2, const T& empty)
: GENERIC_2D_ARRAY<T>(dim1, dim2, empty) {
}
// The default destructor will do.
// Provide the dimensions of this matrix.
// dimension is the size of the nominally square matrix.
int dimension() const { return this->dim1_; }
// bandwidth is the number of bands in the matrix, INCLUDING the diagonal.
int bandwidth() const { return this->dim2_; }
// Expression to select a specific location in the banded matrix.
// Each column owns dim2_ consecutive slots of storage, and within a column
// a cell is addressed by its offset from the diagonal, row - column, so
// index(column, row) = column * dim2_ + row - column.
// Asserts that row >= column and that row - column < dim2_, i.e. that the
// requested cell lies inside the stored band.
virtual int index(int column, int row) const {
ASSERT_HOST(row >= column);
ASSERT_HOST(row - column < this->dim2_);
return column * this->dim2_ + row - column;
}
// Provide the dimension of this square matrix.
int dimension() const { return this->dim1(); }
// Appends array2 corner-to-corner to *this, making an array of dimension
// equal to the sum of the individual dimensions.
// The resulting bandwidth is the larger of the two bandwidths.
// array2 is not destroyed, but is left empty, as all elements are moved
// to *this. Each moved-out cell of array2 is reset to array2's own empty
// value, so this works for any element type T, not just pointer types.
void AttachOnCorner(BandTriMatrix<T>* array2) {
  int new_dim1 = this->dim1_ + array2->dim1_;
  int new_dim2 = MAX(this->dim2_, array2->dim2_);
  T* new_array = new T[new_dim1 * new_dim2];
  for (int col = 0; col < new_dim1; ++col) {
    // j is the offset from the diagonal, i.e. the band number.
    for (int j = 0; j < new_dim2; ++j) {
      int new_index = col * new_dim2 + j;
      if (col < this->dim1_ && j < this->dim2_) {
        // Column and band both exist in *this: keep the value.
        new_array[new_index] = this->get(col, col + j);
      } else if (col >= this->dim1_ && j < array2->dim2_) {
        // Column belongs to array2: move the value over and vacate the
        // source cell.
        int src_col = col - this->dim1_;
        new_array[new_index] = array2->get(src_col, src_col + j);
        array2->put(src_col, src_col + j, array2->empty_);
      } else {
        // Band that the source matrix for this column did not have.
        new_array[new_index] = this->empty_;
      }
    }
  }
  delete[] this->array_;
  this->array_ = new_array;
  this->dim1_ = new_dim1;
  this->dim2_ = new_dim2;
}
};
class MATRIX : public GENERIC_MATRIX<BLOB_CHOICE_LIST *> {
class MATRIX : public BandTriMatrix<BLOB_CHOICE_LIST *> {
public:
MATRIX(int dimension) : GENERIC_MATRIX<BLOB_CHOICE_LIST *>(dimension,
NOT_CLASSIFIED) {}
MATRIX(int dimension, int bandwidth)
: BandTriMatrix<BLOB_CHOICE_LIST *>(dimension, bandwidth, NOT_CLASSIFIED) {}
// Returns true if there are any real classification results.
bool Classified(int col, int row, int wildcard_id) const;
// Expands the existing matrix in-place to make the band wider, without
// losing any existing data.
void IncreaseBandSize(int bandwidth);
// Returns a bigger MATRIX with a new column and row in the matrix in order
// to split the blob at the given (ind,ind) diagonal location.
// Entries are relocated to the new MATRIX using the transformation defined
// by MATRIX_COORD::MapForSplit.
// Transfers the pointer data to the new MATRIX and deletes *this.
MATRIX* ConsumeAndMakeBigger(int ind);
// Makes and returns a deep copy of *this, including all the BLOB_CHOICEs
// on the lists, but not any LanguageModelState that may be attached to the
// BLOB_CHOICEs.
MATRIX* DeepCopy() const;
// Print a shortened version of the contents of the matrix.
void print(const UNICHARSET &unicharset) const;
};
@ -203,14 +319,34 @@ struct MATRIX_COORD {
MATRIX_COORD *c = static_cast<MATRIX_COORD *>(arg);
delete c;
}
// Default constructor required by GenericHeap.
MATRIX_COORD() : col(0), row(0) {}
MATRIX_COORD(int c, int r): col(c), row(r) {}
~MATRIX_COORD() {}
bool Valid(const MATRIX &m) const {
return (col >= 0 && row >= 0 &&
col < m.dimension() && row < m.dimension());
return 0 <= col && col < m.dimension() &&
col <= row && row < col + m.bandwidth() && row < m.dimension();
}
// Remaps the col,row pair to split the blob at the given (ind,ind) diagonal
// location.
// Entries at (i,j) for i in [0,ind] and j in [ind,dim) move to (i,j+1),
// making a new row at ind.
// Entries at (i,j) for i in [ind+1,dim) and j in [i,dim) move to (i+1,j+1),
// making a new column at ind+1.
// Entries with j < ind (and therefore i < ind) are left unchanged.
// Asserts row >= col both before and after the remapping.
void MapForSplit(int ind) {
ASSERT_HOST(row >= col);
if (col > ind) ++col;
if (row >= ind) ++row;
ASSERT_HOST(row >= col);
}
int col;
int row;
};
// The MatrixCoordPair contains a MATRIX_COORD and its priority.
typedef tesseract::KDPairInc<float, MATRIX_COORD> MatrixCoordPair;
#endif // TESSERACT_CCSTRUCT_MATRIX_H__

View File

@ -472,6 +472,8 @@ void RefreshWordBlobsFromNewBlobs(BLOCK_LIST* block_list,
BLOCK_IT block_it(block_list);
for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
BLOCK* block = block_it.data();
if (block->poly_block() != NULL && !block->poly_block()->IsText())
continue; // Don't touch non-text blocks.
// Iterate over all rows in the block.
ROW_IT row_it(block->row_list());
for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {

File diff suppressed because it is too large Load Diff

View File

@ -19,6 +19,7 @@
#ifndef PAGERES_H
#define PAGERES_H
#include "blamer.h"
#include "blobs.h"
#include "boxword.h"
#include "elst.h"
@ -38,167 +39,6 @@ class Tesseract;
}
using tesseract::FontInfo;
static const inT16 kBlamerBoxTolerance = 5;
// Enum for expressing the source of error.
// Note: Please update kIncorrectResultReasonNames when modifying this enum.
enum IncorrectResultReason {
// The text recorded in best choice == truth text.
IRR_CORRECT,
// Either: Top choice is incorrect and is a dictionary word (language model
// is unlikely to help correct such errors, so blame the classifier).
// Or: the correct unichar was not included in the shortlist produced by the
// classifier at all.
IRR_CLASSIFIER,
// Chopper has not found one or more splits that correspond to the correct
// character bounding boxes recorded in BlamerBundle::truth_word.
IRR_CHOPPER,
// Classifier did include correct unichars for each blob in the correct
// segmentation, however its rating could have been too bad to allow the
// language model to pull out the correct choice. On the other hand the
// strength of the language model might have been too weak to favor the
// correct answer, thus we call this case a classifier-language model
// tradeoff error.
IRR_CLASS_LM_TRADEOFF,
// Page layout failed to produce the correct bounding box. Blame page layout
// if the truth was not found for the word, which implies that the bounding
// box of the word was incorrect (no truth word had a similar bounding box).
IRR_PAGE_LAYOUT,
// SegSearch heuristic prevented one or more blobs from the correct
// segmentation state from being classified (e.g. the blob was too wide).
IRR_SEGSEARCH_HEUR,
// The correct segmentation state was not explored because of poor SegSearch
// pain point prioritization. We blame SegSearch pain point prioritization
// if the best rating of a choice constructed from correct segmentation is
// better than that of the best choice (i.e. if we got to explore the correct
// segmentation state, language model would have picked the correct choice).
IRR_SEGSEARCH_PP,
// Same as IRR_CLASS_LM_TRADEOFF, but used when we only run chopper on a word,
// and thus use the old language model (permuters).
// TODO(antonova): integrate the new language model with chopper
IRR_CLASS_OLD_LM_TRADEOFF,
// If there is an incorrect adaptive template match with a better score than
// a correct one (either pre-trained or adapted), mark this as adaption error.
IRR_ADAPTION,
// split_and_recog_word() failed to find a suitable split in truth.
IRR_NO_TRUTH_SPLIT,
// Truth is not available for this word (e.g. when words in corrected content
// file are turned into ~~~~ because an appropriate alignment was not found).
IRR_NO_TRUTH,
// The text recorded in best choice != truth text, but none of the above
// reasons are set.
IRR_UNKNOWN,
IRR_NUM_REASONS
};
// Blamer-related information to determine the source of errors.
struct BlamerBundle {
static const char *IncorrectReasonName(IncorrectResultReason irr);
BlamerBundle() : truth_has_char_boxes(false),
incorrect_result_reason(IRR_CORRECT),
lattice_data(NULL) { ClearResults(); }
~BlamerBundle() { delete[] lattice_data; }
void ClearResults() {
norm_truth_word.DeleteAllBoxes();
norm_box_tolerance = 0;
if (!NoTruth()) incorrect_result_reason = IRR_CORRECT;
debug = "";
segsearch_is_looking_for_blame = false;
best_correctly_segmented_rating = WERD_CHOICE::kBadRating;
correct_segmentation_cols.clear();
correct_segmentation_rows.clear();
best_choice_is_dict_and_top_choice = false;
delete[] lattice_data;
lattice_data = NULL;
lattice_size = 0;
}
void CopyTruth(const BlamerBundle &other) {
truth_has_char_boxes = other.truth_has_char_boxes;
truth_word = other.truth_word;
truth_text = other.truth_text;
incorrect_result_reason =
(other.NoTruth() ? other.incorrect_result_reason : IRR_CORRECT);
}
void CopyResults(const BlamerBundle &other) {
norm_truth_word = other.norm_truth_word;
norm_box_tolerance = other.norm_box_tolerance;
incorrect_result_reason = other.incorrect_result_reason;
segsearch_is_looking_for_blame = other.segsearch_is_looking_for_blame;
best_correctly_segmented_rating =other.best_correctly_segmented_rating;
correct_segmentation_cols = other.correct_segmentation_cols;
correct_segmentation_rows = other.correct_segmentation_rows;
best_choice_is_dict_and_top_choice =
other.best_choice_is_dict_and_top_choice;
if (other.lattice_data != NULL) {
lattice_data = new char[other.lattice_size];
memcpy(lattice_data, other.lattice_data, other.lattice_size);
lattice_size = other.lattice_size;
} else {
lattice_data = NULL;
}
}
BlamerBundle(const BlamerBundle &other) {
this->CopyTruth(other);
this->CopyResults(other);
}
const char *IncorrectReason() const;
bool NoTruth() const {
return (incorrect_result_reason == IRR_NO_TRUTH ||
incorrect_result_reason == IRR_PAGE_LAYOUT);
}
void SetBlame(IncorrectResultReason irr,
const STRING &msg, const WERD_CHOICE *choice, bool debug) {
this->incorrect_result_reason = irr;
this->debug = this->IncorrectReason();
this->debug += " to blame: ";
this->FillDebugString(msg, choice, &(this->debug));
if (debug) tprintf("SetBlame(): %s", this->debug.string());
}
// Appends choice and truth details to the given debug string.
void FillDebugString(const STRING &msg, const WERD_CHOICE *choice,
STRING *debug);
// Set to true when bounding boxes for individual unichars are recorded.
bool truth_has_char_boxes;
// The true_word (in the original image coordinate space) contains ground
// truth bounding boxes for this WERD_RES.
tesseract::BoxWord truth_word;
// Same as above, but in normalized coordinates
// (filled in by WERD_RES::SetupForRecognition()).
tesseract::BoxWord norm_truth_word;
// Tolerance for bounding box comparisons in normalized space.
int norm_box_tolerance;
// Contains ground truth unichar for each of the bounding boxes in truth_word.
GenericVector<STRING> truth_text;
// The reason for incorrect OCR result.
IncorrectResultReason incorrect_result_reason;
// Debug text associated with the blame.
STRING debug;
// Misadaption debug information (filled in if this word was misadapted to).
STRING misadaption_debug;
// Variables used by the segmentation search when looking for the blame.
// Set to true while segmentation search is continued after the usual
// termination condition in order to look for the blame.
bool segsearch_is_looking_for_blame;
// Best rating for correctly segmented path
// (set and used by SegSearch when looking for blame).
float best_correctly_segmented_rating;
// Vectors populated by SegSearch to indicate column and row indices that
// correspond to blobs with correct bounding boxes.
GenericVector<int> correct_segmentation_cols;
GenericVector<int> correct_segmentation_rows;
// Set to true if best choice is a dictionary word and
// classifier's top choice.
bool best_choice_is_dict_and_top_choice;
// Serialized segmentation search lattice.
char *lattice_data;
int lattice_size; // size of lattice_data in bytes
// Information about hypotheses (paths) explored by the segmentation search.
tesseract::ParamsTrainingBundle params_training_bundle;
};
/* Forward declarations */
class BLOCK_RES;
@ -341,8 +181,11 @@ class WERD_RES : public ELIST_LINK {
// TODO(rays) determine if docqual does anything useful and delete bln_boxes
// if it doesn't.
tesseract::BoxWord* bln_boxes; // BLN input bounding boxes.
// The ROW that this word sits in. NOT owned by the WERD_RES.
ROW* blob_row;
// The denorm provides the transformation to get back to the rotated image
// coords from the chopped_word/rebuild_word BLN coords.
// coords from the chopped_word/rebuild_word BLN coords, but each blob also
// has its own denorm.
DENORM denorm; // For use on chopped_word.
// Unicharset used by the classifier output in best_choice and raw_choice.
const UNICHARSET* uch_set; // For converting back to utf8.
@ -355,13 +198,32 @@ class WERD_RES : public ELIST_LINK {
// character fragments that make up the word.
// The length of chopped_word matches length of seam_array + 1 (if set).
TWERD* chopped_word; // BLN chopped fragments output.
SEAMS seam_array; // Seams matching chopped_word.
WERD_CHOICE *best_choice; // tess output
WERD_CHOICE *raw_choice; // top choice permuter
// Alternative paths found during chopping/segmentation search stages
// (the first entry being a slim copy of best_choice).
GenericVector<WERD_CHOICE *> alt_choices;
GenericVector<GenericVector<int> > alt_states;
// Vector of SEAM* holding chopping points matching chopped_word.
GenericVector<SEAM*> seam_array;
// Widths of blobs in chopped_word.
GenericVector<int> blob_widths;
// Gaps between blobs in chopped_word. blob_gaps[i] is the gap between
// blob i and blob i+1.
GenericVector<int> blob_gaps;
// Ratings matrix contains classifier choices for each classified combination
// of blobs. The dimension is the same as the number of blobs in chopped_word
// and the leading diagonal corresponds to classifier results of the blobs
// in chopped_word. The state_ members of best_choice, raw_choice and
// best_choices all correspond to this ratings matrix and allow extraction
// of the blob choices for any given WERD_CHOICE.
MATRIX* ratings; // Owned pointer.
// Pointer to the first WERD_CHOICE in best_choices. This is the result that
// will be output from Tesseract. Note that this is now a borrowed pointer
// and should NOT be deleted.
WERD_CHOICE* best_choice; // Borrowed pointer.
// The best raw_choice found during segmentation search. Differs from the
// best_choice by being the best result according to just the character
// classifier, not taking any language model information into account.
// Unlike best_choice, the pointer IS owned by this WERD_RES.
WERD_CHOICE* raw_choice; // Owned pointer.
// Alternative results found during chopping/segmentation search stages.
// Note that being an ELIST, best_choices owns the WERD_CHOICEs.
WERD_CHOICE_LIST best_choices;
// Truth bounding boxes, text and incorrect choice reason.
BlamerBundle *blamer_bundle;
@ -462,6 +324,8 @@ class WERD_RES : public ELIST_LINK {
InitPointers();
word = the_word;
}
// Deep copies everything except the ratings MATRIX.
// To get that use deep_copy below.
WERD_RES(const WERD_RES &source) {
InitPointers();
*this = source; // see operator=
@ -545,7 +409,11 @@ class WERD_RES : public ELIST_LINK {
void InitPointers();
void Clear();
void ClearResults();
void ClearWordChoices();
void ClearRatings();
// Deep copies everything except the ratings MATRIX.
// To get that use deep_copy below.
WERD_RES& operator=(const WERD_RES& source); //from this
void CopySimpleFields(const WERD_RES& source);
@ -557,18 +425,28 @@ class WERD_RES : public ELIST_LINK {
void InitForRetryRecognition(const WERD_RES& source);
// Sets up the members used in recognition: bln_boxes, chopped_word,
// seam_array, denorm, best_choice, raw_choice. Returns false if
// seam_array, denorm. Returns false if
// the word is empty and sets up fake results. If use_body_size is
// true and row->body_size is set, then body_size will be used for
// blob normalization instead of xheight + ascrise. This flag is for
// those languages that are using CJK pitch model and thus it has to
// be true if and only if tesseract->textord_use_cjk_fp_model is
// true.
// If allow_detailed_fx is true, the feature extractor will receive fine
// precision outline information, allowing smoother features and better
// features on low resolution images.
// Returns false if the word is empty and sets up fake results.
bool SetupForTessRecognition(const UNICHARSET& unicharset_in,
tesseract::Tesseract* tesseract, Pix* pix,
bool numeric_mode, bool use_body_size,
bool allow_detailed_fx,
ROW *row, BLOCK* block);
// Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
// accumulators from a made chopped word. We presume the fields are already
// empty.
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in);
// Sets up the members used in recognition:
// bln_boxes, chopped_word, seam_array, denorm.
// Returns false if the word is empty and sets up fake results.
@ -586,6 +464,87 @@ class WERD_RES : public ELIST_LINK {
// Sets up the blamer_bundle if it is not null, using the initialized denorm.
void SetupBlamerBundle();
// Computes the blob_widths and blob_gaps from the chopped_word.
void SetupBlobWidthsAndGaps();
// Updates internal data to account for a new SEAM (chop) at the given
// blob_number. Fixes the ratings matrix and states in the choices, as well
// as the blob widths and gaps.
void InsertSeam(int blob_number, SEAM* seam);
// Returns true if all the word choices except the first have adjust_factors
// worse than the given threshold.
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const;
// Returns true if the current word is ambiguous (by number of answers or
// by dangerous ambigs.)
bool IsAmbiguous();
// Returns true if the ratings matrix size matches the sum of each of the
// segmentation states.
bool StatesAllValid();
// Prints a list of words found if debug is true or the word result matches
// the word_to_debug.
void DebugWordChoices(bool debug, const char* word_to_debug);
// Removes from best_choices all choices which are not within a reasonable
// range of the best choice.
void FilterWordChoices(int debug_level);
// Computes a set of distance thresholds used to control adaption.
// Compares the best choice for the current word to the best raw choice
// to determine which characters were classified incorrectly by the
// classifier. Then places a separate threshold into thresholds for each
// character in the word. If the classifier was correct, max_rating is placed
// into thresholds. If the classifier was incorrect, the mean match rating
// (error percentage) of the classifier's incorrect choice minus some margin
// is placed into thresholds. This can then be used by the caller to try to
// create a new template for the desired class that will classify the
// character with a rating better than the threshold value. The match rating
// placed into thresholds is never allowed to be below min_rating in order to
// prevent trying to make overly tight templates.
// min_rating limits how tight to make a template.
// max_rating limits how loose to make a template.
// rating_margin denotes the amount of margin to put in template.
void ComputeAdaptionThresholds(float certainty_scale,
float min_rating,
float max_rating,
float rating_margin,
float* thresholds);
// Saves a copy of the word_choice if it has the best unadjusted rating.
// Returns true if the word_choice was the new best.
bool LogNewRawChoice(WERD_CHOICE* word_choice);
// Consumes word_choice by adding it to best_choices, (taking ownership) if
// the certainty for word_choice is some distance of the best choice in
// best_choices, or by deleting the word_choice and returning false.
// The best_choices list is kept in sorted order by rating. Duplicates are
// removed, and the list is kept no longer than max_num_choices in length.
// Returns true if the word_choice is still a valid pointer.
bool LogNewCookedChoice(int max_num_choices, bool debug,
WERD_CHOICE* word_choice);
// Prints a brief list of all the best choices.
void PrintBestChoices() const;
// Returns the sum of the widths of the blob between start_blob and last_blob
// inclusive.
int GetBlobsWidth(int start_blob, int last_blob);
// Returns the width of a gap between the specified blob and the next one.
int GetBlobsGap(int blob_index);
// Returns the BLOB_CHOICE corresponding to the given index in the
// best choice word taken from the appropriate cell in the ratings MATRIX.
// Borrowed pointer, so do not delete. May return NULL if there is no
// BLOB_CHOICE matching the unichar_id at the given index.
BLOB_CHOICE* GetBlobChoice(int index) const;
// Returns the BLOB_CHOICE_LIST corresponding to the given index in the
// best choice word taken from the appropriate cell in the ratings MATRIX.
// Borrowed pointer, so do not delete.
BLOB_CHOICE_LIST* GetBlobChoices(int index) const;
// Moves the results fields from word to this. This takes ownership of all
// the data, so src can be destructed.
// word1.ConsumeWordResult(word);
@ -597,10 +556,11 @@ class WERD_RES : public ELIST_LINK {
void ConsumeWordResults(WERD_RES* word);
// Replace the best choice and rebuild box word.
void ReplaceBestChoice(const WERD_CHOICE& choice,
const GenericVector<int> &segmentation_state);
// choice must be from the current best_choices list.
void ReplaceBestChoice(WERD_CHOICE* choice);
// Builds the rebuild_word from the chopped_word and the best_state.
// Builds the rebuild_word and sets the best_state from the chopped_word and
// the best_choice->state.
void RebuildBestState();
// Copies the chopped_word to the rebuild_word, faking a best_state as well.
@ -610,30 +570,26 @@ class WERD_RES : public ELIST_LINK {
// Sets/replaces the box_word with one made from the rebuild_word.
void SetupBoxWord();
// Sets up the script positions in the output boxword using the best_choice
// Sets up the script positions in the best_choice using the best_choice
// to get the unichars, and the unicharset to get the target positions.
void SetScriptPositions();
// Returns the indices [start, end) containing the core of the word, stripped
// of any superscript digits on either side.
// (i.e., the non-footnote part of the word).
// Assumes that BoxWord is all set up for best_choice.
void WithoutFootnoteSpan(int *start, int *end) const;
// Given an alternate word choice and segmentation state, yield the indices
// [start, end) containing the core of the word, stripped of any superscript
// digits on either side. (i.e. stripping off the footnote parts).
void WithoutFootnoteSpan(
const WERD_CHOICE &choice, const GenericVector<int> &state,
int *start, int *end) const;
// Sets all the blobs in all the words (best choice and alternates) to be
// the given position. (When a sub/superscript is recognized as a separate
// word, it falls victim to the rule that a whole word cannot be sub or
// superscript, so this function overrides that problem.)
void SetAllScriptPositions(tesseract::ScriptPos position);
// Classifies the word with some already-calculated BLOB_CHOICEs.
// The choices are an array of blob_count pointers to BLOB_CHOICE,
// providing a single classifier result for each blob.
// The BLOB_CHOICEs are consumed and the word takes ownership.
// The number of blobs in the outword must match blob_count.
// The number of blobs in the box_word must match blob_count.
void FakeClassifyWord(int blob_count, BLOB_CHOICE** choices);
// Creates a WERD_CHOICE for the word using the top choices from the leading
// diagonal of the ratings matrix.
void FakeWordFromRatings();
// Copies the best_choice strings to the correct_text for adaption/training.
void BestChoiceToCorrectText();
@ -644,13 +600,16 @@ class WERD_RES : public ELIST_LINK {
// Returns true if anything was merged.
bool ConditionalBlobMerge(
TessResultCallback2<UNICHAR_ID, UNICHAR_ID, UNICHAR_ID>* class_cb,
TessResultCallback2<bool, const TBOX&, const TBOX&>* box_cb,
BLOB_CHOICE_LIST_CLIST *blob_choices);
TessResultCallback2<bool, const TBOX&, const TBOX&>* box_cb);
// Merges 2 adjacent blobs in the result (index and index+1) and corrects
// all the data to account for the change.
void MergeAdjacentBlobs(int index);
// Callback helper for fix_quotes returns a double quote if both
// arguments are quote, otherwise INVALID_UNICHAR_ID.
UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2);
void fix_quotes(BLOB_CHOICE_LIST_CLIST *blob_choices);
void fix_quotes();
// Callback helper for fix_hyphens returns UNICHAR_ID of - if both
// arguments are hyphen, otherwise INVALID_UNICHAR_ID.
@ -658,15 +617,21 @@ class WERD_RES : public ELIST_LINK {
// Callback helper for fix_hyphens returns true if box1 and box2 overlap
// (assuming both on the same textline, are in order and a chopped em dash.)
bool HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2);
void fix_hyphens(BLOB_CHOICE_LIST_CLIST *blob_choices);
void fix_hyphens();
// Callback helper for merge_tess_fails returns a space if both
// arguments are space, otherwise INVALID_UNICHAR_ID.
UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2);
void merge_tess_fails();
// Returns a really deep copy of *src, including the ratings MATRIX.
static WERD_RES* deep_copy(const WERD_RES* src) {
return new WERD_RES(*src);
WERD_RES* result = new WERD_RES(*src);
// That didn't copy the ratings, but we want a copy if there is one to
// begin with.
if (src->ratings != NULL)
result->ratings = src->ratings->DeepCopy();
return result;
}
// Copy blobs from word_res onto this word (eliminating spaces between).

View File

@ -0,0 +1,40 @@
///////////////////////////////////////////////////////////////////////
// File: params_training_featdef.cpp
// Description: Utility functions for params training features.
// Author: David Eger
// Created: Mon Jun 11 11:26:42 PDT 2012
//
// (C) Copyright 2012, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include <string.h>
#include "params_training_featdef.h"
namespace tesseract {
// Looks up a training feature by its textual name.
// Returns the position of name within kParamsTrainingFeatureTypeName, or -1
// if name is NULL or does not match any entry. NULL table entries are
// skipped.
int ParamsTrainingFeatureByName(const char *name) {
  if (name != NULL) {
    const int num_names = sizeof(kParamsTrainingFeatureTypeName) /
                          sizeof(kParamsTrainingFeatureTypeName[0]);
    int index = 0;
    while (index < num_names) {
      const char *candidate = kParamsTrainingFeatureTypeName[index];
      if (candidate != NULL && strcmp(name, candidate) == 0)
        return index;
      ++index;
    }
  }
  return -1;
}
} // namespace tesseract

View File

@ -25,67 +25,97 @@
namespace tesseract {
// Maximum number of unichars in the small and medium sized words
static const int kMaxSmallWordUnichars = 3;
static const int kMaxMediumWordUnichars = 6;
// Raw features extracted from a single OCR hypothesis.
// The features are non-normalized real-valued quantities with
// unbounded range and unknown distribution.
// The features are normalized (by outline length or number of unichars as
// appropriate) real-valued quantities with unbounded range and
// unknown distribution.
// Normalization / binarization of these features is done at a later stage.
// Note: when adding new fields to this enum make sure to modify
// kParamsTrainingRawFeatureTypeName enum accordingly.
enum ParamsTrainingRawFeatureType {
// What dictionary (if any) was this hypothesis found in.
// See PermuterType enum in ccstruct/ratngs.h for interpretation.
PTRAIN_RAW_FEATURE_DICT_MATCH_TYPE, // 0
// Boolean indicator of whether this hypothesis is ambiguous to a known
// dictionary word (or a valid number pattern).
PTRAIN_RAW_FEATURE_UNAMBIG_DICT_MATCH, // 1
// Shape cost of the segmentation path for this hypothesis.
PTRAIN_RAW_FEATURE_SHAPE_COST, // 2
// Character ngram probability of the string of unichars of this hypothesis.
PTRAIN_RAW_FEATURE_NGRAM_PROB, // 3
// Number of bad/inconsistent spots in this hypothesis.
PTRAIN_RAW_FEATURE_NUM_BAD_PUNC, // 4
PTRAIN_RAW_FEATURE_NUM_BAD_CASE, // 5
PTRAIN_RAW_FEATURE_NUM_BAD_CHAR_TYPE, // 6
PTRAIN_RAW_FEATURE_NUM_BAD_SPACING, // 7
PTRAIN_RAW_FEATURE_NUM_BAD_SCRIPT, // 8
PTRAIN_RAW_FEATURE_NUM_BAD_FONT, // 9
// Classifier-related features.
PTRAIN_RAW_FEATURE_WORST_CERT, // 10
PTRAIN_RAW_FEATURE_RATING, // 11
// Number of classifier results that came from adapted templates.
PTRAIN_RAW_FEATURE_ADAPTED, // 12
// Features potentially useful for normalization.
PTRAIN_RAW_FEATURE_NUM_UNICHARS, // 13
PTRAIN_RAW_FEATURE_OUTLINE_LEN, // 14
// kParamsTrainingFeatureTypeName
enum kParamsTrainingFeatureType {
// Digits
PTRAIN_DIGITS_SHORT, // 0
PTRAIN_DIGITS_MED, // 1
PTRAIN_DIGITS_LONG, // 2
// Number or pattern (NUMBER_PERM, USER_PATTERN_PERM)
PTRAIN_NUM_SHORT, // 3
PTRAIN_NUM_MED, // 4
PTRAIN_NUM_LONG, // 5
// Document word (DOC_DAWG_PERM)
PTRAIN_DOC_SHORT, // 6
PTRAIN_DOC_MED, // 7
PTRAIN_DOC_LONG, // 8
// Word (SYSTEM_DAWG_PERM, USER_DAWG_PERM, COMPOUND_PERM)
PTRAIN_DICT_SHORT, // 9
PTRAIN_DICT_MED, // 10
PTRAIN_DICT_LONG, // 11
// Frequent word (FREQ_DAWG_PERM)
PTRAIN_FREQ_SHORT, // 12
PTRAIN_FREQ_MED, // 13
PTRAIN_FREQ_LONG, // 14
PTRAIN_SHAPE_COST_PER_CHAR, // 15
PTRAIN_NGRAM_COST_PER_CHAR, // 16
PTRAIN_NUM_BAD_PUNC, // 17
PTRAIN_NUM_BAD_CASE, // 18
PTRAIN_XHEIGHT_CONSISTENCY, // 19
PTRAIN_NUM_BAD_CHAR_TYPE, // 20
PTRAIN_NUM_BAD_SPACING, // 21
PTRAIN_NUM_BAD_FONT, // 22
PTRAIN_RATING_PER_CHAR, // 23
PTRAIN_NUM_RAW_FEATURE_TYPES
PTRAIN_NUM_FEATURE_TYPES
};
static const char * const kParamsTrainingRawFeatureTypeName[] = {
"DICT_MATCH_TYPE", // 0
"UNAMBIG_DICT_MATCH", // 1
"SHAPE_COST", // 2
"NGRAM_PROB", // 3
"NUM_BAD_PUNC", // 4
"NUM_BAD_CASE", // 5
"NUM_BAD_CHAR_TYPE", // 6
"NUM_BAD_SPACING", // 7
"NUM_BAD_SCRIPT", // 8
"NUM_BAD_FONT", // 9
"WORST_CERT", // 10
"RATING", // 11
"ADAPTED", // 12
"NUM_UNICHARS", // 13
"OUTLINE_LEN", // 14
// Printable names of the params-training features. Each entry is the
// spelled-out name of the kParamsTrainingFeatureType enumerator with the same
// numeric value (see the index in the trailing comment on each line), so the
// two lists must be kept in the same order.
// ParamsTrainingFeatureByName() searches this table to map a name to its
// index.
static const char * const kParamsTrainingFeatureTypeName[] = {
"PTRAIN_DIGITS_SHORT", // 0
"PTRAIN_DIGITS_MED", // 1
"PTRAIN_DIGITS_LONG", // 2
"PTRAIN_NUM_SHORT", // 3
"PTRAIN_NUM_MED", // 4
"PTRAIN_NUM_LONG", // 5
"PTRAIN_DOC_SHORT", // 6
"PTRAIN_DOC_MED", // 7
"PTRAIN_DOC_LONG", // 8
"PTRAIN_DICT_SHORT", // 9
"PTRAIN_DICT_MED", // 10
"PTRAIN_DICT_LONG", // 11
"PTRAIN_FREQ_SHORT", // 12
"PTRAIN_FREQ_MED", // 13
"PTRAIN_FREQ_LONG", // 14
"PTRAIN_SHAPE_COST_PER_CHAR", // 15
"PTRAIN_NGRAM_COST_PER_CHAR", // 16
"PTRAIN_NUM_BAD_PUNC", // 17
"PTRAIN_NUM_BAD_CASE", // 18
"PTRAIN_XHEIGHT_CONSISTENCY", // 19
"PTRAIN_NUM_BAD_CHAR_TYPE", // 20
"PTRAIN_NUM_BAD_SPACING", // 21
"PTRAIN_NUM_BAD_FONT", // 22
"PTRAIN_RATING_PER_CHAR", // 23
};
// Returns the index of the given feature (by name),
// or -1 meaning the feature is unknown.
int ParamsTrainingFeatureByName(const char *name);
// Entry with features extracted from a single OCR hypothesis for a word.
struct ParamsTrainingHypothesis {
ParamsTrainingHypothesis() {
for (int i = 0; i < PTRAIN_NUM_RAW_FEATURE_TYPES; ++i) features[i] = 0.0;
ParamsTrainingHypothesis() : cost(0.0) {
memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
}
float features[PTRAIN_NUM_RAW_FEATURE_TYPES];
ParamsTrainingHypothesis(const ParamsTrainingHypothesis &other) {
memcpy(features, other.features,
sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
str = other.str;
cost = other.cost;
}
float features[PTRAIN_NUM_FEATURE_TYPES];
STRING str; // string corresponding to word hypothesis (for debugging)
float cost; // path cost computed by segsearch
};
// A list of hypotheses explored during one run of segmentation search.
@ -104,9 +134,10 @@ class ParamsTrainingBundle {
}
// Adds a new ParamsTrainingHypothesis to the current hypothesis list
// and returns the reference to the newly added entry.
ParamsTrainingHypothesis &AddHypothesis() {
ParamsTrainingHypothesis &AddHypothesis(
const ParamsTrainingHypothesis &other) {
if (hyp_list_vec.empty()) StartHypothesisList();
hyp_list_vec.back().push_back(ParamsTrainingHypothesis());
hyp_list_vec.back().push_back(ParamsTrainingHypothesis(other));
return hyp_list_vec.back().back();
}

View File

@ -19,13 +19,33 @@
#include "ratngs.h"
#include "blobs.h"
#include "callcpp.h"
#include "genericvector.h"
#include "matrix.h"
#include "normalis.h" // kBlnBaselineOffset.
#include "unicharset.h"
ELISTIZE (BLOB_CHOICE) CLISTIZE (BLOB_CHOICE_LIST) CLISTIZE (WERD_CHOICE);
using tesseract::ScriptPos;
ELISTIZE(BLOB_CHOICE);
ELISTIZE(WERD_CHOICE);
const float WERD_CHOICE::kBadRating = 100000.0;
// Min offset in baseline-normalized coords to make a character a subscript.
const int kMinSubscriptOffset = 20;
// Min offset in baseline-normalized coords to make a character a superscript.
const int kMinSuperscriptOffset = 20;
// Max y of bottom of a drop-cap blob.
const int kMaxDropCapBottom = -128;
// Max fraction of x-height to use as denominator in measuring x-height overlap.
const double kMaxOverlapDenominator = 0.125;
// Min fraction of x-height range that should be in agreement for matching
// x-heights.
const double kMinXHeightMatch = 0.5;
// Max tolerance on baseline position as a fraction of x-height for matching
// baselines.
const double kMaxBaselineDrift = 0.0625;
static const char kPermuterTypeNoPerm[] = "None";
static const char kPermuterTypePuncPerm[] = "Punctuation";
@ -68,20 +88,20 @@ BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
inT16 src_fontinfo_id, // font
inT16 src_fontinfo_id2, // 2nd choice font
int src_script_id, // script
inT16 min_xheight, // min xheight allowed
inT16 max_xheight, // max xheight by this char
bool adapted // adapted match or not
) {
float min_xheight, // min xheight allowed
float max_xheight, // max xheight by this char
float yshift, // yshift out of position
BlobChoiceClassifier c) { // adapted match or other
unichar_id_ = src_unichar_id;
rating_ = src_rating;
certainty_ = src_cert;
fontinfo_id_ = src_fontinfo_id;
fontinfo_id2_ = src_fontinfo_id2;
script_id_ = src_script_id;
language_model_state_ = NULL;
min_xheight_ = min_xheight;
max_xheight_ = max_xheight;
adapted_ = adapted;
yshift_ = yshift;
classifier_ = c;
}
/**
@ -96,12 +116,75 @@ BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) {
fontinfo_id_ = other.fontinfo_id();
fontinfo_id2_ = other.fontinfo_id2();
script_id_ = other.script_id();
language_model_state_ = NULL;
matrix_cell_ = other.matrix_cell_;
min_xheight_ = other.min_xheight_;
max_xheight_ = other.max_xheight_;
adapted_ = other.adapted_;
yshift_ = other.yshift();
classifier_ = other.classifier_;
}
// Tests whether *this and other are compatible in vertical position and size.
// The baselines (yshift) must differ by no more than kMaxBaselineDrift *
// x_height, and the overlap of the two [min_xheight, max_xheight] ranges,
// divided by a clipped version of the smaller range, must reach
// kMinXHeightMatch. x_height is the caller's estimate of the x-height.
// If debug is true, the measurements are printed with tprintf.
bool BLOB_CHOICE::PosAndSizeAgree(const BLOB_CHOICE& other, float x_height,
                                  bool debug) const {
  // Stage 1: the baselines must be close enough.
  double bl_delta = fabs(yshift() - other.yshift());
  if (bl_delta > kMaxBaselineDrift * x_height) {
    if (debug) {
      tprintf("Baseline diff %g for %d v %d\n",
              bl_delta, unichar_id_, other.unichar_id_);
    }
    return false;
  }
  // Stage 2: the x-height ranges must overlap sufficiently.
  double my_range = max_xheight() - min_xheight();
  double their_range = other.max_xheight() - other.min_xheight();
  double divisor = ClipToRange(MIN(my_range, their_range),
                               1.0, kMaxOverlapDenominator * x_height);
  double shared = MIN(max_xheight(), other.max_xheight()) -
                  MAX(min_xheight(), other.min_xheight());
  double overlap_fraction = shared / divisor;
  if (debug) {
    tprintf("PosAndSize for %d v %d: bl diff = %g, ranges %g, %g / %g ->%g\n",
            unichar_id_, other.unichar_id_, bl_delta,
            my_range, their_range, divisor, overlap_fraction);
  }
  return overlap_fraction >= kMinXHeightMatch;
}
// Scans bc_list for the first BLOB_CHOICE whose unichar_id equals char_id.
// Returns that entry (still owned by the list), or NULL when no entry
// matches.
BLOB_CHOICE* FindMatchingChoice(UNICHAR_ID char_id,
                                BLOB_CHOICE_LIST* bc_list) {
  BLOB_CHOICE_IT it(bc_list);
  it.mark_cycle_pt();
  while (!it.cycled_list()) {
    BLOB_CHOICE* candidate = it.data();
    if (candidate->unichar_id() == char_id)
      return candidate;
    it.forward();
  }
  return NULL;
}
// Returns the entry of kPermuterTypeNames for the given permuter code.
// The code is used directly as the table index with no range check.
const char *WERD_CHOICE::permuter_name(uinT8 permuter) {
  const char *name = kPermuterTypeNames[permuter];
  return name;
}
namespace tesseract {

// Returns a short printable tag for script_pos, or "SP_UNKNOWN" if the value
// is not one of the handled enumerators.
const char *ScriptPosToString(enum ScriptPos script_pos) {
  const char *tag = "SP_UNKNOWN";
  switch (script_pos) {
    case SP_NORMAL:
      tag = "NORM";
      break;
    case SP_SUBSCRIPT:
      tag = "SUB";
      break;
    case SP_SUPERSCRIPT:
      tag = "SUPER";
      break;
    case SP_DROPCAP:
      tag = "DROPC";
      break;
  }
  return tag;
}

}  // namespace tesseract.
/**
* WERD_CHOICE::WERD_CHOICE
*
@ -111,16 +194,13 @@ BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) {
WERD_CHOICE::WERD_CHOICE(const char *src_string,
const UNICHARSET &unicharset)
: unicharset_(&unicharset){
STRING src_lengths;
const char *ptr = src_string;
const char *end = src_string + strlen(src_string);
int step = unicharset.step(ptr);
for (; ptr < end && step > 0;
step = unicharset.step(ptr), src_lengths += step, ptr += step);
if (step != 0 && ptr == end) {
this->init(src_string, src_lengths.string(),
0.0, 0.0, NO_PERM);
} else { // there must have been an invalid unichar in the string
GenericVector<UNICHAR_ID> encoding;
GenericVector<char> lengths;
if (unicharset.encode_string(src_string, true, &encoding, &lengths, NULL)) {
lengths.push_back('\0');
STRING src_lengths = &lengths[0];
this->init(src_string, src_lengths.string(), 0.0, 0.0, NO_PERM);
} else { // There must have been an invalid unichar in the string.
this->init(8);
this->make_bad();
}
@ -152,13 +232,16 @@ void WERD_CHOICE::init(const char *src_string,
int unichar_length = src_lengths ? src_lengths[i] : 1;
unichar_ids_[i] =
unicharset_->unichar_to_id(src_string+offset, unichar_length);
fragment_lengths_[i] = 1;
state_[i] = 1;
certainties_[i] = src_certainty;
offset += unichar_length;
}
}
adjust_factor_ = 1.0f;
rating_ = src_rating;
certainty_ = src_certainty;
permuter_ = src_permuter;
dangerous_ambig_found_ = false;
}
/**
@ -166,25 +249,46 @@ void WERD_CHOICE::init(const char *src_string,
*/
WERD_CHOICE::~WERD_CHOICE() {
delete[] unichar_ids_;
delete[] fragment_lengths_;
delete_blob_choices();
delete[] script_pos_;
delete[] state_;
delete[] certainties_;
}
// Returns the entry of kPermuterTypeNames for this word's own permuter_.
const char *WERD_CHOICE::permuter_name() const {
  const char *name = kPermuterTypeNames[permuter_];
  return name;
}
/**
* WERD_CHOICE::set_blob_choices
*
* Delete current blob_choices. Set the blob_choices to the given new
* list.
*/
void WERD_CHOICE::set_blob_choices(BLOB_CHOICE_LIST_CLIST *blob_choices) {
if (blob_choices_ != blob_choices) {
delete_blob_choices();
blob_choices_ = blob_choices;
// Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
// taken from the appropriate cell in the ratings MATRIX.
// Borrowed pointer, so do not delete.
BLOB_CHOICE_LIST* WERD_CHOICE::blob_choices(int index, MATRIX* ratings) const {
MATRIX_COORD coord = MatrixCoord(index);
BLOB_CHOICE_LIST* result = ratings->get(coord.col, coord.row);
if (result == NULL) {
result = new BLOB_CHOICE_LIST;
ratings->put(coord.col, coord.row, result);
}
return result;
}
// Computes where the unichar at the given index lives in the ratings MATRIX.
// The column is the sum of the state_ entries before index; the row is the
// column plus state_[index] - 1.
MATRIX_COORD WERD_CHOICE::MatrixCoord(int index) const {
  int first_chunk = 0;
  int i = 0;
  while (i < index) {
    first_chunk += state_[i];
    ++i;
  }
  const int last_chunk = first_chunk + state_[index] - 1;
  return MATRIX_COORD(first_chunk, last_chunk);
}
// Fills in the per-unichar arrays at index from blob_choice: the unichar id
// and certainty come from blob_choice, the state becomes blob_count, and the
// script position is reset to SP_NORMAL.
void WERD_CHOICE::set_blob_choice(int index, int blob_count,
                                  const BLOB_CHOICE* blob_choice) {
  state_[index] = blob_count;
  script_pos_[index] = tesseract::SP_NORMAL;
  unichar_ids_[index] = blob_choice->unichar_id();
  certainties_[index] = blob_choice->certainty();
}
@ -211,9 +315,18 @@ bool WERD_CHOICE::contains_unichar_id(UNICHAR_ID unichar_id) const {
*/
void WERD_CHOICE::remove_unichar_ids(int start, int num) {
ASSERT_HOST(start >= 0 && start + num <= length_);
for (int i = start; i+num < length_; ++i) {
unichar_ids_[i] = unichar_ids_[i+num];
fragment_lengths_[i] = fragment_lengths_[i+num];
// Accumulate the states to account for the merged blobs.
for (int i = 0; i < num; ++i) {
if (start > 0)
state_[start - 1] += state_[start + i];
else if (start + num < length_)
state_[start + num] += state_[start + i];
}
for (int i = start; i + num < length_; ++i) {
unichar_ids_[i] = unichar_ids_[i + num];
script_pos_[i] = script_pos_[i + num];
state_[i] = state_[i + num];
certainties_[i] = certainties_[i + num];
}
length_ -= num;
}
@ -224,7 +337,7 @@ void WERD_CHOICE::remove_unichar_ids(int start, int num) {
* Reverses and mirrors unichars in unichar_ids.
*/
void WERD_CHOICE::reverse_and_mirror_unichar_ids() {
for (int i = 0; i < length_/2; ++i) {
for (int i = 0; i < length_ / 2; ++i) {
UNICHAR_ID tmp_id = unichar_ids_[i];
unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_-1-i]);
unichar_ids_[length_-1-i] = unicharset_->get_mirror(tmp_id);
@ -255,6 +368,23 @@ void WERD_CHOICE::punct_stripped(int *start, int *end) const {
(*end)++;
}
// Finds the half-open span [*pstart, *pend) of the word that remains after
// stripping superscript digits, first from the trailing end and then from
// the leading end.
void WERD_CHOICE::GetNonSuperscriptSpan(int *pstart, int *pend) const {
  // Walk back over trailing superscript digits.
  int last = length();
  for (; last > 0; --last) {
    const int idx = last - 1;
    if (!unicharset_->get_isdigit(unichar_ids_[idx]) ||
        BlobPosition(idx) != tesseract::SP_SUPERSCRIPT) {
      break;
    }
  }
  // Walk forward over leading superscript digits, never passing last.
  int first = 0;
  for (; first < last; ++first) {
    if (!unicharset_->get_isdigit(unichar_ids_[first]) ||
        BlobPosition(first) != tesseract::SP_SUPERSCRIPT) {
      break;
    }
  }
  *pstart = first;
  *pend = last;
}
WERD_CHOICE WERD_CHOICE::shallow_copy(int start, int end) const {
ASSERT_HOST(start >= 0 && start <= length_);
ASSERT_HOST(end >= 0 && end <= length_);
@ -262,7 +392,7 @@ WERD_CHOICE WERD_CHOICE::shallow_copy(int start, int end) const {
WERD_CHOICE retval(unicharset_, end - start);
for (int i = start; i < end; i++) {
retval.append_unichar_id_space_allocated(
unichar_ids_[i], fragment_lengths_[i], 0.0f, 0.0f);
unichar_ids_[i], state_[i], 0.0f, certainties_[i]);
}
return retval;
}
@ -310,12 +440,12 @@ void WERD_CHOICE::string_and_lengths(STRING *word_str,
* and call append_unichar_id_space_allocated().
*/
void WERD_CHOICE::append_unichar_id(
UNICHAR_ID unichar_id, char fragment_length,
UNICHAR_ID unichar_id, int blob_count,
float rating, float certainty) {
if (length_ == reserved_) {
this->double_the_size();
}
this->append_unichar_id_space_allocated(unichar_id, fragment_length,
this->append_unichar_id_space_allocated(unichar_id, blob_count,
rating, certainty);
}
@ -327,59 +457,31 @@ void WERD_CHOICE::append_unichar_id(
* If the permuters are NOT the same the permuter is set to COMPOUND_PERM
*/
WERD_CHOICE & WERD_CHOICE::operator+= (const WERD_CHOICE & second) {
// TODO(daria): find out why the choice was cleared this way if any
// of the pieces are empty. Add the description of this behavior
// to the comments.
// if (word_string.length () == 0 || second.word_string.length () == 0) {
// word_string = NULL; //make it empty
// word_lengths = NULL;
// delete_blob_choices();
// } else {
ASSERT_HOST(unicharset_ == second.unicharset_);
while (reserved_ < length_ + second.length()) {
this->double_the_size();
}
const UNICHAR_ID *other_unichar_ids = second.unichar_ids();
const char *other_fragment_lengths = second.fragment_lengths();
for (int i = 0; i < second.length(); ++i) {
unichar_ids_[length_ + i] = other_unichar_ids[i];
fragment_lengths_[length_ + i] = other_fragment_lengths[i];
state_[length_ + i] = second.state_[i];
certainties_[length_ + i] = second.certainties_[i];
script_pos_[length_ + i] = second.BlobPosition(i);
}
length_ += second.length();
if (second.adjust_factor_ > adjust_factor_)
adjust_factor_ = second.adjust_factor_;
rating_ += second.rating(); // add ratings
if (second.certainty() < certainty_) // take min
certainty_ = second.certainty();
if (second.dangerous_ambig_found_)
dangerous_ambig_found_ = true;
if (permuter_ == NO_PERM) {
permuter_ = second.permuter();
} else if (second.permuter() != NO_PERM &&
second.permuter() != permuter_) {
permuter_ = COMPOUND_PERM;
}
// Append a deep copy of second blob_choices if it exists.
if (second.blob_choices_ != NULL) {
if (this->blob_choices_ == NULL)
this->blob_choices_ = new BLOB_CHOICE_LIST_CLIST;
BLOB_CHOICE_LIST_C_IT this_blob_choices_it;
BLOB_CHOICE_LIST_C_IT second_blob_choices_it;
this_blob_choices_it.set_to_list(this->blob_choices_);
this_blob_choices_it.move_to_last();
second_blob_choices_it.set_to_list(second.blob_choices_);
for (second_blob_choices_it.mark_cycle_pt();
!second_blob_choices_it.cycled_list();
second_blob_choices_it.forward()) {
BLOB_CHOICE_LIST* blob_choices_copy = new BLOB_CHOICE_LIST();
blob_choices_copy->deep_copy(second_blob_choices_it.data(),
&BLOB_CHOICE::deep_copy);
this_blob_choices_it.add_after_then_move(blob_choices_copy);
}
}
return *this;
}
@ -397,55 +499,202 @@ WERD_CHOICE& WERD_CHOICE::operator=(const WERD_CHOICE& source) {
unicharset_ = source.unicharset_;
const UNICHAR_ID *other_unichar_ids = source.unichar_ids();
const char *other_fragment_lengths = source.fragment_lengths();
for (int i = 0; i < source.length(); ++i) {
unichar_ids_[i] = other_unichar_ids[i];
fragment_lengths_[i] = other_fragment_lengths[i];
state_[i] = source.state_[i];
certainties_[i] = source.certainties_[i];
script_pos_[i] = source.BlobPosition(i);
}
length_ = source.length();
adjust_factor_ = source.adjust_factor_;
rating_ = source.rating();
certainty_ = source.certainty();
min_x_height_ = source.min_x_height();
max_x_height_ = source.max_x_height();
permuter_ = source.permuter();
fragment_mark_ = source.fragment_mark();
// Delete existing blob_choices
this->delete_blob_choices();
// Deep copy blob_choices of source
if (source.blob_choices_ != NULL) {
BLOB_CHOICE_LIST_C_IT this_blob_choices_it;
BLOB_CHOICE_LIST_C_IT source_blob_choices_it;
this->blob_choices_ = new BLOB_CHOICE_LIST_CLIST();
this_blob_choices_it.set_to_list(this->blob_choices_);
source_blob_choices_it.set_to_list(source.blob_choices_);
for (source_blob_choices_it.mark_cycle_pt();
!source_blob_choices_it.cycled_list();
source_blob_choices_it.forward()) {
BLOB_CHOICE_LIST* blob_choices_copy = new BLOB_CHOICE_LIST();
blob_choices_copy->deep_copy(source_blob_choices_it.data(),
&BLOB_CHOICE::deep_copy);
this_blob_choices_it.add_after_then_move(blob_choices_copy);
}
}
dangerous_ambig_found_ = source.dangerous_ambig_found_;
return *this;
}
/**********************************************************************
* WERD_CHOICE::delete_blob_choices
*
* Clear the blob_choices list, delete it and set it to NULL.
**********************************************************************/
void WERD_CHOICE::delete_blob_choices() {
if (blob_choices_ != NULL) {
blob_choices_->deep_clear();
delete blob_choices_;
blob_choices_ = NULL;
// Sets up the script_pos_ member using the blobs of word to get the bln
// bounding boxes, *this to get the unichars, and this->unicharset
// to get the target positions. If small_caps is true, sub/super are not
// considered, but dropcaps are.
// NOTE: word should be the chopped_word. (Fully segmented.)
// If word has no blobs, or fewer blobs than the state_ entries of this choice
// consume, every position is left as SP_NORMAL.
void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD* word) {
  // Since WERD_CHOICE isn't supposed to depend on a Tesseract,
  // we don't have easy access to the flags Tesseract stores. Therefore, debug
  // for this module is hard compiled in.
  int debug = 0;

  // Initialize to normal.
  for (int i = 0; i < length_; ++i)
    script_pos_[i] = tesseract::SP_NORMAL;
  if (word->blobs.empty())
    return;

  // The main loop below consumes max(1, state_[i]) blobs per unichar (or
  // exactly one when there is no state_). Refuse to run if the word cannot
  // supply that many, rather than indexing past the end of word->blobs.
  int chunks_needed = 0;
  for (int i = 0; i < length_; ++i)
    chunks_needed += (state_ != NULL && state_[i] > 1) ? state_[i] : 1;
  if (chunks_needed > word->blobs.size()) {
    if (debug >= 1) {
      tprintf("SetScriptPositions: choice needs %d blobs, word has only %d\n",
              chunks_needed, word->blobs.size());
    }
    return;
  }

  // Number of unichars assigned to each ScriptPos value, indexed by the
  // enum value.
  int position_counts[4];
  for (int i = 0; i < 4; i++) {
    position_counts[i] = 0;
  }

  int chunk_index = 0;
  for (int blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
    TBLOB* tblob = word->blobs[chunk_index];
    int uni_id = unichar_id(blob_index);
    TBOX blob_box = tblob->bounding_box();
    if (state_ != NULL) {
      // Merge the boxes of all the chunks that make up this unichar.
      for (int i = 1; i < state_[blob_index]; ++i) {
        ++chunk_index;
        tblob = word->blobs[chunk_index];
        blob_box += tblob->bounding_box();
      }
    }
    script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box,
                                               uni_id);
    if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) {
      script_pos_[blob_index] = tesseract::SP_NORMAL;
    }
    position_counts[script_pos_[blob_index]]++;
  }
  // If almost everything looks like a superscript or subscript,
  // we most likely just got the baseline wrong.
  if (position_counts[tesseract::SP_SUBSCRIPT] > 0.75 * length_ ||
      position_counts[tesseract::SP_SUPERSCRIPT] > 0.75 * length_) {
    if (debug >= 2) {
      tprintf("Most characters of %s are subscript or superscript.\n"
              "That seems wrong, so I'll assume we got the baseline wrong\n",
              unichar_string().string());
    }
    for (int i = 0; i < length_; i++) {
      ScriptPos sp = script_pos_[i];
      if (sp == tesseract::SP_SUBSCRIPT || sp == tesseract::SP_SUPERSCRIPT) {
        position_counts[sp]--;
        position_counts[tesseract::SP_NORMAL]++;
        script_pos_[i] = tesseract::SP_NORMAL;
      }
    }
  }

  if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) ||
      debug >= 2) {
    tprintf("SetScriptPosition on %s\n", unichar_string().string());
    // Re-run the classification with printing enabled. Only the box of the
    // first chunk of each unichar is reported here.
    int debug_chunk = 0;
    for (int blob_index = 0; blob_index < length_; ++blob_index) {
      if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) {
        TBLOB* tblob = word->blobs[debug_chunk];
        ScriptPositionOf(true, *unicharset_, tblob->bounding_box(),
                         unichar_id(blob_index));
      }
      debug_chunk += state_ != NULL ? state_[blob_index] : 1;
    }
  }
}
// Sets the script_pos_ member from some source positions with a given length.
// length must equal length_ (enforced with ASSERT_HOST).
// The values are copied into the existing script_pos_ buffer instead of
// replacing it with a freshly allocated array of only length entries: other
// members (e.g. operator+= and append_unichar_id) index script_pos_ up to
// reserved_ without reallocating, so shrinking the buffer here would let a
// later append write past its end.
void WERD_CHOICE::SetScriptPositions(const tesseract::ScriptPos* positions,
                                     int length) {
  ASSERT_HOST(length == length_);
  // Nothing to do when copying onto ourselves or when there is nothing to
  // copy (script_pos_ may be NULL for an empty, unreserved choice).
  if (positions == script_pos_ || length <= 0)
    return;
  // memmove rather than memcpy in case positions points into script_pos_.
  memmove(script_pos_, positions, sizeof(positions[0]) * length);
}
// Overwrites the script position of each of the length_ unichars with
// position.
void WERD_CHOICE::SetAllScriptPositions(tesseract::ScriptPos position) {
  int remaining = length_;
  while (remaining-- > 0) {
    script_pos_[remaining] = position;
  }
}
/* static */
// Classifies a blob box as normal, subscript, superscript or drop-cap by
// comparing its top and bottom with the top/bottom ranges the unicharset
// reports for unichar_id. The tests are applied in this order:
//   drop-cap:    bottom <= kMaxDropCapBottom
//   subscript:   top below (min_top - kMinSubscriptOffset) and bottom below
//                (kBlnBaselineOffset - kMinSubscriptOffset)
//   superscript: bottom above (max_bottom + kMinSuperscriptOffset)
// If print_debug is true the decision and all the inputs are printed.
ScriptPos WERD_CHOICE::ScriptPositionOf(bool print_debug,
                                        const UNICHARSET& unicharset,
                                        const TBOX& blob_box,
                                        UNICHAR_ID unichar_id) {
  const int top = blob_box.top();
  const int bottom = blob_box.bottom();
  int min_bottom, max_bottom, min_top, max_top;
  unicharset.get_top_bottom(unichar_id,
                            &min_bottom, &max_bottom,
                            &min_top, &max_top);

  const int sub_thresh_top = min_top - kMinSubscriptOffset;
  const int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset;
  const int sup_thresh_bot = max_bottom + kMinSuperscriptOffset;

  ScriptPos position;
  if (bottom <= kMaxDropCapBottom) {
    position = tesseract::SP_DROPCAP;
  } else if (top < sub_thresh_top && bottom < sub_thresh_bot) {
    position = tesseract::SP_SUBSCRIPT;
  } else if (bottom > sup_thresh_bot) {
    position = tesseract::SP_SUPERSCRIPT;
  } else {
    position = tesseract::SP_NORMAL;
  }

  if (print_debug) {
    const char *pos_name = ScriptPosToString(position);
    tprintf("%s Character %s[bot:%d top: %d] "
            "bot_range[%d,%d] top_range[%d, %d] "
            "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n",
            pos_name, unicharset.id_to_unichar(unichar_id),
            bottom, top,
            min_bottom, max_bottom, min_top, max_top,
            sub_thresh_bot, sub_thresh_top,
            sup_thresh_bot);
  }
  return position;
}
// Returns the script-id (eg Han) of the dominant script in the word.
// Counts the unichars of each script, folds the Hiragana and Katakana counts
// into Han when those scripts exist in the unicharset, and picks the script
// with the highest count. If that count is less than half the word length
// (integer division), the unicharset's null_sid() is returned instead.
int WERD_CHOICE::GetTopScriptID() const {
  const int num_scripts = unicharset_->get_script_table_size();
  int *counts = new int[num_scripts];
  for (int s = 0; s < num_scripts; ++s) counts[s] = 0;

  // Tally one vote per unichar for its script.
  for (int i = 0; i < length_; ++i) {
    ++counts[unicharset_->get_script(unichar_id(i))];
  }

  const int null_sid = unicharset_->null_sid();
  const int han_sid = unicharset_->han_sid();
  if (han_sid != null_sid) {
    // Add the Hiragana & Katakana counts to Han and zero them out.
    const int hiragana_sid = unicharset_->hiragana_sid();
    if (hiragana_sid != null_sid) {
      counts[han_sid] += counts[hiragana_sid];
      counts[hiragana_sid] = 0;
    }
    const int katakana_sid = unicharset_->katakana_sid();
    if (katakana_sid != null_sid) {
      counts[han_sid] += counts[katakana_sid];
      counts[katakana_sid] = 0;
    }
  }

  // Note that high script ID overrides lower one on a tie, thus biasing
  // towards non-Common script (if sorted that way in unicharset file).
  int best_sid = 0;
  for (int s = 1; s < num_scripts; ++s) {
    if (counts[s] >= counts[best_sid]) best_sid = s;
  }
  const bool is_dominant = counts[best_sid] >= length_ / 2;
  delete[] counts;
  return is_dominant ? best_sid : null_sid;
}
// Fixes the state_ for a chop at the given blob_position: finds the first
// unichar whose cumulative blob count exceeds blob_position and increments
// its blob count by one. Does nothing if no unichar covers blob_position.
void WERD_CHOICE::UpdateStateForSplit(int blob_position) {
  int blobs_so_far = 0;
  int index = 0;
  while (index < length_) {
    blobs_so_far += state_[index];
    if (blobs_so_far > blob_position) break;
    ++index;
  }
  if (index < length_) {
    ++state_[index];
  }
}
// Returns the sum of the first length_ entries of state_, i.e. the total
// number of blobs covered by the word.
int WERD_CHOICE::TotalOfStates() const {
  int blob_total = 0;
  const int *end = state_ + length_;
  for (const int *count = state_; count != end; ++count) {
    blob_total += *count;
  }
  return blob_total;
}
/**
@ -453,32 +702,87 @@ void WERD_CHOICE::delete_blob_choices() {
*
* Print WERD_CHOICE to stdout.
*/
const void WERD_CHOICE::print(const char *msg) const {
tprintf("%s WERD_CHOICE:\n", msg);
tprintf("length_ %d reserved_ %d permuter_ %d\n",
length_, reserved_, permuter_);
tprintf("rating_ %.4f certainty_ %.4f", rating_, certainty_);
if (fragment_mark_) {
tprintf(" fragment_mark_ true");
void WERD_CHOICE::print(const char *msg) const {
tprintf("%s : ", msg);
for (int i = 0; i < length_; ++i) {
tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i]));
}
tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n",
rating_, certainty_, adjust_factor_, permuter_,
min_x_height_, max_x_height_, dangerous_ambig_found_);
tprintf("pos");
for (int i = 0; i < length_; ++i) {
tprintf("\t%s", ScriptPosToString(script_pos_[i]));
}
tprintf("\nstr");
for (int i = 0; i < length_; ++i) {
tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i]));
}
tprintf("\nstate:");
for (int i = 0; i < length_; ++i) {
tprintf("\t%d ", state_[i]);
}
tprintf("\nC");
for (int i = 0; i < length_; ++i) {
tprintf("\t%.3f", certainties_[i]);
}
tprintf("\n");
if (unichar_string_.length() > 0) {
tprintf("unichar_string_ %s unichar_lengths_ %s\n",
unichar_string_.string(), unichar_lengths_.string());
}
tprintf("unichar_ids: ");
int i;
for (i = 0; i < length_; ++i) {
tprintf("%d ", unichar_ids_[i]);
}
tprintf("\nfragment_lengths_: ");
for (i = 0; i < length_; ++i) {
tprintf("%d ", fragment_lengths_[i]);
}
tprintf("\n");
fflush(stdout);
}
// Prints msg followed by the blob count of each unichar (space separated)
// and a terminating newline.
void WERD_CHOICE::print_state(const char *msg) const {
  tprintf("%s", msg);
  int index = 0;
  while (index < length_) {
    tprintf(" %d", state_[index]);
    ++index;
  }
  tprintf("\n");
}
// Draws the blobs of word in a "Segmentation" window, using one color per
// unichar of *this (cycling through kNumColors colors), then waits for a
// click in the window. Nothing is drawn if the state is identical to the one
// passed on the previous call or if word has no blobs. The window and the
// remembered state are function-local statics shared by all WERD_CHOICEs.
// Compiles to a no-op when GRAPHICS_DISABLED is defined.
void WERD_CHOICE::DisplaySegmentation(TWERD* word) {
#ifndef GRAPHICS_DISABLED
  // Number of different colors to draw with.
  const int kNumColors = 6;
  static ScrollView *segm_window = NULL;
  // State drawn by the previous call; refreshed on every call.
  static GenericVector<int> prev_drawn_state;

  bool state_changed = prev_drawn_state.size() != length_;
  if (state_changed) prev_drawn_state.init_to_size(length_, 0);
  for (int i = 0; i < length_; ++i) {
    if (prev_drawn_state[i] != state_[i]) {
      state_changed = true;
      prev_drawn_state[i] = state_[i];
    }
  }
  if (!state_changed || word->blobs.empty()) return;

  // Reuse the window if it already exists, otherwise create it.
  if (segm_window != NULL) {
    segm_window->Clear();
  } else {
    segm_window = new ScrollView("Segmentation", 5, 10, 500, 256,
                                 2000.0, 256.0, true);
  }

  // Plot each unichar's run of blobs in its own color and accumulate the
  // overall bounding box for zooming.
  TBOX word_box;
  int blob_index = 0;
  for (int c = 0; c < length_; ++c) {
    const ScrollView::Color color =
        static_cast<ScrollView::Color>(c % kNumColors + 3);
    const int end_blob = blob_index + state_[c];
    for (; blob_index < end_blob; ++blob_index) {
      TBLOB* blob = word->blobs[blob_index];
      word_box += blob->bounding_box();
      blob->plot(segm_window, color, color);
    }
  }
  segm_window->ZoomToRectangle(word_box.left(), word_box.top(),
                               word_box.right(), word_box.bottom());
  segm_window->Update();
  window_wait(segm_window);
#endif
}
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1,
const WERD_CHOICE &word2) {
const UNICHARSET *uchset = word1.unicharset();
@ -526,114 +830,3 @@ void print_ratings_list(const char *msg,
tprintf("\n");
fflush(stdout);
}
/**
* print_ratings_list
*
* Print ratings list (unichar ids only).
*/
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings) {
if (ratings->length() == 0) {
tprintf("%s:<none>\n", msg);
return;
}
if (*msg != '\0') {
tprintf("%s\n", msg);
}
BLOB_CHOICE_IT c_it;
c_it.set_to_list(ratings);
for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
c_it.data()->print(NULL);
if (!c_it.at_last()) tprintf("\n");
}
tprintf("\n");
fflush(stdout);
}
/**
* print_ratings_info
*
* Send all the ratings out to the logfile.
*
* @param fp file to use
* @param ratings list of results
* @param current_unicharset unicharset that can be used
* for id-to-unichar conversion
*/
void print_ratings_info(FILE *fp,
BLOB_CHOICE_LIST *ratings,
const UNICHARSET &current_unicharset) {
inT32 index; // to list
const char* first_char = NULL; // character
FLOAT32 first_rat; // rating
FLOAT32 first_cert; // certainty
const char* sec_char = NULL; // character
FLOAT32 sec_rat = 0.0f; // rating
FLOAT32 sec_cert = 0.0f; // certainty
BLOB_CHOICE_IT c_it = ratings; // iterator
index = ratings->length();
if (index > 0) {
first_char = current_unicharset.id_to_unichar(c_it.data()->unichar_id());
first_rat = c_it.data()->rating();
first_cert = -c_it.data()->certainty();
if (index > 1) {
sec_char = current_unicharset.id_to_unichar(
c_it.data_relative(1)->unichar_id());
sec_rat = c_it.data_relative(1)->rating();
sec_cert = -c_it.data_relative(1)->certainty();
} else {
sec_char = NULL;
sec_rat = -1;
sec_cert = -1;
}
} else {
first_char = NULL;
first_rat = -1;
first_cert = -1;
}
if (first_char != NULL && (*first_char == '\0' || *first_char == ' '))
first_char = NULL;
if (sec_char != NULL && (*sec_char == '\0' || *sec_char == ' '))
sec_char = NULL;
tprintf(" " INT32FORMAT " %s %g %g %s %g %g\n",
ratings->length(),
first_char != NULL ? first_char : "~",
first_rat, first_cert, sec_char != NULL ? sec_char : "~",
sec_rat, sec_cert);
}
/**
* print_char_choices_list
*/
void print_char_choices_list(const char *msg,
const BLOB_CHOICE_LIST_VECTOR &char_choices,
const UNICHARSET &current_unicharset,
BOOL8 detailed) {
if (*msg != '\0') tprintf("%s\n", msg);
for (int x = 0; x < char_choices.length(); ++x) {
BLOB_CHOICE_IT c_it;
c_it.set_to_list(char_choices.get(x));
tprintf("\nchar[%d]: %s\n", x,
current_unicharset.debug_str( c_it.data()->unichar_id()).string());
if (detailed)
print_ratings_list("", char_choices.get(x), current_unicharset);
}
}
/**
 * print_word_alternates_list
 *
 * Prints the unichar string of word followed by the unichar strings of all
 * its alternates, each in double quotes and separated by commas.
 * Does nothing if either pointer is NULL.
 */
void print_word_alternates_list(
    WERD_CHOICE *word,
    GenericVector<WERD_CHOICE *> *alternates) {
  if (word == NULL || alternates == NULL) return;
  STRING joined;
  int index = 0;
  while (index < alternates->size()) {
    // Closing quote, comma, opening quote between consecutive alternates.
    if (index != 0) joined += "\", \"";
    joined += alternates->get(index)->unichar_string();
    index++;
  }
  tprintf("Alternates for \"%s\": {\"%s\"}\n",
          word->unichar_string().string(), joined.string());
}

View File

@ -23,11 +23,27 @@
#include <assert.h>
#include "clst.h"
#include "elst.h"
#include "genericvector.h"
#include "matrix.h"
#include "unichar.h"
#include "unicharset.h"
#include "werd.h"
class MATRIX;
class TBLOB;
class TWERD;
// Enum to describe the source of a BLOB_CHOICE to make it possible to determine
// whether a blob has been classified by inspecting the BLOB_CHOICEs.
// No trailing comma after the last enumerator: that is only valid from C++11
// on, and the other enums in this file do not use one.
enum BlobChoiceClassifier {
  BCC_STATIC_CLASSIFIER,   // From the char_norm classifier.
  BCC_ADAPTED_CLASSIFIER,  // From the adaptive classifier.
  BCC_SPECKLE_CLASSIFIER,  // Backup for failed classification.
  BCC_AMBIG,               // Generated by ambiguity detection.
  BCC_FAKE                 // From some other process.
};
class BLOB_CHOICE: public ELIST_LINK
{
public:
@ -38,20 +54,23 @@ class BLOB_CHOICE: public ELIST_LINK
rating_ = MAX_FLOAT32;
certainty_ = -MAX_FLOAT32;
script_id_ = -1;
language_model_state_ = NULL;
min_xheight_ = 0;
max_xheight_ = 0;
adapted_ = false;
xgap_before_ = 0;
xgap_after_ = 0;
min_xheight_ = 0.0f;
max_xheight_ = 0.0f;
yshift_ = 0.0f;
classifier_ = BCC_FAKE;
}
BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
float src_rating, // rating
float src_cert, // certainty
inT16 src_fontinfo_id, // font
inT16 src_fontinfo_id2, // 2nd choice font
inT16 src_fontinfo_id, // font
inT16 src_fontinfo_id2, // 2nd choice font
int script_id, // script
inT16 min_xheight, // min xheight in image pixel units
inT16 max_xheight, // max xheight allowed by this char
bool adapted); // adapted match or not
float min_xheight, // min xheight in image pixel units
float max_xheight, // max xheight allowed by this char
float yshift, // the larger of y shift (top or bottom)
BlobChoiceClassifier c); // adapted match or other
BLOB_CHOICE(const BLOB_CHOICE &other);
~BLOB_CHOICE() {}
@ -73,8 +92,8 @@ class BLOB_CHOICE: public ELIST_LINK
int script_id() const {
return script_id_;
}
void *language_model_state() {
return language_model_state_;
const MATRIX_COORD& matrix_cell() {
return matrix_cell_;
}
inT16 xgap_before() const {
return xgap_before_;
@ -82,14 +101,25 @@ class BLOB_CHOICE: public ELIST_LINK
inT16 xgap_after() const {
return xgap_after_;
}
inT16 min_xheight() const {
float min_xheight() const {
return min_xheight_;
}
inT16 max_xheight() const {
float max_xheight() const {
return max_xheight_;
}
bool adapted() const {
return adapted_;
float yshift() const {
return yshift_;
}
BlobChoiceClassifier classifier() const {
return classifier_;
}
// Returns true if this choice was produced by the adaptive classifier.
bool IsAdapted() const {
  return BCC_ADAPTED_CLASSIFIER == classifier_;
}
// Returns true if this choice came from one of the actual classifiers
// (static, adapted or speckle) rather than from ambiguity detection or some
// other process.
bool IsClassified() const {
  switch (classifier_) {
    case BCC_STATIC_CLASSIFIER:
    case BCC_ADAPTED_CLASSIFIER:
    case BCC_SPECKLE_CLASSIFIER:
      return true;
    default:
      return false;
  }
}
void set_unichar_id(UNICHAR_ID newunichar_id) {
@ -110,8 +140,9 @@ class BLOB_CHOICE: public ELIST_LINK
void set_script(int newscript_id) {
script_id_ = newscript_id;
}
void set_language_model_state(void *language_model_state) {
language_model_state_ = language_model_state;
void set_matrix_cell(int col, int row) {
matrix_cell_.col = col;
matrix_cell_.row = row;
}
void set_xgap_before(inT16 gap) {
xgap_before_ = gap;
@ -119,19 +150,39 @@ class BLOB_CHOICE: public ELIST_LINK
void set_xgap_after(inT16 gap) {
xgap_after_ = gap;
}
void set_adapted(bool adapted) {
adapted_ = adapted;
void set_classifier(BlobChoiceClassifier classifier) {
classifier_ = classifier;
}
// Returns a newly allocated BLOB_CHOICE holding a copy of *src.
// The copy is made by default-constructing and then assigning, so it goes
// through operator= rather than the copy constructor.
// NOTE(review): nothing here frees the result; presumably the caller (or the
// list it is added to) takes ownership - confirm against callers.
static BLOB_CHOICE* deep_copy(const BLOB_CHOICE* src) {
BLOB_CHOICE* choice = new BLOB_CHOICE;
*choice = *src;
return choice;
}
void print(const UNICHARSET *unicharset) {
tprintf("r%.2f c%.2f : %d %s", rating_, certainty_, unichar_id_,
// Returns true if *this and other agree on the baseline and x-height
// to within some tolerance based on a given estimate of the x-height.
bool PosAndSizeAgree(const BLOB_CHOICE& other, float x_height,
bool debug) const;
void print(const UNICHARSET *unicharset) const {
tprintf("r%.2f c%.2f x[%g,%g]: %d %s",
rating_, certainty_,
min_xheight_, max_xheight_, unichar_id_,
(unicharset == NULL) ? "" :
unicharset->debug_str(unichar_id_).string());
}
// Prints the summary produced by print(NULL) (no unichar text, since no
// unicharset is supplied) followed by the script id, both font ids, the
// y shift and the numeric classifier code, terminated by a newline.
void print_full() const {
print(NULL);
tprintf(" script=%d, font1=%d, font2=%d, yshift=%g, classifier=%d\n",
script_id_, fontinfo_id_, fontinfo_id2_, yshift_, classifier_);
}
// Sort function for sorting BLOB_CHOICEs in increasing order of rating.
static int SortByRating(const void *p1, const void *p2) {
const BLOB_CHOICE *bc1 =
*reinterpret_cast<const BLOB_CHOICE * const *>(p1);
const BLOB_CHOICE *bc2 =
*reinterpret_cast<const BLOB_CHOICE * const *>(p2);
return (bc1->rating_ < bc2->rating_) ? -1 : 1;
}
private:
UNICHAR_ID unichar_id_; // unichar id
@ -149,21 +200,26 @@ class BLOB_CHOICE: public ELIST_LINK
// k is defined as above to normalize -klog p to the range [0, 1].
float certainty_; // absolute
int script_id_;
// Stores language model information about this BLOB_CHOICE. Used during
// the segmentation search for BLOB_CHOICEs in BLOB_CHOICE_LISTs that are
// recorded in the ratings matrix.
// The pointer is owned/managed by the segmentation search.
void *language_model_state_;
// Holds the position of this choice in the ratings matrix.
// Used to locate the position in the matrix during path backtracking.
MATRIX_COORD matrix_cell_;
inT16 xgap_before_;
inT16 xgap_after_;
// X-height range (in image pixels) that this classification supports.
inT16 min_xheight_;
inT16 max_xheight_;
bool adapted_; // true if this is a match from adapted templates
float min_xheight_;
float max_xheight_;
// yshift_ - The vertical distance (in image pixels) the character is
// shifted (up or down) from an acceptable y position.
float yshift_;
BlobChoiceClassifier classifier_; // What generated *this.
};
// Make BLOB_CHOICE listable.
ELISTIZEH (BLOB_CHOICE) CLISTIZEH (BLOB_CHOICE_LIST)
ELISTIZEH(BLOB_CHOICE)
// Return the BLOB_CHOICE in bc_list matching a given unichar_id,
// or NULL if there is no match.
BLOB_CHOICE *FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list);
// Permuter codes used in WERD_CHOICEs.
enum PermuterType {
@ -180,11 +236,27 @@ enum PermuterType {
USER_DAWG_PERM, // 10
FREQ_DAWG_PERM, // 11
COMPOUND_PERM, // 12
NUM_PERMUTER_TYPES
};
class WERD_CHOICE {
namespace tesseract {
// ScriptPos tells whether a character is subscript, superscript or normal.
// WERD_CHOICE stores one value per unichar in script_pos_.
enum ScriptPos {
SP_NORMAL,       // Default position given to every newly set unichar.
SP_SUBSCRIPT,    // Blob top and bottom both lie below the subscript thresholds.
SP_SUPERSCRIPT,  // Blob bottom lies above the superscript threshold.
SP_DROPCAP       // Blob bottom lies at or below the drop-cap limit.
};
// Returns a printable name for script_pos; used with %s in debug output.
const char *ScriptPosToString(tesseract::ScriptPos script_pos);
} // namespace tesseract.
class WERD_CHOICE : public ELIST_LINK {
public:
static const float kBadRating;
static const char *permuter_name(uinT8 permuter);
WERD_CHOICE(const UNICHARSET *unicharset)
: unicharset_(unicharset) { this->init(8); }
@ -213,6 +285,12 @@ class WERD_CHOICE {
inline int length() const {
return length_;
}
float adjust_factor() const {
return adjust_factor_;
}
void set_adjust_factor(float factor) {
adjust_factor_ = factor;
}
inline const UNICHAR_ID *unichar_ids() const {
return unichar_ids_;
}
@ -220,12 +298,13 @@ class WERD_CHOICE {
assert(index < length_);
return unichar_ids_[index];
}
inline const char *fragment_lengths() const {
return fragment_lengths_;
inline int state(int index) const {
return state_[index];
}
inline const char fragment_length(int index) const {
assert(index < length_);
return fragment_lengths_[index];
// Returns the script position of the unichar at index, or SP_NORMAL if index
// is outside [0, length_).
tesseract::ScriptPos BlobPosition(int index) const {
  const bool in_range = index >= 0 && index < length_;
  return in_range ? script_pos_[index] : tesseract::SP_NORMAL;
}
inline float rating() const {
return rating_;
@ -233,23 +312,41 @@ class WERD_CHOICE {
inline float certainty() const {
return certainty_;
}
inline float certainty(int index) const {
return certainties_[index];
}
inline float min_x_height() const {
return min_x_height_;
}
inline float max_x_height() const {
return max_x_height_;
}
inline void set_x_heights(float min_height, float max_height) {
min_x_height_ = min_height;
max_x_height_ = max_height;
}
inline uinT8 permuter() const {
return permuter_;
}
const char *permuter_name() const;
inline bool fragment_mark() const {
return fragment_mark_;
}
inline BLOB_CHOICE_LIST_CLIST* blob_choices() {
return blob_choices_;
}
// Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
// taken from the appropriate cell in the ratings MATRIX.
// Borrowed pointer, so do not delete.
BLOB_CHOICE_LIST* blob_choices(int index, MATRIX* ratings) const;
// Returns the MATRIX_COORD corresponding to the location in the ratings
// MATRIX for the given index into the word.
MATRIX_COORD MatrixCoord(int index) const;
inline void set_unichar_id(UNICHAR_ID unichar_id, int index) {
assert(index < length_);
unichar_ids_[index] = unichar_id;
}
inline void set_fragment_length(char flen, int index) {
assert(index < length_);
fragment_lengths_[index] = flen;
bool dangerous_ambig_found() const {
return dangerous_ambig_found_;
}
void set_dangerous_ambig_found_(bool value) {
dangerous_ambig_found_ = value;
}
inline void set_rating(float new_val) {
rating_ = new_val;
@ -260,9 +357,6 @@ class WERD_CHOICE {
inline void set_permuter(uinT8 perm) {
permuter_ = perm;
}
inline void set_fragment_mark(bool new_fragment_mark) {
fragment_mark_ = new_fragment_mark;
}
// Note: this function should only be used if all the fields
// are populated manually with set_* functions (rather than
// (copy)constructors and append_* functions).
@ -270,19 +364,24 @@ class WERD_CHOICE {
ASSERT_HOST(reserved_ >= len);
length_ = len;
}
void set_blob_choices(BLOB_CHOICE_LIST_CLIST *blob_choices);
/// Make more space in unichar_id_ and fragment_lengths_ arrays.
inline void double_the_size() {
if (reserved_ > 0) {
unichar_ids_ = GenericVector<UNICHAR_ID>::double_the_size_memcpy(
reserved_, unichar_ids_);
fragment_lengths_ = GenericVector<char>::double_the_size_memcpy(
reserved_, fragment_lengths_);
script_pos_ = GenericVector<tesseract::ScriptPos>::double_the_size_memcpy(
reserved_, script_pos_);
state_ = GenericVector<int>::double_the_size_memcpy(
reserved_, state_);
certainties_ = GenericVector<float>::double_the_size_memcpy(
reserved_, certainties_);
reserved_ *= 2;
} else {
unichar_ids_ = new UNICHAR_ID[1];
fragment_lengths_ = new char[1];
script_pos_ = new tesseract::ScriptPos[1];
state_ = new int[1];
certainties_ = new float[1];
reserved_ = 1;
}
}
@ -293,18 +392,24 @@ class WERD_CHOICE {
reserved_ = reserved;
if (reserved > 0) {
unichar_ids_ = new UNICHAR_ID[reserved];
fragment_lengths_ = new char[reserved];
script_pos_ = new tesseract::ScriptPos[reserved];
state_ = new int[reserved];
certainties_ = new float[reserved];
} else {
unichar_ids_ = NULL;
fragment_lengths_ = NULL;
script_pos_ = NULL;
state_ = NULL;
certainties_ = NULL;
}
length_ = 0;
adjust_factor_ = 1.0f;
rating_ = 0.0;
certainty_ = MAX_FLOAT32;
min_x_height_ = 0.0f;
max_x_height_ = MAX_FLOAT32;
permuter_ = NO_PERM;
fragment_mark_ = false;
blob_choices_ = NULL;
unichars_in_script_order_ = false; // Tesseract is strict left-to-right.
dangerous_ambig_found_ = false;
}
/// Helper function to build a WERD_CHOICE from the given string,
@ -321,34 +426,39 @@ class WERD_CHOICE {
length_ = 0;
rating_ = kBadRating;
certainty_ = -MAX_FLOAT32;
fragment_mark_ = false;
}
/// This function assumes that there is enough space reserved
/// in the WERD_CHOICE for adding another unichar.
/// This is an efficient alternative to append_unichar_id().
inline void append_unichar_id_space_allocated(
UNICHAR_ID unichar_id, char fragment_length,
UNICHAR_ID unichar_id, int blob_count,
float rating, float certainty) {
assert(reserved_ > length_);
length_++;
this->set_unichar_id(unichar_id, fragment_length,
this->set_unichar_id(unichar_id, blob_count,
rating, certainty, length_-1);
}
void append_unichar_id(UNICHAR_ID unichar_id, char fragment_length,
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count,
float rating, float certainty);
inline void set_unichar_id(UNICHAR_ID unichar_id, char fragment_length,
inline void set_unichar_id(UNICHAR_ID unichar_id, int blob_count,
float rating, float certainty, int index) {
assert(index < length_);
unichar_ids_[index] = unichar_id;
fragment_lengths_[index] = fragment_length;
state_[index] = blob_count;
certainties_[index] = certainty;
script_pos_[index] = tesseract::SP_NORMAL;
rating_ += rating;
if (certainty < certainty_) {
certainty_ = certainty;
}
}
// Sets the entries for the given index from the BLOB_CHOICE, assuming
// unit fragment lengths, but setting the state for this index to blob_count.
void set_blob_choice(int index, int blob_count,
const BLOB_CHOICE* blob_choice);
bool contains_unichar_id(UNICHAR_ID unichar_id) const;
void remove_unichar_ids(int index, int num);
@ -364,6 +474,11 @@ class WERD_CHOICE {
// punctuation from the left and right.
void punct_stripped(int *start_core, int *end_core) const;
// Returns the indices [start, end) containing the core of the word, stripped
// of any superscript digits on either side. (i.e., the non-footnote part
// of the word). There is no guarantee that the output range is non-empty.
void GetNonSuperscriptSpan(int *start, int *end) const;
// Return a copy of this WERD_CHOICE with the choices [start, end).
// The result is useful only for checking against a dictionary.
WERD_CHOICE shallow_copy(int start, int end) const;
@ -402,8 +517,42 @@ class WERD_CHOICE {
this->string_and_lengths(&unichar_string_, &unichar_lengths_);
return unichar_lengths_;
}
const void print() const { this->print(""); }
const void print(const char *msg) const;
// Sets up the script_pos_ member using the blobs_list to get the bln
// bounding boxes, *this to get the unichars, and this->unicharset
// to get the target positions. If small_caps is true, sub/super are not
// considered, but dropcaps are.
// NOTE: blobs_list should be the chopped_word blobs. (Fully segmented.)
void SetScriptPositions(bool small_caps, TWERD* word);
// Sets the script_pos_ member from some source positions with a given length.
void SetScriptPositions(const tesseract::ScriptPos* positions, int length);
// Sets all the script_pos_ positions to the given position.
void SetAllScriptPositions(tesseract::ScriptPos position);
static tesseract::ScriptPos ScriptPositionOf(bool print_debug,
const UNICHARSET& unicharset,
const TBOX& blob_box,
UNICHAR_ID unichar_id);
// Returns the "dominant" script ID for the word. By "dominant", the script
// must account for at least half the characters. Otherwise, it returns the
// unicharset's null_sid().
// Note that for Japanese, Hiragana and Katakana are simply treated as Han.
int GetTopScriptID() const;
// Fixes the state_ for a chop at the given blob_position.
void UpdateStateForSplit(int blob_position);
// Returns the sum of all the state elements, being the total number of blobs.
int TotalOfStates() const;
void print() const { this->print(""); }
void print(const char *msg) const;
// Prints the segmentation state with an introductory message.
void print_state(const char *msg) const;
// Displays the segmentation state of *this (if not the same as the last
// one displayed) and waits for a click in the window.
void DisplaySegmentation(TWERD* word);
WERD_CHOICE& operator+= ( // concatanate
const WERD_CHOICE & second);// second on first
@ -412,41 +561,55 @@ class WERD_CHOICE {
private:
const UNICHARSET *unicharset_;
// TODO(rays) Perhaps replace the multiple arrays with an array of structs?
// unichar_ids_ is an array of classifier "results" that make up a word.
// For each unichar_ids_[i], script_pos_[i] has the sub/super/normal position
// of each unichar_id.
// state_[i] indicates the number of blobs in WERD_RES::chopped_word that
// were put together to make the classification results in the ith position
// in unichar_ids_, and certainties_[i] is the certainty of the choice that
// was used in this word.
// == Change from before ==
// Previously there was fragment_lengths_ that allowed a word to be
// artificially composed of multiple fragment results. Since the new
// segmentation search doesn't do fragments, treatment of fragments has
// been moved to a lower level, augmenting the ratings matrix with the
// combined fragments, and allowing the language-model/segmentation-search
// to deal with only the combined unichar_ids.
UNICHAR_ID *unichar_ids_; // unichar ids that represent the text of the word
char *fragment_lengths_; // number of fragments in each unichar
tesseract::ScriptPos* script_pos_; // Normal/Sub/Superscript of each unichar.
int* state_; // Number of blobs in each unichar.
float* certainties_; // Certainty of each unichar.
int reserved_; // size of the above arrays
int length_; // word length
// Factor that was used to adjust the rating.
float adjust_factor_;
// Rating is the sum of the ratings of the individual blobs in the word.
float rating_; // size related
// certainty is the min (worst) certainty of the individual blobs in the word.
float certainty_; // absolute
// xheight computed from the result, or 0 if inconsistent.
float min_x_height_;
float max_x_height_;
uinT8 permuter_; // permuter code
bool fragment_mark_; // if true, indicates that this choice
// was chosen over a better one that
// contained a fragment
BLOB_CHOICE_LIST_CLIST *blob_choices_; // best choices for each blob
// Normally, the blob_choices_ represent the recognition results in order
// Normally, the ratings_ matrix represents the recognition results in order
// from left-to-right. However, some engines (say Cube) may return
// recognition results in the order of the script's major reading direction
// (for Arabic, that is right-to-left).
bool unichars_in_script_order_;
// True if NoDangerousAmbig found an ambiguity.
bool dangerous_ambig_found_;
// The following variables are populated and passed by reference any
// time unichar_string() or unichar_lengths() are called.
mutable STRING unichar_string_;
mutable STRING unichar_lengths_;
bool unichar_info_present;
private:
void delete_blob_choices();
};
// Make WERD_CHOICE listable.
ELISTIZEH (WERD_CHOICE)
ELISTIZEH(WERD_CHOICE)
typedef GenericVector<BLOB_CHOICE_LIST *> BLOB_CHOICE_LIST_VECTOR;
typedef GenericVector<WERD_CHOICE_LIST *> WERD_CHOICE_LIST_VECTOR;
// Utilities for comparing WERD_CHOICEs
@ -454,27 +617,11 @@ bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1,
const WERD_CHOICE &word2);
// Utilities for debug printing.
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings);
void print_ratings_list(
const char *msg, // intro message
BLOB_CHOICE_LIST *ratings, // list of results
const UNICHARSET &current_unicharset // unicharset that can be used
// for id-to-unichar conversion
);
void print_ratings_info(
FILE *fp, // file to use
BLOB_CHOICE_LIST *ratings, // list of results
const UNICHARSET &current_unicharset // unicharset that can be used
// for id-to-unichar conversion
);
void print_char_choices_list(
const char *msg,
const BLOB_CHOICE_LIST_VECTOR &char_choices,
const UNICHARSET &current_unicharset,
BOOL8 detailed
);
void print_word_alternates_list(
WERD_CHOICE *word,
GenericVector<WERD_CHOICE *> *alternates);
#endif

View File

@ -171,6 +171,16 @@ void TBOX::plot( //paint box
}
#endif
// Appends the bounding box to *str in the form
// (left,bottom)->(right,top), without a trailing newline.
void TBOX::print_to_str(STRING *str) const {
  STRING &out = *str;
  // Equivalent of "(%d,%d)->(%d,%d)", left(), bottom(), right(), top().
  out.add_str_int("(", left());
  out.add_str_int(",", bottom());
  out.add_str_int(")->(", right());
  out.add_str_int(",", top());
  out += ')';
}
// Writes to the given file. Returns false in case of error.
bool TBOX::Serialize(FILE* fp) const {
if (!bot_left.Serialize(fp)) return false;

View File

@ -24,6 +24,7 @@
#include "points.h"
#include "ndminx.h"
#include "scrollview.h"
#include "strngs.h"
#include "tprintf.h"
class DLLSYM TBOX { // bounding box
@ -264,15 +265,8 @@ class DLLSYM TBOX { // bounding box
tprintf("Bounding box=(%d,%d)->(%d,%d)\n",
left(), bottom(), right(), top());
}
// Same as print(), but appends debug information to the given string
// instead of printing it to stdout.
void append_debug(STRING *str) const {
char buffer[256];
sprintf(buffer, "Bounding box=(%d,%d)->(%d,%d)\n",
left(), bottom(), right(), top());
*str += buffer;
}
// Appends the bounding box as (%d,%d)->(%d,%d) to a STRING.
void print_to_str(STRING *str) const;
#ifndef GRAPHICS_DISABLED
void plot( // use current settings

View File

@ -27,8 +27,8 @@
----------------------------------------------------------------------*/
#include "seam.h"
#include "blobs.h"
#include "callcpp.h"
#include "structures.h"
#include "freelist.h"
#include "tprintf.h"
#ifdef __UNIX__
#include <assert.h>
@ -38,7 +38,6 @@
V a r i a b l e s
----------------------------------------------------------------------*/
#define NUM_STARTING_SEAMS 20
makestructure(newseam, free_seam, SEAM);
/*----------------------------------------------------------------------
Public Function Code
@ -66,7 +65,7 @@ bool point_in_split(SPLIT *split, EDGEPT *point1, EDGEPT *point2) {
* seam.
* @returns TRUE if one of them is.
*/
bool point_in_seam(SEAM *seam, SPLIT *split) {
bool point_in_seam(const SEAM *seam, SPLIT *split) {
return (point_in_split(seam->split1, split->point1, split->point2) ||
point_in_split(seam->split2, split->point1, split->point2) ||
point_in_split(seam->split3, split->point1, split->point2));
@ -96,16 +95,6 @@ bool point_used_by_seam(SEAM *seam, EDGEPT *point) {
point_used_by_split(seam->split3, point);
}
/**
 * @name add_seam
 *
 * Appends seam to seam_list via array_push and returns the resulting list.
 * Callers must use the returned value in place of seam_list.
 */
SEAMS add_seam(SEAMS seam_list, SEAM *seam) {
  SEAMS extended_list = array_push(seam_list, seam);
  return extended_list;
}
/**
* @name combine_seam
*
@ -126,7 +115,8 @@ void combine_seams(SEAM *dest_seam, SEAM *source_seam) {
else if (!dest_seam->split3)
dest_seam->split3 = source_seam->split1;
else
cprintf("combine_seam: Seam is too crowded, can't be combined !\n");
delete source_seam->split1; // Wouldn't have fitted.
source_seam->split1 = NULL;
}
if (source_seam->split2) {
if (!dest_seam->split2)
@ -134,35 +124,17 @@ void combine_seams(SEAM *dest_seam, SEAM *source_seam) {
else if (!dest_seam->split3)
dest_seam->split3 = source_seam->split2;
else
cprintf("combine_seam: Seam is too crowded, can't be combined !\n");
delete source_seam->split2; // Wouldn't have fitted.
source_seam->split2 = NULL;
}
if (source_seam->split3) {
if (!dest_seam->split3)
dest_seam->split3 = source_seam->split3;
else
cprintf("combine_seam: Seam is too crowded, can't be combined !\n");
}
free_seam(source_seam);
}
/**
* @name delete_seam
*
* Free this seam record and the splits that are attached to it.
*/
void delete_seam(void *arg) { //SEAM *seam)
SEAM *seam = (SEAM *) arg;
if (seam) {
if (seam->split1)
delete_split(seam->split1);
if (seam->split2)
delete_split(seam->split2);
if (seam->split3)
delete_split(seam->split3);
free_seam(seam);
delete source_seam->split3; // Wouldn't have fitted.
source_seam->split3 = NULL;
}
delete source_seam;
}
/**
@ -172,36 +144,17 @@ void delete_seam(void *arg) { //SEAM *seam)
* present in the starting segmentation. Each of the seams created
* by this routine have location information only.
*/
SEAMS start_seam_list(TBLOB *blobs) {
TBLOB *blob;
SEAMS seam_list;
void start_seam_list(TWERD *word, GenericVector<SEAM*>* seam_array) {
seam_array->truncate(0);
TPOINT location;
/* Seam slot per char */
seam_list = new_seam_list ();
for (blob = blobs; blob->next != NULL; blob = blob->next) {
TBOX bbox = blob->bounding_box();
TBOX nbox = blob->next->bounding_box();
for (int b = 1; b < word->NumBlobs(); ++b) {
TBOX bbox = word->blobs[b - 1]->bounding_box();
TBOX nbox = word->blobs[b]->bounding_box();
location.x = (bbox.right() + nbox.left()) / 2;
location.y = (bbox.bottom() + bbox.top() + nbox.bottom() + nbox.top()) / 4;
seam_list = add_seam(seam_list,
new_seam(0.0, location, NULL, NULL, NULL));
seam_array->push_back(new SEAM(0.0f, location, NULL, NULL, NULL));
}
return seam_list;
}
/**
 * @name free_seam_list
 *
 * Free all the seams that have been allocated in this list. Reclaim
 * the memory for each of the splits as well.
 *
 * Each element is released with delete_seam, then the array itself is
 * released with array_free. seam_list must not be used afterwards.
 */
void free_seam_list(SEAMS seam_list) {
// Loop index driven by the array_loop macro.
int x;
// NOTE(review): array_loop presumably visits every element index of
// seam_list in turn - confirm against the macro definition.
array_loop(seam_list, x) delete_seam(array_value (seam_list, x));
array_free(seam_list);
}
@ -210,32 +163,26 @@ void free_seam_list(SEAMS seam_list) {
*
* @returns true if insert_seam will succeed.
*/
bool test_insert_seam(SEAMS seam_list,
int index,
TBLOB *left_blob,
TBLOB *first_blob) {
bool test_insert_seam(const GenericVector<SEAM*>& seam_array,
TWERD *word, int index) {
SEAM *test_seam;
TBLOB *blob;
int test_index;
int list_length;
list_length = array_count (seam_list);
for (test_index=0, blob=first_blob->next;
test_index < index;
test_index++, blob=blob->next) {
test_seam = (SEAM *) array_value(seam_list, test_index);
list_length = seam_array.size();
for (int test_index = 0; test_index < index; ++test_index) {
test_seam = seam_array[test_index];
if (test_index + test_seam->widthp < index &&
test_seam->widthp + test_index == index - 1 &&
account_splits_right(test_seam, blob) < 0)
account_splits(test_seam, word, test_index + 1, 1) < 0)
return false;
}
for (test_index=index, blob=left_blob->next;
test_index < list_length;
test_index++, blob=blob->next) {
test_seam = (SEAM *) array_value(seam_list, test_index);
for (int test_index = index; test_index < list_length; test_index++) {
test_seam = seam_array[test_index];
if (test_index - test_seam->widthn >= index &&
test_index - test_seam->widthn == index &&
account_splits_left(test_seam, first_blob, blob) < 0)
account_splits(test_seam, word, test_index + 1, -1) < 0)
return false;
}
return true;
@ -247,58 +194,51 @@ bool test_insert_seam(SEAMS seam_list,
* Add another seam to a collection of seams at a particular location
* in the seam array.
*/
SEAMS insert_seam(SEAMS seam_list,
int index,
SEAM *seam,
TBLOB *left_blob,
TBLOB *first_blob) {
void insert_seam(const TWERD* word, int index, SEAM *seam,
GenericVector<SEAM*>* seam_array) {
SEAM *test_seam;
TBLOB *blob;
int test_index;
int list_length;
list_length = array_count(seam_list);
for (test_index=0, blob=first_blob->next;
test_index < index;
test_index++, blob=blob->next) {
test_seam = (SEAM *) array_value(seam_list, test_index);
list_length = seam_array->size();
for (int test_index = 0; test_index < index; ++test_index) {
test_seam = seam_array->get(test_index);
if (test_index + test_seam->widthp >= index) {
test_seam->widthp++; /*got in the way */
} else if (test_seam->widthp + test_index == index - 1) {
test_seam->widthp = account_splits_right(test_seam, blob);
test_seam->widthp = account_splits(test_seam, word, test_index + 1, 1);
if (test_seam->widthp < 0) {
cprintf("Failed to find any right blob for a split!\n");
tprintf("Failed to find any right blob for a split!\n");
print_seam("New dud seam", seam);
print_seam("Failed seam", test_seam);
}
}
}
for (test_index=index, blob=left_blob->next;
test_index < list_length;
test_index++, blob=blob->next) {
test_seam = (SEAM *) array_value(seam_list, test_index);
for (int test_index = index; test_index < list_length; test_index++) {
test_seam = seam_array->get(test_index);
if (test_index - test_seam->widthn < index) {
test_seam->widthn++; /*got in the way */
} else if (test_index - test_seam->widthn == index) {
test_seam->widthn = account_splits_left(test_seam, first_blob, blob);
test_seam->widthn = account_splits(test_seam, word, test_index + 1, -1);
if (test_seam->widthn < 0) {
cprintf("Failed to find any left blob for a split!\n");
tprintf("Failed to find any left blob for a split!\n");
print_seam("New dud seam", seam);
print_seam("Failed seam", test_seam);
}
}
}
return (array_insert (seam_list, index, seam));
seam_array->insert(seam, index);
}
/**
* @name account_splits_right
* @name account_splits
*
* Account for all the splits by looking to the right.
* in the blob list.
* Account for all the splits by looking to the right (blob_direction == 1),
* or to the left (blob_direction == -1) in the word.
*/
int account_splits_right(SEAM *seam, TBLOB *blob) {
int account_splits(const SEAM *seam, const TWERD *word, int blob_index,
int blob_direction) {
inT8 found_em[3];
inT8 width;
@ -309,6 +249,7 @@ int account_splits_right(SEAM *seam, TBLOB *blob) {
return 0;
width = 0;
do {
TBLOB* blob = word->blobs[blob_index];
if (!found_em[0])
found_em[0] = find_split_in_blob(seam->split1, blob);
if (!found_em[1])
@ -319,54 +260,12 @@ int account_splits_right(SEAM *seam, TBLOB *blob) {
return width;
}
width++;
blob = blob->next;
} while (blob != NULL);
blob_index += blob_direction;
} while (0 <= blob_index && blob_index < word->NumBlobs());
return -1;
}
/**
* @name account_splits_left
*
* Account for all the splits by looking to the left.
* in the blob list.
*/
int account_splits_left(SEAM *seam, TBLOB *blob, TBLOB *end_blob) {
  // Scratch state shared with the recursive helper. The helper itself
  // initialises split_found and resets the width when it reaches end_blob.
  inT8 split_found[3];
  inT8 blob_width = 0;
  inT32 recursion_depth = 0;
  account_splits_left_helper(seam, blob, end_blob,
                             &recursion_depth, &blob_width, split_found);
  // The helper stores -1 in the width when the outermost call still has an
  // unfound split.
  return static_cast<int>(blob_width);
}
// Recursive worker for account_splits_left.
//
// The call chain first walks forward along blob->next until it reaches
// end_blob, where the bookkeeping is initialised. The split search then runs
// as the recursion unwinds, so blobs are examined from end_blob back towards
// the blob passed to the outermost call.
//
// seam:     seam whose split1/split2/split3 are being looked for.
// blob:     blob examined by this level of the recursion.
// end_blob: blob at which the forward walk stops and the state is reset.
// depth:    in/out recursion depth; 0 identifies the outermost call.
// width:    out; incremented once per examined blob while at least one split
//           is still unfound, and set to -1 if the outermost call ends with
//           a split still unfound.
// found_em: out; three flags, one per split, true once that split is
//           accounted for.
//
// NOTE(review): there is no NULL check on blob->next, so this assumes
// end_blob is reachable from blob - confirm against callers.
void account_splits_left_helper(SEAM *seam, TBLOB *blob, TBLOB *end_blob,
inT32 *depth, inT8 *width, inT8* found_em) {
if (blob != end_blob) {
// Not at the end yet: recurse on the next blob, tracking the depth so the
// outermost frame can be recognised on the way back.
(*depth)++;
account_splits_left_helper(seam, blob->next, end_blob,
depth, width, found_em);
(*depth)--;
} else {
// Deepest frame: a split that is absent (NULL) counts as already found.
found_em[0] = seam->split1 == NULL;
found_em[1] = seam->split2 == NULL;
found_em[2] = seam->split3 == NULL;
*width = 0;
}
// Look in this blob only for splits that have not been found yet.
if (!found_em[0])
found_em[0] = find_split_in_blob(seam->split1, blob);
if (!found_em[1])
found_em[1] = find_split_in_blob(seam->split2, blob);
if (!found_em[2])
found_em[2] = find_split_in_blob(seam->split3, blob);
if (!found_em[0] || !found_em[1] || !found_em[2]) {
// Something is still missing after this blob, so the search widens by one.
(*width)++;
if (*depth == 0) {
// Outermost frame and still incomplete: report failure.
*width = -1;
}
}
}
/**
* @name find_split_in_blob
*
@ -393,7 +292,7 @@ bool find_split_in_blob(SPLIT *split, TBLOB *blob) {
* Merge these two seams into a new seam. Duplicate the split records
* in both of the input seams. Return the resultant seam.
*/
SEAM *join_two_seams(SEAM *seam1, SEAM *seam2) {
SEAM *join_two_seams(const SEAM *seam1, const SEAM *seam2) {
SEAM *result = NULL;
SEAM *temp;
@ -403,52 +302,13 @@ SEAM *join_two_seams(SEAM *seam1, SEAM *seam2) {
(seam1->split2 == NULL && seam2->split3 == NULL) ||
seam1->split1 == NULL || seam2->split1 == NULL) &&
(!shared_split_points(seam1, seam2))) {
clone_seam(result, seam1);
clone_seam(temp, seam2);
result = new SEAM(*seam1);
temp = new SEAM(*seam2);
combine_seams(result, temp);
}
return (result);
}
/**
* @name new_seam
*
* Create a structure for a "seam" between two blobs. This data
* structure may actually hold up to three different splits.
* Initialization of this record is done by this routine.
*/
SEAM *new_seam(PRIORITY priority,
               const TPOINT& location,
               SPLIT *split1,
               SPLIT *split2,
               SPLIT *split3) {
  // Obtain a blank record from the seam allocator, then fill in every field.
  SEAM *record = newseam();

  // Both width counters start at zero.
  record->widthn = 0;
  record->widthp = 0;

  // Store the caller's split pointers as given (any of them may be NULL).
  record->split1 = split1;
  record->split2 = split2;
  record->split3 = split3;

  record->location = location;
  record->priority = priority;
  return record;
}
/**
* @name new_seam_list
*
* Create a collection of seam records in an array.
*/
SEAMS new_seam_list() {
  // Build the list through array_new, passing NUM_STARTING_SEAMS as its
  // argument.
  SEAMS fresh_list = array_new(NUM_STARTING_SEAMS);
  return fresh_list;
}
/**
* @name print_seam
*
@ -457,21 +317,21 @@ SEAMS new_seam_list() {
*/
void print_seam(const char *label, SEAM *seam) {
if (seam) {
cprintf(label);
cprintf(" %6.2f @ (%d,%d), p=%d, n=%d ",
tprintf(label);
tprintf(" %6.2f @ (%d,%d), p=%d, n=%d ",
seam->priority, seam->location.x, seam->location.y,
seam->widthp, seam->widthn);
print_split(seam->split1);
if (seam->split2) {
cprintf(", ");
tprintf(", ");
print_split (seam->split2);
if (seam->split3) {
cprintf(", ");
tprintf(", ");
print_split (seam->split3);
}
}
cprintf ("\n");
tprintf("\n");
}
}
@ -482,17 +342,16 @@ void print_seam(const char *label, SEAM *seam) {
* Print a list of seams. Show the coordinates of both points in
* each split.
*/
void print_seams(const char *label, SEAMS seams) {
int x;
void print_seams(const char *label, const GenericVector<SEAM*>& seams) {
char number[CHARS_PER_LINE];
if (seams) {
cprintf("%s\n", label);
array_loop(seams, x) {
if (!seams.empty()) {
tprintf("%s\n", label);
for (int x = 0; x < seams.size(); ++x) {
sprintf(number, "%2d: ", x);
print_seam(number, (SEAM *) array_value(seams, x));
print_seam(number, seams[x]);
}
cprintf("\n");
tprintf("\n");
}
}
@ -504,7 +363,7 @@ void print_seams(const char *label, SEAMS seams) {
* points in common. Return TRUE if any of the same points are present
* in any of the splits of both seams.
*/
int shared_split_points(SEAM *seam1, SEAM *seam2) {
int shared_split_points(const SEAM *seam1, const SEAM *seam2) {
if (seam1 == NULL || seam2 == NULL)
return (FALSE);
@ -532,23 +391,20 @@ int shared_split_points(SEAM *seam1, SEAM *seam2) {
* Break up the blobs in this chain so that they are all independent.
* This operation should undo the effect of join_pieces.
**********************************************************************/
void break_pieces(TBLOB *blobs, SEAMS seams, inT16 start, inT16 end) {
TESSLINE *outline = blobs->outlines;
TBLOB *next_blob;
inT16 x;
void break_pieces(const GenericVector<SEAM*>& seams, int first, int last,
TWERD *word) {
for (int x = first; x < last; ++x)
reveal_seam(seams[x]);
for (x = start; x < end; x++)
reveal_seam ((SEAM *) array_value (seams, x));
TESSLINE *outline = word->blobs[first]->outlines;
int next_blob = first + 1;
next_blob = blobs->next;
while (outline && next_blob) {
if (outline->next == next_blob->outlines) {
while (outline != NULL && next_blob <= last) {
if (outline->next == word->blobs[next_blob]->outlines) {
outline->next = NULL;
outline = next_blob->outlines;
next_blob = next_blob->next;
}
else {
outline = word->blobs[next_blob]->outlines;
++next_blob;
} else {
outline = outline->next;
}
}
@ -561,30 +417,19 @@ void break_pieces(TBLOB *blobs, SEAMS seams, inT16 start, inT16 end) {
* Join a group of base level pieces into a single blob that can then
* be classified.
**********************************************************************/
void join_pieces(TBLOB *piece_blobs, SEAMS seams, inT16 start, inT16 end) {
TBLOB *next_blob;
TBLOB *blob;
inT16 x;
TESSLINE *outline;
SEAM *seam;
for (x = 0, blob = piece_blobs; x < start; x++)
blob = blob->next;
next_blob = blob->next;
outline = blob->outlines;
void join_pieces(const GenericVector<SEAM*>& seams, int first, int last,
TWERD *word) {
TESSLINE *outline = word->blobs[first]->outlines;
if (!outline)
return;
while (x < end) {
seam = (SEAM *) array_value (seams, x);
if (x - seam->widthn >= start && x + seam->widthp < end)
for (int x = first; x < last; ++x) {
SEAM *seam = seams[x];
if (x - seam->widthn >= first && x + seam->widthp < last)
hide_seam(seam);
while (outline->next)
outline = outline->next;
outline->next = next_blob->outlines;
next_blob = next_blob->next;
x++;
outline->next = word->blobs[x + 1]->outlines;
}
}
@ -626,7 +471,7 @@ void hide_edge_pair(EDGEPT *pt1, EDGEPT *pt2) {
}
while (!exact_point (edgept, pt2) && edgept != pt1);
if (edgept == pt1) {
/* cprintf("Hid entire outline at (%d,%d)!!\n",
/* tprintf("Hid entire outline at (%d,%d)!!\n",
edgept->pos.x,edgept->pos.y); */
}
edgept = pt2;
@ -636,7 +481,7 @@ void hide_edge_pair(EDGEPT *pt1, EDGEPT *pt2) {
}
while (!exact_point (edgept, pt1) && edgept != pt2);
if (edgept == pt2) {
/* cprintf("Hid entire outline at (%d,%d)!!\n",
/* tprintf("Hid entire outline at (%d,%d)!!\n",
edgept->pos.x,edgept->pos.y); */
}
}
@ -679,7 +524,7 @@ void reveal_edge_pair(EDGEPT *pt1, EDGEPT *pt2) {
}
while (!exact_point (edgept, pt2) && edgept != pt1);
if (edgept == pt1) {
/* cprintf("Hid entire outline at (%d,%d)!!\n",
/* tprintf("Hid entire outline at (%d,%d)!!\n",
edgept->pos.x,edgept->pos.y); */
}
edgept = pt2;
@ -689,7 +534,7 @@ void reveal_edge_pair(EDGEPT *pt1, EDGEPT *pt2) {
}
while (!exact_point (edgept, pt1) && edgept != pt2);
if (edgept == pt2) {
/* cprintf("Hid entire outline at (%d,%d)!!\n",
/* tprintf("Hid entire outline at (%d,%d)!!\n",
edgept->pos.x,edgept->pos.y); */
}
}

View File

@ -30,15 +30,36 @@
----------------------------------------------------------------------*/
#include "blobs.h"
#include "split.h"
#include "tessarray.h"
/*----------------------------------------------------------------------
T y p e s
----------------------------------------------------------------------*/
typedef float PRIORITY; /* PRIORITY */
typedef struct seam_record
{ /* SEAM */
struct SEAM {
// Constructor that was formerly new_seam.
SEAM(PRIORITY priority0, const TPOINT& location0,
SPLIT *splita, SPLIT *splitb, SPLIT *splitc)
: priority(priority0), widthp(0), widthn(0), location(location0),
split1(splita), split2(splitb), split3(splitc) {}
// Copy constructor that was formerly clone_seam.
SEAM(const SEAM& src)
: priority(src.priority), widthp(src.widthp), widthn(src.widthn),
location(src.location) {
clone_split(split1, src.split1);
clone_split(split2, src.split2);
clone_split(split3, src.split3);
}
// Destructor was delete_seam.
~SEAM() {
if (split1)
delete_split(split1);
if (split2)
delete_split(split2);
if (split3)
delete_split(split3);
}
PRIORITY priority;
inT8 widthp;
inT8 widthn;
@ -46,36 +67,7 @@ typedef struct seam_record
SPLIT *split1;
SPLIT *split2;
SPLIT *split3;
} SEAM;
typedef ARRAY SEAMS; /* SEAMS */
extern SEAM *newseam();
/*----------------------------------------------------------------------
M a c r o s
----------------------------------------------------------------------*/
/**
* @name clone_seam
*
* Create a new seam record and copy the contents of this seam into it.
*/
#define clone_seam(dest,source) \
if (source) { \
(dest) = newseam (); \
(dest)->location = (source)->location; \
(dest)->widthp = (source)->widthp; \
(dest)->widthn = (source)->widthn; \
(dest)->priority = (source)->priority; \
clone_split ((dest)->split1, (source)->split1); \
clone_split ((dest)->split2, (source)->split2); \
clone_split ((dest)->split3, (source)->split3); \
} \
else { \
(dest) = (SEAM*) NULL; \
} \
};
/**
* exact_point
@ -92,61 +84,40 @@ else { \
----------------------------------------------------------------------*/
bool point_in_split(SPLIT *split, EDGEPT *point1, EDGEPT *point2);
bool point_in_seam(SEAM *seam, SPLIT *split);
bool point_in_seam(const SEAM *seam, SPLIT *split);
bool point_used_by_split(SPLIT *split, EDGEPT *point);
bool point_used_by_seam(SEAM *seam, EDGEPT *point);
SEAMS add_seam(SEAMS seam_list, SEAM *seam);
void combine_seams(SEAM *dest_seam, SEAM *source_seam);
void delete_seam(void *arg); //SEAM *seam);
void start_seam_list(TWERD *word, GenericVector<SEAM*>* seam_array);
SEAMS start_seam_list(TBLOB *blobs);
bool test_insert_seam(const GenericVector<SEAM*>& seam_array,
TWERD *word, int index);
void free_seam_list(SEAMS seam_list);
void insert_seam(const TWERD *word, int index, SEAM *seam,
GenericVector<SEAM*>* seam_array);
bool test_insert_seam(SEAMS seam_list,
int index,
TBLOB *left_blob,
TBLOB *first_blob);
SEAMS insert_seam(SEAMS seam_list,
int index,
SEAM *seam,
TBLOB *left_blob,
TBLOB *first_blob);
int account_splits_right(SEAM *seam, TBLOB *blob);
int account_splits_left(SEAM *seam, TBLOB *blob, TBLOB *end_blob);
void account_splits_left_helper(SEAM *seam, TBLOB *blob, TBLOB *end_blob,
inT32 *depth, inT8 *width, inT8 *found_em);
int account_splits(const SEAM *seam, const TWERD *word, int blob_index,
int blob_direction);
bool find_split_in_blob(SPLIT *split, TBLOB *blob);
SEAM *join_two_seams(SEAM *seam1, SEAM *seam2);
SEAM *new_seam(PRIORITY priority,
const TPOINT& location,
SPLIT *split1,
SPLIT *split2,
SPLIT *split3);
SEAMS new_seam_list();
SEAM *join_two_seams(const SEAM *seam1, const SEAM *seam2);
void print_seam(const char *label, SEAM *seam);
void print_seams(const char *label, SEAMS seams);
void print_seams(const char *label, const GenericVector<SEAM*>& seams);
int shared_split_points(SEAM *seam1, SEAM *seam2);
int shared_split_points(const SEAM *seam1, const SEAM *seam2);
void break_pieces(TBLOB *blobs, SEAMS seams, inT16 start, inT16 end);
void break_pieces(const GenericVector<SEAM*>& seams,
int first, int last, TWERD *word);
void join_pieces(TBLOB *piece_blobs, SEAMS seams, inT16 start, inT16 end);
void join_pieces(const GenericVector<SEAM*>& seams,
int first, int last, TWERD *word);
void hide_seam(SEAM *seam);

View File

@ -26,8 +26,8 @@
I n c l u d e s
----------------------------------------------------------------------*/
#include "split.h"
#include "structures.h"
#include "callcpp.h"
#include "coutln.h"
#include "tprintf.h"
#ifdef __UNIX__
#include <assert.h>
@ -38,8 +38,6 @@
----------------------------------------------------------------------*/
BOOL_VAR(wordrec_display_splits, 0, "Display splits");
makestructure(newsplit, free_split, SPLIT);
/*----------------------------------------------------------------------
F u n c t i o n s
----------------------------------------------------------------------*/
@ -47,12 +45,11 @@ makestructure(newsplit, free_split, SPLIT);
/**********************************************************************
* delete_split
*
* Remove this split from existance. Take if off the display list and
* deallocate its memory.
* Remove this split from existence.
**********************************************************************/
void delete_split(SPLIT *split) {
if (split) {
free_split(split);
delete split;
}
}
@ -68,6 +65,43 @@ EDGEPT *make_edgept(int x, int y, EDGEPT *next, EDGEPT *prev) {
this_edgept = new EDGEPT;
this_edgept->pos.x = x;
this_edgept->pos.y = y;
// Now deal with the src_outline steps.
C_OUTLINE* prev_ol = prev->src_outline;
if (prev_ol != NULL && prev->next == next) {
// Compute the fraction of the segment that is being cut.
FCOORD segment_vec(next->pos.x - prev->pos.x, next->pos.y - prev->pos.y);
FCOORD target_vec(x - prev->pos.x, y - prev->pos.y);
double cut_fraction = target_vec.length() / segment_vec.length();
// Get the start and end at the step level.
ICOORD step_start = prev_ol->position_at_index(prev->start_step);
int end_step = prev->start_step + prev->step_count;
int step_length = prev_ol->pathlength();
ICOORD step_end = prev_ol->position_at_index(end_step % step_length);
ICOORD step_vec = step_end - step_start;
double target_length = step_vec.length() * cut_fraction;
// Find the point on the segment that gives the length nearest to target.
int best_step = prev->start_step;
ICOORD total_step(0, 0);
double best_dist = target_length;
for (int s = prev->start_step; s < end_step; ++s) {
total_step += prev_ol->step(s % step_length);
double dist = fabs(target_length - total_step.length());
if (dist < best_dist) {
best_dist = dist;
best_step = s + 1;
}
}
// The new point is an intermediate point.
this_edgept->src_outline = prev_ol;
this_edgept->step_count = end_step - best_step;
this_edgept->start_step = best_step % step_length;
prev->step_count = best_step - prev->start_step;
} else {
// The new point is poly only.
this_edgept->src_outline = NULL;
this_edgept->step_count = 0;
this_edgept->start_step = 0;
}
/* Hook it up */
this_edgept->next = next;
this_edgept->prev = prev;
@ -78,8 +112,7 @@ EDGEPT *make_edgept(int x, int y, EDGEPT *next, EDGEPT *prev) {
this_edgept->vec.y = this_edgept->next->pos.y - y;
this_edgept->prev->vec.x = x - this_edgept->prev->pos.x;
this_edgept->prev->vec.y = y - this_edgept->prev->pos.y;
return (this_edgept);
return this_edgept;
}
/**********************************************************************
@ -90,6 +123,10 @@ EDGEPT *make_edgept(int x, int y, EDGEPT *next, EDGEPT *prev) {
void remove_edgept(EDGEPT *point) {
EDGEPT *prev = point->prev;
EDGEPT *next = point->next;
// Add point's steps onto prev's steps if they are from the same outline.
if (prev->src_outline == point->src_outline && prev->src_outline != NULL) {
prev->step_count += point->step_count;
}
prev->next = next;
next->prev = prev;
prev->vec.x = next->pos.x - prev->pos.x;
@ -104,8 +141,7 @@ void remove_edgept(EDGEPT *point) {
* list.
**********************************************************************/
SPLIT *new_split(EDGEPT *point1, EDGEPT *point2) {
SPLIT *s;
s = (SPLIT *) newsplit ();
SPLIT *s = new SPLIT;
s->point1 = point1;
s->point2 = point2;
return (s);
@ -120,9 +156,9 @@ SPLIT *new_split(EDGEPT *point1, EDGEPT *point2) {
**********************************************************************/
void print_split(SPLIT *split) {
if (split) {
cprintf ("(%d,%d)--(%d,%d)",
split->point1->pos.x, split->point1->pos.y,
split->point2->pos.x, split->point2->pos.y);
tprintf("(%d,%d)--(%d,%d)",
split->point1->pos.x, split->point1->pos.y,
split->point2->pos.x, split->point2->pos.y);
}
}
@ -130,23 +166,35 @@ void print_split(SPLIT *split) {
/**********************************************************************
* split_outline
*
* Split between these two edge points. Apply a split and return a
* pointer to the other side of the split.
* Split between these two edge points.
**********************************************************************/
void split_outline(EDGEPT *join_point1, EDGEPT *join_point2) {
EDGEPT *join_point1a;
EDGEPT *temp2;
EDGEPT *temp1;
assert(join_point1 != join_point2);
assert (join_point1 != join_point2);
temp2 = join_point2->next;
temp1 = join_point1->next;
EDGEPT* temp2 = join_point2->next;
EDGEPT* temp1 = join_point1->next;
/* Create two new points */
join_point1a = make_edgept (join_point1->pos.x,
join_point1->pos.y, temp1, join_point2);
make_edgept (join_point2->pos.x, join_point2->pos.y, temp2, join_point1);
EDGEPT* new_point1 = make_edgept(join_point1->pos.x, join_point1->pos.y,
temp1, join_point2);
EDGEPT* new_point2 = make_edgept(join_point2->pos.x, join_point2->pos.y,
temp2, join_point1);
// Join_point1 and 2 are now cross-over points, so they must have NULL
// src_outlines and give their src_outline information to their new
// replacements.
new_point1->src_outline = join_point1->src_outline;
new_point1->start_step = join_point1->start_step;
new_point1->step_count = join_point1->step_count;
new_point2->src_outline = join_point2->src_outline;
new_point2->start_step = join_point2->start_step;
new_point2->step_count = join_point2->step_count;
join_point1->src_outline = NULL;
join_point1->start_step = 0;
join_point1->step_count = 0;
join_point2->src_outline = NULL;
join_point2->start_step = 0;
join_point2->step_count = 0;
join_point1->MarkChop();
join_point2->MarkChop();
}
@ -164,8 +212,18 @@ void unsplit_outlines(EDGEPT *p1, EDGEPT *p2) {
tmp1->next->prev = p2;
tmp2->next->prev = p1;
// tmp2 is coincident with p1. p1 takes tmp2's place as tmp2 is deleted.
p1->next = tmp2->next;
p1->src_outline = tmp2->src_outline;
p1->start_step = tmp2->start_step;
p1->step_count = tmp2->step_count;
// Likewise p2 takes tmp1's place.
p2->next = tmp1->next;
p2->src_outline = tmp1->src_outline;
p2->start_step = tmp1->start_step;
p2->step_count = tmp1->step_count;
p1->UnmarkChop();
p2->UnmarkChop();
delete tmp1;
delete tmp2;

View File

@ -42,8 +42,7 @@ class EDGEPT;
#define point_diff(p,p1,p2) \
((p).x = (p1).x - (p2).x, \
(p).y = (p1).y - (p2).y, \
(p))
(p).y = (p1).y - (p2).y)
/**********************************************************************
* CROSS

View File

@ -465,7 +465,7 @@ WERD* WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs,
TBOX a_blob_box = a_blob->bounding_box();
if ((not_found_box.major_overlap(a_blob_box) ||
a_blob_box.major_overlap(not_found_box)) &&
not_found_box.y_overlap(a_blob_box)) {
not_found_box.y_overlap(a_blob_box) > 0.8) {
// Already taken care of.
delete not_found_it.extract();
break;

View File

@ -10,18 +10,16 @@ AM_CXXFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden
AM_CPPFLAGS += -DTESS_EXPORTS
endif
EXTRA_DIST = mfcpch.cpp
include_HEADERS = \
basedir.h errcode.h fileerr.h genericvector.h helpers.h host.h memry.h \
ndminx.h params.h ocrclass.h platform.h serialis.h strngs.h \
tesscallback.h unichar.h unicharmap.h unicharset.h
noinst_HEADERS = \
ambigs.h bits16.h bitvector.h ccutil.h clst.h elst2.h \
elst.h globaloc.h hashfn.h indexmapbidi.h lsterr.h \
nwmain.h qrsequence.h secname.h sorthelper.h stderr.h tessdatamanager.h \
tprintf.h unicity_table.h unicodes.h
ambigs.h bits16.h bitvector.h ccutil.h clst.h doubleptr.h elst2.h \
elst.h genericheap.h globaloc.h hashfn.h indexmapbidi.h kdpair.h lsterr.h \
nwmain.h object_cache.h qrsequence.h secname.h sorthelper.h stderr.h tessdatamanager.h \
tprintf.h unicity_table.h unicodes.h universalambigs.h
if !USING_MULTIPLELIBS
noinst_LTLIBRARIES = libtesseract_ccutil.la
@ -39,7 +37,7 @@ libtesseract_ccutil_la_SOURCES = \
serialis.cpp strngs.cpp \
tessdatamanager.cpp tprintf.cpp \
unichar.cpp unicharmap.cpp unicharset.cpp unicodes.cpp \
params.cpp
params.cpp universalambigs.cpp
if EMBEDDED
include_HEADERS += scanutils.h
@ -50,4 +48,4 @@ if MINGW
AM_CPPFLAGS += -I$(top_srcdir)/vs2008/port -DWINDLLNAME=\"lib@GENERIC_LIBRARY_NAME@\"
noinst_HEADERS += ../vs2008/port/strtok_r.h
libtesseract_ccutil_la_SOURCES += ../vs2008/port/strtok_r.cpp
endif
endif

View File

@ -19,7 +19,10 @@
///////////////////////////////////////////////////////////////////////
#include "ambigs.h"
#include <stdio.h>
#include "helpers.h"
#include "universalambigs.h"
#ifdef _WIN32
#ifndef __GNUC__
@ -31,6 +34,11 @@
namespace tesseract {
// Maximum line size:
// 10 for sizes of ambigs, tabs, ambig type and newline
// UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig
const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1);
AmbigSpec::AmbigSpec() {
wrong_ngram[0] = INVALID_UNICHAR_ID;
correct_fragments[0] = INVALID_UNICHAR_ID;
@ -41,14 +49,10 @@ AmbigSpec::AmbigSpec() {
ELISTIZE(AmbigSpec);
void UnicharAmbigs::LoadUnicharAmbigs(FILE *AmbigFile,
inT64 end_offset,
int debug_level,
bool use_ambigs_for_adaption,
UNICHARSET *unicharset) {
int i, j;
UnicharIdVector *adaption_ambigs_entry;
for (i = 0; i < unicharset->size(); ++i) {
// Initializes the ambigs by adding a NULL pointer to each table.
void UnicharAmbigs::InitUnicharAmbigs(const UNICHARSET& unicharset,
bool use_ambigs_for_adaption) {
for (int i = 0; i < unicharset.size(); ++i) {
replace_ambigs_.push_back(NULL);
dang_ambigs_.push_back(NULL);
one_to_one_definite_ambigs_.push_back(NULL);
@ -57,85 +61,103 @@ void UnicharAmbigs::LoadUnicharAmbigs(FILE *AmbigFile,
reverse_ambigs_for_adaption_.push_back(NULL);
}
}
}
// Loads the universal ambigs that are useful for any language.
void UnicharAmbigs::LoadUniversal(const UNICHARSET& encoder_set,
                                  UNICHARSET* unicharset) {
  // Open the kUniversalAmbigsFile buffer as a stream so it can go through
  // the same loader as an ambigs file. fmemopen takes a non-const buffer,
  // hence the const_cast; the stream is opened in "rb" mode.
  FILE* universal_stream = fmemopen(const_cast<char*>(kUniversalAmbigsFile),
                                    ksizeofUniversalAmbigsFile, "rb");
  if (universal_stream != NULL) {
    // A negative end_offset means "read until fgets fails"; debug_level is 0
    // and use_ambigs_for_adaption is false.
    LoadUnicharAmbigs(encoder_set, universal_stream, -1ll, 0, false,
                      unicharset);
    fclose(universal_stream);
  }
  // If the stream could not be opened there is nothing to load.
}
void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET& encoder_set,
FILE *ambig_file,
inT64 end_offset,
int debug_level,
bool use_ambigs_for_adaption,
UNICHARSET *unicharset) {
int i, j;
UnicharIdVector *adaption_ambigs_entry;
if (debug_level) tprintf("Reading ambiguities\n");
int TestAmbigPartSize;
int ReplacementAmbigPartSize;
// Maximum line size:
// 10 for sizes of ambigs, tabs, ambig type and newline
// UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig
int test_ambig_part_size;
int replacement_ambig_part_size;
// The space for buffer is allocated on the heap to avoid
// GCC frame size warning.
const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1);
const int kBufferSize = 10 + 2 * kMaxAmbigStringSize;
char *buffer = new char[kBufferSize];
char ReplacementString[kMaxAmbigStringSize];
UNICHAR_ID TestUnicharIds[MAX_AMBIG_SIZE + 1];
char replacement_string[kMaxAmbigStringSize];
UNICHAR_ID test_unichar_ids[MAX_AMBIG_SIZE + 1];
int line_num = 0;
int type = NOT_AMBIG;
// Determine the version of the ambigs file.
int version = 0;
ASSERT_HOST(fgets(buffer, kBufferSize, AmbigFile) != NULL &&
ASSERT_HOST(fgets(buffer, kBufferSize, ambig_file) != NULL &&
strlen(buffer) > 0);
if (*buffer == 'v') {
version = static_cast<int>(strtol(buffer+1, NULL, 10));
++line_num;
} else {
rewind(AmbigFile);
rewind(ambig_file);
}
while ((end_offset < 0 || ftell(AmbigFile) < end_offset) &&
fgets(buffer, kBufferSize, AmbigFile) != NULL) {
while ((end_offset < 0 || ftell(ambig_file) < end_offset) &&
fgets(buffer, kBufferSize, ambig_file) != NULL) {
chomp_string(buffer);
if (debug_level > 2) tprintf("read line %s\n", buffer);
++line_num;
if (!ParseAmbiguityLine(line_num, version, debug_level, *unicharset,
buffer, &TestAmbigPartSize, TestUnicharIds,
&ReplacementAmbigPartSize,
ReplacementString, &type)) continue;
if (!ParseAmbiguityLine(line_num, version, debug_level, encoder_set,
buffer, &test_ambig_part_size, test_unichar_ids,
&replacement_ambig_part_size,
replacement_string, &type)) continue;
// Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST.
AmbigSpec *ambig_spec = new AmbigSpec();
InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_ : dang_ambigs_,
TestAmbigPartSize, TestUnicharIds,
ReplacementAmbigPartSize, ReplacementString, type,
ambig_spec, unicharset);
if (!InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_
: dang_ambigs_,
test_ambig_part_size, test_unichar_ids,
replacement_ambig_part_size, replacement_string, type,
ambig_spec, unicharset))
continue;
// Update one_to_one_definite_ambigs_.
if (TestAmbigPartSize == 1 &&
ReplacementAmbigPartSize == 1 && type == DEFINITE_AMBIG) {
if (one_to_one_definite_ambigs_[TestUnicharIds[0]] == NULL) {
one_to_one_definite_ambigs_[TestUnicharIds[0]] = new UnicharIdVector();
if (test_ambig_part_size == 1 &&
replacement_ambig_part_size == 1 && type == DEFINITE_AMBIG) {
if (one_to_one_definite_ambigs_[test_unichar_ids[0]] == NULL) {
one_to_one_definite_ambigs_[test_unichar_ids[0]] = new UnicharIdVector();
}
one_to_one_definite_ambigs_[TestUnicharIds[0]]->push_back(
one_to_one_definite_ambigs_[test_unichar_ids[0]]->push_back(
ambig_spec->correct_ngram_id);
}
// Update ambigs_for_adaption_.
if (use_ambigs_for_adaption) {
for (i = 0; i < TestAmbigPartSize; ++i) {
if (ambigs_for_adaption_[TestUnicharIds[i]] == NULL) {
ambigs_for_adaption_[TestUnicharIds[i]] = new UnicharIdVector();
}
adaption_ambigs_entry = ambigs_for_adaption_[TestUnicharIds[i]];
const char *tmp_ptr = ReplacementString;
const char *tmp_ptr_end = ReplacementString + strlen(ReplacementString);
int step = unicharset->step(tmp_ptr);
while (step > 0) {
UNICHAR_ID id_to_insert = unicharset->unichar_to_id(tmp_ptr, step);
ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID);
// Add the new unichar id to adaption_ambigs_entry (only if the
// vector does not already contain it) keeping it in sorted order.
for (j = 0; j < adaption_ambigs_entry->size() &&
(*adaption_ambigs_entry)[j] > id_to_insert; ++j);
if (j < adaption_ambigs_entry->size()) {
if ((*adaption_ambigs_entry)[j] != id_to_insert) {
adaption_ambigs_entry->insert(id_to_insert, j);
}
} else {
adaption_ambigs_entry->push_back(id_to_insert);
GenericVector<UNICHAR_ID> encoding;
// Silently ignore invalid strings, as before, so it is safe to use a
// universal ambigs file.
if (unicharset->encode_string(replacement_string, true, &encoding,
NULL, NULL)) {
for (i = 0; i < test_ambig_part_size; ++i) {
if (ambigs_for_adaption_[test_unichar_ids[i]] == NULL) {
ambigs_for_adaption_[test_unichar_ids[i]] = new UnicharIdVector();
}
adaption_ambigs_entry = ambigs_for_adaption_[test_unichar_ids[i]];
for (int r = 0; r < encoding.size(); ++r) {
UNICHAR_ID id_to_insert = encoding[r];
ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID);
// Add the new unichar id to adaption_ambigs_entry (only if the
// vector does not already contain it) keeping it in sorted order.
for (j = 0; j < adaption_ambigs_entry->size() &&
(*adaption_ambigs_entry)[j] > id_to_insert; ++j);
if (j < adaption_ambigs_entry->size()) {
if ((*adaption_ambigs_entry)[j] != id_to_insert) {
adaption_ambigs_entry->insert(id_to_insert, j);
}
} else {
adaption_ambigs_entry->push_back(id_to_insert);
}
}
// Update tmp_ptr and step.
tmp_ptr += step;
step = tmp_ptr < tmp_ptr_end ? unicharset->step(tmp_ptr) : 0;
}
}
}
@ -204,51 +226,96 @@ void UnicharAmbigs::LoadUnicharAmbigs(FILE *AmbigFile,
bool UnicharAmbigs::ParseAmbiguityLine(
int line_num, int version, int debug_level, const UNICHARSET &unicharset,
char *buffer, int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
int *ReplacementAmbigPartSize, char *ReplacementString, int *type) {
char *buffer, int *test_ambig_part_size, UNICHAR_ID *test_unichar_ids,
int *replacement_ambig_part_size, char *replacement_string, int *type) {
if (version > 1) {
// Simpler format is just wrong-string correct-string type\n.
STRING input(buffer);
GenericVector<STRING> fields;
input.split(' ', &fields);
if (fields.size() != 3) {
if (debug_level) tprintf(kIllegalMsg, line_num);
return false;
}
// Encode wrong-string.
GenericVector<UNICHAR_ID> unichars;
if (!unicharset.encode_string(fields[0].string(), true, &unichars, NULL,
NULL)) {
return false;
}
*test_ambig_part_size = unichars.size();
if (*test_ambig_part_size > MAX_AMBIG_SIZE) {
if (debug_level)
tprintf("Too many unichars in ambiguity on line %d\n", line_num);
return false;
}
// Copy encoded string to output.
for (int i = 0; i < unichars.size(); ++i)
test_unichar_ids[i] = unichars[i];
test_unichar_ids[unichars.size()] = INVALID_UNICHAR_ID;
// Encode replacement-string to check validity.
if (!unicharset.encode_string(fields[1].string(), true, &unichars, NULL,
NULL)) {
return false;
}
*replacement_ambig_part_size = unichars.size();
if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) {
if (debug_level)
tprintf("Too many unichars in ambiguity on line %d\n", line_num);
return false;
}
if (sscanf(fields[2].string(), "%d", type) != 1) {
if (debug_level) tprintf(kIllegalMsg, line_num);
return false;
}
snprintf(replacement_string, kMaxAmbigStringSize, "%s", fields[1].string());
return true;
}
int i;
char *token;
char *next_token;
if (!(token = strtok_r(buffer, kAmbigDelimiters, &next_token)) ||
!sscanf(token, "%d", TestAmbigPartSize) || TestAmbigPartSize <= 0) {
!sscanf(token, "%d", test_ambig_part_size) || test_ambig_part_size <= 0) {
if (debug_level) tprintf(kIllegalMsg, line_num);
return false;
}
if (*TestAmbigPartSize > MAX_AMBIG_SIZE) {
tprintf("Too many unichars in ambiguity on line %d\n");
if (*test_ambig_part_size > MAX_AMBIG_SIZE) {
if (debug_level)
tprintf("Too many unichars in ambiguity on line %d\n", line_num);
return false;
}
for (i = 0; i < *TestAmbigPartSize; ++i) {
for (i = 0; i < *test_ambig_part_size; ++i) {
if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break;
if (!unicharset.contains_unichar(token)) {
if (debug_level) tprintf(kIllegalUnicharMsg, token);
break;
}
TestUnicharIds[i] = unicharset.unichar_to_id(token);
test_unichar_ids[i] = unicharset.unichar_to_id(token);
}
TestUnicharIds[i] = INVALID_UNICHAR_ID;
test_unichar_ids[i] = INVALID_UNICHAR_ID;
if (i != *TestAmbigPartSize ||
if (i != *test_ambig_part_size ||
!(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) ||
!sscanf(token, "%d", ReplacementAmbigPartSize) ||
*ReplacementAmbigPartSize <= 0) {
!sscanf(token, "%d", replacement_ambig_part_size) ||
*replacement_ambig_part_size <= 0) {
if (debug_level) tprintf(kIllegalMsg, line_num);
return false;
}
if (*ReplacementAmbigPartSize > MAX_AMBIG_SIZE) {
tprintf("Too many unichars in ambiguity on line %d\n");
if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) {
if (debug_level)
tprintf("Too many unichars in ambiguity on line %d\n", line_num);
return false;
}
ReplacementString[0] = '\0';
for (i = 0; i < *ReplacementAmbigPartSize; ++i) {
replacement_string[0] = '\0';
for (i = 0; i < *replacement_ambig_part_size; ++i) {
if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break;
strcat(ReplacementString, token);
strcat(replacement_string, token);
if (!unicharset.contains_unichar(token)) {
if (debug_level) tprintf(kIllegalUnicharMsg, token);
break;
}
}
if (i != *ReplacementAmbigPartSize) {
if (i != *replacement_ambig_part_size) {
if (debug_level) tprintf(kIllegalMsg, line_num);
return false;
}
@ -271,20 +338,20 @@ bool UnicharAmbigs::ParseAmbiguityLine(
return true;
}
void UnicharAmbigs::InsertIntoTable(
UnicharAmbigsVector &table, int TestAmbigPartSize,
UNICHAR_ID *TestUnicharIds, int ReplacementAmbigPartSize,
const char *ReplacementString, int type,
bool UnicharAmbigs::InsertIntoTable(
UnicharAmbigsVector &table, int test_ambig_part_size,
UNICHAR_ID *test_unichar_ids, int replacement_ambig_part_size,
const char *replacement_string, int type,
AmbigSpec *ambig_spec, UNICHARSET *unicharset) {
ambig_spec->type = static_cast<AmbigType>(type);
if (TestAmbigPartSize == 1 && ReplacementAmbigPartSize == 1 &&
unicharset->to_lower(TestUnicharIds[0]) ==
unicharset->to_lower(unicharset->unichar_to_id(ReplacementString))) {
if (test_ambig_part_size == 1 && replacement_ambig_part_size == 1 &&
unicharset->to_lower(test_unichar_ids[0]) ==
unicharset->to_lower(unicharset->unichar_to_id(replacement_string))) {
ambig_spec->type = CASE_AMBIG;
}
ambig_spec->wrong_ngram_size =
UnicharIdArrayUtils::copy(TestUnicharIds, ambig_spec->wrong_ngram);
UnicharIdArrayUtils::copy(test_unichar_ids, ambig_spec->wrong_ngram);
// Since we need to maintain a constant number of unichar positions in
// order to construct ambig_blob_choices vector in NoDangerousAmbig(), for
@ -297,21 +364,21 @@ void UnicharAmbigs::InsertIntoTable(
// Insert the corresponding correct ngram into the unicharset.
// Unicharset code assumes that the "base" ngram is inserted into
// the unicharset before fragments of this ngram are inserted.
unicharset->unichar_insert(ReplacementString);
unicharset->unichar_insert(replacement_string);
ambig_spec->correct_ngram_id =
unicharset->unichar_to_id(ReplacementString);
if (ReplacementAmbigPartSize > 1) {
unicharset->unichar_to_id(replacement_string);
if (replacement_ambig_part_size > 1) {
unicharset->set_isngram(ambig_spec->correct_ngram_id, true);
}
// Add the corresponding fragments of the wrong ngram to unicharset.
int i;
for (i = 0; i < TestAmbigPartSize; ++i) {
for (i = 0; i < test_ambig_part_size; ++i) {
UNICHAR_ID unichar_id;
if (TestAmbigPartSize == 1) {
if (test_ambig_part_size == 1) {
unichar_id = ambig_spec->correct_ngram_id;
} else {
STRING frag_str = CHAR_FRAGMENT::to_string(
ReplacementString, i, TestAmbigPartSize, false);
replacement_string, i, test_ambig_part_size, false);
unicharset->unichar_insert(frag_str.string());
unichar_id = unicharset->unichar_to_id(frag_str.string());
}
@ -321,11 +388,14 @@ void UnicharAmbigs::InsertIntoTable(
// Add AmbigSpec for this ambiguity to the corresponding AmbigSpec_LIST.
// Keep AmbigSpec_LISTs sorted by AmbigSpec.wrong_ngram.
if (table[TestUnicharIds[0]] == NULL) {
table[TestUnicharIds[0]] = new AmbigSpec_LIST();
if (table[test_unichar_ids[0]] == NULL) {
table[test_unichar_ids[0]] = new AmbigSpec_LIST();
}
table[TestUnicharIds[0]]->add_sorted(
AmbigSpec::compare_ambig_specs, false, ambig_spec);
if (table[test_unichar_ids[0]]->add_sorted(
AmbigSpec::compare_ambig_specs, true, ambig_spec))
return true;
delete ambig_spec;
return false;
}
} // namespace tesseract

View File

@ -123,7 +123,10 @@ class AmbigSpec : public ELIST_LINK {
*reinterpret_cast<const AmbigSpec * const *>(spec1);
const AmbigSpec *s2 =
*reinterpret_cast<const AmbigSpec * const *>(spec2);
return UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram);
int result = UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram);
if (result != 0) return result;
return UnicharIdArrayUtils::compare(s1->correct_fragments,
s2->correct_fragments);
}
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
@ -150,6 +153,13 @@ class UnicharAmbigs {
const UnicharAmbigsVector &dang_ambigs() const { return dang_ambigs_; }
const UnicharAmbigsVector &replace_ambigs() const { return replace_ambigs_; }
// Initializes the ambigs by adding a NULL pointer to each table.
void InitUnicharAmbigs(const UNICHARSET& unicharset,
bool use_ambigs_for_adaption);
// Loads the universal ambigs that are useful for any language.
void LoadUniversal(const UNICHARSET& encoder_set, UNICHARSET* unicharset);
// Fills in two ambiguity tables (replaceable and dangerous) with information
// read from the ambigs file. An ambiguity table is an array of lists.
// The array is indexed by a class id. Each entry in the table provides
@ -160,7 +170,10 @@ class UnicharAmbigs {
// one_to_one_definite_ambigs_. This vector is also indexed by the class id
// of the wrong part of the ambiguity and each entry contains a vector of
// unichar ids that are ambiguous to it.
void LoadUnicharAmbigs(FILE *ambigs_file, inT64 end_offset, int debug_level,
// encoder_set is used to encode the ambiguity strings, undisturbed by new
// unichar_ids that may be created by adding the ambigs.
void LoadUnicharAmbigs(const UNICHARSET& encoder_set,
FILE *ambigs_file, inT64 end_offset, int debug_level,
bool use_ambigs_for_adaption, UNICHARSET *unicharset);
// Returns definite 1-1 ambigs for the given unichar id.
@ -191,17 +204,18 @@ class UnicharAmbigs {
}
private:
bool ParseAmbiguityLine(int line_num, int version, int debug_level,
const UNICHARSET &unicharset, char *buffer,
int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
int *ReplacementAmbigPartSize,
char *ReplacementString, int *type);
void InsertIntoTable(UnicharAmbigsVector &table,
int TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
int ReplacementAmbigPartSize,
const char *ReplacementString, int type,
int *test_ambig_part_size,
UNICHAR_ID *test_unichar_ids,
int *replacement_ambig_part_size,
char *replacement_string, int *type);
bool InsertIntoTable(UnicharAmbigsVector &table,
int test_ambig_part_size, UNICHAR_ID *test_unichar_ids,
int replacement_ambig_part_size,
const char *replacement_string, int type,
AmbigSpec *ambig_spec, UNICHARSET *unicharset);
UnicharAmbigsVector dang_ambigs_;
UnicharAmbigsVector replace_ambigs_;
GenericVector<UnicharIdVector *> one_to_one_definite_ambigs_;

93
ccutil/doubleptr.h Normal file
View File

@ -0,0 +1,93 @@
// Copyright 2012 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
///////////////////////////////////////////////////////////////////////
// File: doubleptr.h
// Description: Double-ended pointer that keeps pointing correctly even
// when reallocated or copied.
// Author: Ray Smith
// Created: Wed Mar 14 12:22:57 PDT 2012
//
// (C) Copyright 2012, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCUTIL_DOUBLEPTR_H_
#define TESSERACT_CCUTIL_DOUBLEPTR_H_
#include "errcode.h"
namespace tesseract {
// A smart pointer class that implements a double-ended pointer. Each end
// points to the other end. The copy constructor and operator= have MOVE
// semantics, meaning that the relationship with the other end moves to the
// destination of the copy, leaving the source unattached.
// For this reason both the copy constructor and the operator= take a non-const
// reference argument, and the const reference versions cannot be used.
// DoublePtr is useful to incorporate into structures that are part of a
// collection such as GenericVector or STL containers, where reallocs can
// relocate the members. DoublePtr is also useful in a GenericHeap, where it
// can correctly maintain the pointer to an element of the heap despite it
// getting moved around on the heap.
class DoublePtr {
 public:
  // Creates an unattached end: OtherEnd() returns NULL until Connect() is
  // called or a link is moved in by copy/assignment.
  // NOTE: this class declares no destructor, so an end that goes away while
  // still attached leaves its partner pointing at it. Disconnect() (or move
  // the link elsewhere) before destroying an attached end.
  DoublePtr() : other_end_(NULL) {}

  // Copy constructor steals the partner off src and is therefore a non
  // const reference arg. After the call src is unattached and the partner
  // (if any) points at the newly constructed object.
  // Copying a const DoublePtr generates a compiler error.
  DoublePtr(DoublePtr& src) : other_end_(src.other_end_) {
    if (other_end_ != NULL) {
      other_end_->other_end_ = this;
      src.other_end_ = NULL;
    }
  }

  // Operator= steals the partner off src, and therefore needs src to be a non-
  // const reference. Any link this object already had is dropped first.
  // Assigning from a const DoublePtr generates a compiler error.
  void operator=(DoublePtr& src) {
    // Self-assignment must leave the link untouched. Without this guard the
    // Disconnect() below would clear other_end_ before it is read back from
    // src (the same object), silently breaking the link at both ends.
    if (&src == this) return;
    Disconnect();
    other_end_ = src.other_end_;
    if (other_end_ != NULL) {
      other_end_->other_end_ = this;
      src.other_end_ = NULL;
    }
  }

  // Connects this and other, discarding any existing connections that either
  // of them had. other must not be NULL.
  void Connect(DoublePtr* other) {
    other->Disconnect();
    Disconnect();
    other->other_end_ = this;
    other_end_ = other;
  }

  // Disconnects this and other, making OtherEnd() return NULL for both.
  // Does nothing if this end is already unattached.
  void Disconnect() {
    if (other_end_ != NULL) {
      other_end_->other_end_ = NULL;
      other_end_ = NULL;
    }
  }

  // Returns the pointer to the other end of the double pointer, or NULL if
  // this end is unattached.
  DoublePtr* OtherEnd() const {
    return other_end_;
  }

 private:
  // Pointer to the other end of the link. It is always true that either
  // other_end_ == NULL or other_end_->other_end_ == this.
  DoublePtr* other_end_;
};
} // namespace tesseract.
#endif // TESSERACT_CCUTIL_DOUBLEPTR_H_

View File

@ -90,12 +90,6 @@ const ERRCODE ASSERT_FAILED = "Assert failed";
void signal_exit( //
int signal_code //Signal which
);
extern "C"
{
void err_exit();
//The real signal
void signal_termination_handler(int sig);
};
void set_global_loc_code(int loc_code);

225
ccutil/genericheap.h Normal file
View File

@ -0,0 +1,225 @@
// Copyright 2012 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
///////////////////////////////////////////////////////////////////////
// File: genericheap.h
// Description: Template heap class.
// Author: Ray Smith, based on Dan Johnson's original code.
// Created: Wed Mar 14 08:13:00 PDT 2012
//
// (C) Copyright 2012, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "errcode.h"
#include "genericvector.h"
#ifndef TESSERACT_CCUTIL_GENERICHEAP_H_
#define TESSERACT_CCUTIL_GENERICHEAP_H_
namespace tesseract {
// GenericHeap requires 1 template argument:
// Pair will normally be either KDPairInc<Key, Data> or KDPairDec<Key, Data>
// for some arbitrary Key and scalar, smart pointer, or non-ownership pointer
// Data type, according to whether a MIN heap or a MAX heap is desired,
// respectively. Using KDPtrPairInc<Key, Data> or KDPtrPairDec<Key, Data>,
// GenericHeap can also handle simple Data pointers and own them.
// If no additional data is required, Pair can also be a scalar, since
// GenericHeap doesn't look inside it except for operator<.
//
// The heap is stored as a packed binary tree in an array hosted by a
// GenericVector<Pair>, with the invariant that the children of each node are
// both NOT Pair::operator< the parent node. KDPairInc defines Pair::operator<
// to use Key::operator< to generate a MIN heap and KDPairDec defines
// Pair::operator< to use Key::operator> to generate a MAX heap by reversing
// all the comparisons.
// See http://en.wikipedia.org/wiki/Heap_(data_structure) for more detail on
// the basic heap implementation.
//
// Insertion and removal are both O(log n) and, unlike the STL heap, an
// explicit Reshuffle function allows a node to be repositioned in time O(log n)
// after changing its value.
//
// Accessing the element for revaluation is a more complex matter, since the
// index and pointer can be changed arbitrarily by heap operations.
// Revaluation can be done by making the Data type in the Pair derived from or
// contain a DoublePtr as its first data element, making it possible to convert
// the pointer to a Pair using KDPairInc::RecastDataPointer.
template <typename Pair>
class GenericHeap {
 public:
  GenericHeap() {}
  // initial_size is only forwarded to GenericVector::reserve. It is not a
  // limit on the heap size; callers wanting a limit must enforce it themselves.
  explicit GenericHeap(int initial_size) {
    heap_.reserve(initial_size);
  }

  // Simple accessors.
  bool empty() const { return heap_.empty(); }
  int size() const { return heap_.size(); }
  int size_reserved() const { return heap_.size_reserved(); }

  // Empties the heap by truncating to 0, which keeps the reserved storage
  // intact.
  void clear() { heap_.truncate(0); }

  // Provides access to the underlying vector.
  // Caution! any changes that modify the keys will invalidate the heap!
  GenericVector<Pair>* heap() { return &heap_; }

  // Adds *entry to the heap, keeping the smallest item (by operator<) at the
  // top. *entry is the source of an operator=, but is non-const so that a
  // smart pointer may live inside it.
  // Time = O(log n).
  void Push(Pair* entry) {
    // The new slot appended to heap_ acts as a hole that is sifted up to the
    // right place for *entry. The copy into the vector and back out again
    // avoids needing a default constructor for primitive types and allows a
    // DoublePtr to be held somewhere in the Pair.
    const int appended_index = heap_.size();
    heap_.push_back(*entry);
    *entry = heap_.back();
    const int dest_index = SiftUp(appended_index, *entry);
    heap_[dest_index] = *entry;
  }

  // Returns the top (smallest by operator<) element without removing it.
  const Pair& PeekTop() const { return heap_[0]; }

  // Removes the top element of the heap. If entry is not NULL the element is
  // copied into *entry, otherwise it is discarded.
  // Returns false if the heap was already empty.
  // Time = O(log n).
  bool Pop(Pair* entry) {
    const int remaining = heap_.size() - 1;
    if (remaining < 0)
      return false;  // Already empty.
    if (entry != NULL)
      *entry = heap_[0];
    if (remaining == 0) {
      heap_.truncate(remaining);
      return true;
    }
    // Take the last element off the end and sift the hole left at the root
    // downwards until the last element fits in it.
    Pair last_pair = heap_[remaining];
    heap_.truncate(remaining);
    const int dest_index = SiftDown(0, last_pair);
    heap_[dest_index] = last_pair;
    return true;
  }

  // Removes the MAXIMUM element of the heap. (MIN from a MAX heap.) If entry
  // is not NULL the element is copied into *entry, otherwise it is discarded.
  // Time = O(n). Returns false if the heap was already empty.
  bool PopWorst(Pair* entry) {
    int count = heap_.size();
    if (count == 0) return false;  // It cannot be empty!

    // The maximum is searched for only at indices beyond the parent of the
    // last element: by the heap invariant a parent is never greater than its
    // children, so the maximum cannot sit any higher in the tree.
    int worst = count - 1;
    const int last_parent = ParentNode(worst);
    for (int candidate = worst - 1; candidate > last_parent; --candidate) {
      if (heap_[worst] < heap_[candidate])
        worst = candidate;
    }

    // Pull the worst element out, which leaves a hole at index worst.
    if (entry != NULL)
      *entry = heap_[worst];
    --count;
    if (count > 0) {
      // Sift the hole upwards until the last element of heap_ fits in it.
      Pair last_pair = heap_[count];
      const int dest_index = SiftUp(worst, last_pair);
      heap_[dest_index] = last_pair;
    }
    heap_.truncate(count);
    return true;
  }

  // The key of the pointed-to Pair has changed, so it is moved to wherever
  // the heap invariant now requires.
  // pair must be a valid pointer to an element of heap_!
  // Caution! Since GenericHeap is based on GenericVector, reallocs may occur
  // whenever the vector is extended and elements may get shuffled by any
  // Push or Pop operation. Therefore use this function only if Data in Pair is
  // of type DoublePtr, derived (first) from DoublePtr, or has a DoublePtr as
  // its first element.
  // Time = O(log n).
  void Reshuffle(Pair* pair) {
    int position = pair - &heap_[0];
    Pair moved_pair = heap_[position];
    position = SiftDown(position, moved_pair);
    position = SiftUp(position, moved_pair);
    heap_[position] = moved_pair;
  }

 private:
  // There is a hole at hole_index that is to be filled with pair. Moves the
  // hole upward, shifting parents down into it, until pair is no longer less
  // than the hole's parent. Returns the final hole index; pair itself is not
  // stored.
  int SiftUp(int hole_index, const Pair& pair) {
    while (hole_index > 0) {
      const int parent = ParentNode(hole_index);
      if (!(pair < heap_[parent]))
        break;
      heap_[hole_index] = heap_[parent];
      hole_index = parent;
    }
    return hole_index;
  }

  // There is a hole at hole_index that is to be filled with pair. Moves the
  // hole downward, shifting the smaller child up into it, until neither child
  // is less than pair. Returns the final hole index; pair itself is not
  // stored.
  int SiftDown(int hole_index, const Pair& pair) {
    const int count = heap_.size();
    for (;;) {
      int child = LeftChild(hole_index);
      if (child >= count)
        break;
      // Pick the smaller of the two children when a right child exists.
      if (child + 1 < count && heap_[child + 1] < heap_[child])
        ++child;
      if (!(heap_[child] < pair))
        break;
      heap_[hole_index] = heap_[child];
      hole_index = child;
    }
    return hole_index;
  }

  // Functions to navigate the tree. Unlike the original implementation, we
  // store the root at index 0.
  int ParentNode(int index) const { return (index + 1) / 2 - 1; }
  int LeftChild(int index) const { return index * 2 + 1; }

  // The heap, stored as a packed binary tree.
  GenericVector<Pair> heap_;
};
} // namespace tesseract
#endif // TESSERACT_CCUTIL_GENERICHEAP_H_

View File

@ -20,6 +20,7 @@
#ifndef TESSERACT_CCUTIL_GENERICVECTOR_H_
#define TESSERACT_CCUTIL_GENERICVECTOR_H_
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
@ -34,8 +35,13 @@
template <typename T>
class GenericVector {
public:
GenericVector() { this->init(kDefaultVectorSize); }
explicit GenericVector(int size) { this->init(size); }
GenericVector() {
init(kDefaultVectorSize);
}
GenericVector(int size, T init_val) {
init(size);
init_to_size(size, init_val);
}
// Copy
GenericVector(const GenericVector& other) {
@ -45,7 +51,7 @@ class GenericVector {
GenericVector<T> &operator+=(const GenericVector& other);
GenericVector<T> &operator=(const GenericVector& other);
virtual ~GenericVector();
~GenericVector();
// Reserve some memory.
void reserve(int size);
@ -59,6 +65,9 @@ class GenericVector {
int size() const {
return size_used_;
}
int size_reserved() const {
return size_reserved_;
}
int length() const {
return size_used_;
@ -73,6 +82,8 @@ class GenericVector {
T &get(int index) const;
T &back() const;
T &operator[](int index) const;
// Returns the last object and removes it.
T pop_back();
// Return the index of the T object.
// This method NEEDS a compare_callback to be passed to
@ -105,11 +116,11 @@ class GenericVector {
// Removes an element at the given index and
// shifts the remaining elements to the left.
virtual void remove(int index);
void remove(int index);
// Truncates the array to the given size by removing the end.
// If the current size is less, the array is not expanded.
virtual void truncate(int size) {
void truncate(int size) {
if (size < size_used_)
size_used_ = size;
}
@ -126,7 +137,7 @@ class GenericVector {
// All the owned callbacks are also deleted.
// If you don't want the callbacks to be deleted, before calling clear, set
// the callback to NULL.
virtual void clear();
void clear();
// Delete objects pointed to by data_[i]
void delete_data_pointers();
@ -147,12 +158,12 @@ class GenericVector {
bool read(FILE* f, TessResultCallback3<bool, FILE*, T*, bool>* cb, bool swap);
// Writes a vector of simple types to the given file. Assumes that bitwise
// read/write of T will work. Returns false in case of error.
virtual bool Serialize(FILE* fp) const;
bool Serialize(FILE* fp) const;
// Reads a vector of simple types from the given file. Assumes that bitwise
// read/write will work with ReverseN according to sizeof(T).
// Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
virtual bool DeSerialize(bool swap, FILE* fp);
bool DeSerialize(bool swap, FILE* fp);
// Writes a vector of classes to the given file. Assumes the existence of
// bool T::Serialize(FILE* fp) const that returns false in case of error.
// Returns false in case of error.
@ -262,7 +273,32 @@ class GenericVector {
return result;
}
// Returns the index at which the element that would be the target_index_th
// member of the sorted array can be found, without fully sorting. Members
// are shuffled around in the process, but it takes O(n) time.
// An out-of-range target_index is clamped: negative values become 0 and
// values >= size_used_ become size_used_ - 1.
// NOTE: uses operator< and operator== on the members.
int choose_nth_item(int target_index) {
  // Make sure the index handed to the worker is legal.
  const int clamped_index =
      target_index < 0
          ? 0
          : (target_index >= size_used_ ? size_used_ - 1 : target_index);
  // Fixed seed for the pivot selection done by the worker.
  unsigned int seed = 1;
  return choose_nth_item(clamped_index, 0, size_used_, &seed);
}
// Swaps the elements with the given indices.
void swap(int index1, int index2) {
if (index1 != index2) {
T tmp = data_[index1];
data_[index1] = data_[index2];
data_[index2] = tmp;
}
}
protected:
// Internal recursive version of choose_nth_item.
int choose_nth_item(int target_index, int start, int end, unsigned int* seed);
// Init the object, allocating size memory.
void init(int size);
@ -328,7 +364,7 @@ class PointerVector : public GenericVector<T*> {
public:
PointerVector() : GenericVector<T*>() { }
explicit PointerVector(int size) : GenericVector<T*>(size) { }
virtual ~PointerVector() {
~PointerVector() {
// Clear must be called here, even though it is called again by the base,
// as the base will call the wrong clear.
clear();
@ -355,14 +391,14 @@ class PointerVector : public GenericVector<T*> {
// Removes an element at the given index and
// shifts the remaining elements to the left.
virtual void remove(int index) {
void remove(int index) {
delete GenericVector<T*>::data_[index];
GenericVector<T*>::remove(index);
}
// Truncates the array to the given size by removing the end.
// If the current size is less, the array is not expanded.
virtual void truncate(int size) {
void truncate(int size) {
for (int i = size; i < GenericVector<T*>::size_used_; ++i)
delete GenericVector<T*>::data_[i];
GenericVector<T*>::truncate(size);
@ -394,14 +430,14 @@ class PointerVector : public GenericVector<T*> {
// All the owned callbacks are also deleted.
// If you don't want the callbacks to be deleted, before calling clear, set
// the callback to NULL.
virtual void clear() {
void clear() {
GenericVector<T*>::delete_data_pointers();
GenericVector<T*>::clear();
}
// Writes a vector of simple types to the given file. Assumes that bitwise
// read/write of T will work. Returns false in case of error.
virtual bool Serialize(FILE* fp) const {
bool Serialize(FILE* fp) const {
inT32 used = GenericVector<T*>::size_used_;
if (fwrite(&used, sizeof(used), 1, fp) != 1) return false;
for (int i = 0; i < used; ++i) {
@ -416,7 +452,7 @@ class PointerVector : public GenericVector<T*> {
// Also needs T::T(), as new T is used in this function.
// Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
virtual bool DeSerialize(bool swap, FILE* fp) {
bool DeSerialize(bool swap, FILE* fp) {
inT32 reserved;
if (fread(&reserved, sizeof(reserved), 1, fp) != 1) return false;
if (swap) Reverse32(&reserved);
@ -515,7 +551,8 @@ T &GenericVector<T>::get(int index) const {
template <typename T>
T &GenericVector<T>::operator[](int index) const {
return data_[index];
assert(index >= 0 && index < size_used_);
return data_[index];
}
template <typename T>
@ -523,6 +560,12 @@ T &GenericVector<T>::back() const {
ASSERT_HOST(size_used_ > 0);
return data_[size_used_ - 1];
}
// Removes the last object from the vector and returns a copy of it.
// Asserts that the vector is not empty.
template <typename T>
T GenericVector<T>::pop_back() {
  ASSERT_HOST(size_used_ > 0);
  --size_used_;
  return data_[size_used_];
}
// Return the object from an index.
template <typename T>
@ -536,7 +579,7 @@ void GenericVector<T>::set(T t, int index) {
// at the specified index.
template <typename T>
void GenericVector<T>::insert(T t, int index) {
ASSERT_HOST(index >= 0 && index < size_used_);
ASSERT_HOST(index >= 0 && index <= size_used_);
if (size_reserved_ == size_used_)
double_the_size();
for (int i = size_used_; i > index; --i) {
@ -642,7 +685,8 @@ void GenericVector<T>::set_clear_callback(TessCallback1<T>* cb) {
// Add a callback to be called to delete the elements when the array took
// their ownership.
template <typename T>
void GenericVector<T>::set_compare_callback(TessResultCallback2<bool, T const &, T const &>* cb) {
void GenericVector<T>::set_compare_callback(
TessResultCallback2<bool, T const &, T const &>* cb) {
compare_cb_ = cb;
}
@ -804,4 +848,61 @@ void GenericVector<T>::sort() {
sort(&tesseract::sort_cmp<T>);
}
// Internal recursive version of choose_nth_item.
// The algorithm used comes from "Algorithms" by Sedgewick:
// http://books.google.com/books/about/Algorithms.html?id=idUdqdDXqnAC
// The principle is to choose a random pivot, and move everything less than
// the pivot to its left, and everything greater than the pivot to the end
// of the array, then recurse on the part that contains the desired index, or
// just return the answer if it is in the equal section in the middle.
// The random pivot guarantees average linear time for the same reason that
// n times vector::push_back takes linear time on average.
// target_index, start and end are all indices into the full array.
// Seed is a seed for rand_r for thread safety purposes. Its value is
// unimportant as the random numbers do not affect the result except
// between equal answers.
template <typename T>
int GenericVector<T>::choose_nth_item(int target_index, int start, int end,
                                      unsigned int* seed) {
  // Each pass narrows [start, end) to the partition that contains
  // target_index, until the answer is known.
  for (;;) {
    // Number of elements to process.
    const int count = end - start;
    // Trivial cases.
    if (count <= 1)
      return start;
    if (count == 2) {
      const bool ascending = data_[start] < data_[start + 1];
      const bool want_upper = target_index > start;
      if (ascending)
        return want_upper ? start + 1 : start;
      return want_upper ? start : start + 1;
    }
    // Pick a random pivot and park it at start.
    const int pivot = rand_r(seed) % count + start;
    swap(pivot, start);
    // Invariant: items [start, lesser_end) are less than the pivot (which is
    // at index lesser_end), items [greater_begin, end) are greater than the
    // pivot, and items [lesser_end, greater_begin) are equal to the pivot.
    int lesser_end = start;
    int greater_begin = end;
    int probe = start + 1;
    while (probe < greater_begin) {
      if (data_[probe] < data_[lesser_end]) {
        swap(lesser_end, probe);
        ++lesser_end;
        ++probe;
      } else if (data_[probe] == data_[lesser_end]) {
        ++probe;
      } else {
        --greater_begin;
        swap(greater_begin, probe);
      }
    }
    // Continue with just the section that contains the desired index.
    if (target_index < lesser_end) {
      end = lesser_end;
    } else if (target_index < greater_begin) {
      return lesser_end;  // In equal bracket.
    } else {
      start = greater_begin;
    }
  }
}
#endif // TESSERACT_CCUTIL_GENERICVECTOR_H_

View File

@ -18,84 +18,56 @@
**********************************************************************/
#include <signal.h>
#ifdef __linux__
#include <sys/syscall.h> // For SYS_gettid.
#include <unistd.h> // For syscall itself.
#endif
#include "allheaders.h"
#include "errcode.h"
#include "tprintf.h"
/*inT16 global_loc_code = LOC_INIT;//location code
inT16 global_subloc_code = SUBLOC_NORM;
//pass2 subloc code
inT16 global_subsubloc_code = SUBSUBLOC_OTHER;
//location code
inT16 global_abort_code = NO_ABORT_CODE;
//Prog abort code
*/
void signal_exit( //
int signal_code //Signal which
) {
/*int exit_status;
// Size of thread-id array of pixes to keep in case of crash.
const int kMaxNumThreadPixes = 32768;
if ((global_loc_code == LOC_PASS2) || (global_loc_code == LOC_FUZZY_SPACE))
global_loc_code += global_subloc_code + global_subsubloc_code;
Pix* global_crash_pixes[kMaxNumThreadPixes];
if (signal_code < 0) {
exit_status = global_loc_code * 8 + global_abort_code * 2 + 1;
tprintf ("Signal_exit %d ABORT. LocCode: %d AbortCode: %d\n",
exit_status, global_loc_code, global_abort_code);
void SavePixForCrash(int resolution, Pix* pix) {
#ifdef __linux__
int thread_id = syscall(SYS_gettid) % kMaxNumThreadPixes;
pixDestroy(&global_crash_pixes[thread_id]);
if (pix != NULL) {
Pix* clone = pixClone(pix);
pixSetXRes(clone, resolution);
pixSetYRes(clone, resolution);
global_crash_pixes[thread_id] = clone;
}
else {
exit_status = global_loc_code * 8 + signal_code * 2;
tprintf ("Signal_exit %d SIGNAL ABORT. LocCode: %d SignalCode: %d\n",
exit_status, global_loc_code, signal_code);
}
exit(exit_status);*/
exit(signal_code);
#endif
}
/*************************************************************************
* err_exit()
* All program exits should go through this point. It allows a meaningful status
* code to be generated for the real exit() call. The status code is made up
* as follows:
* Bit 0 : 1 = Program Abort 0 = System Abort
* Bits 1,2 : IF bit 0 = 1 THEN ERRCODE::abort_code
* ELSE 0 = Bus Err or Seg Vi
* 1 = Floating point exception
* 2 = TimeOut (Signal 15 from command timer)
* 3 = Any other signal
* Bits 3..7 : Location code NEVER 0 !
*************************************************************************/
//extern "C" {
// CALL ONLY from a signal handler! Writes a crash image to stderr.
void signal_exit(int signal_code) {
tprintf("Received signal %d!\n", signal_code);
#ifdef __linux__
int thread_id = syscall(SYS_gettid) % kMaxNumThreadPixes;
if (global_crash_pixes[thread_id] != NULL) {
fprintf(stderr, "Crash caused by image with resolution %d\n",
pixGetYRes(global_crash_pixes[thread_id]));
fprintf(stderr, "<Cut here>\n");
pixWriteStreamPng(stderr, global_crash_pixes[thread_id], 0.0);
fprintf(stderr, "\n<End cut>\n");
}
// Raise an uncaught signal, so as to get a useful stack trace.
raise(SIGILL);
#else
abort();
#endif
}
void err_exit() {
signal_exit (-1);
ASSERT_HOST("Fatal error encountered!" == NULL);
}
void signal_termination_handler(int sig) {
const ERRCODE SIGNAL_HANDLER_ERR = "Signal_termination_handler called";
SIGNAL_HANDLER_ERR.error("signal_termination_handler", ABORT, "Code %d", sig);
switch (sig) {
case SIGABRT:
signal_exit (-1); //use abort code
// case SIGBUS:
case SIGSEGV:
signal_exit (0);
case SIGFPE:
signal_exit (1); //floating point
case SIGTERM:
signal_exit (2); //timeout by cmdtimer
default:
signal_exit (3); //Anything else
}
}
//}; //end extern "C"
void set_global_loc_code(int loc_code) {
// global_loc_code = loc_code;

View File

@ -22,14 +22,14 @@
#include "host.h"
void signal_exit( //
int signal_code //Signal which
);
//extern "C" {
// Saves a clone of the given pix, and notes its resolution in thread-specific
// data, so that the image can be written prior to a crash.
struct Pix;
void SavePixForCrash(int resolution, Pix* pix);
void signal_exit(int signal_code);
void err_exit();
//The real signal
void signal_termination_handler(int sig);
//};
void set_global_loc_code(int loc_code);

189
ccutil/kdpair.h Normal file
View File

@ -0,0 +1,189 @@
// Copyright 2012 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
///////////////////////////////////////////////////////////////////////
// File: kdpair.h
// Description: Template pair class like STL pair but geared towards
// the Key+Data design pattern in which some data needs
// to be sorted or kept in a heap sorted on some separate key.
// Author: Ray Smith.
// Created: Thu Mar 15 14:48:05 PDT 2012
//
// (C) Copyright 2012, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCUTIL_KDPAIR_H_
#define TESSERACT_CCUTIL_KDPAIR_H_
#include "genericvector.h"
namespace tesseract {
// Base struct for the Key+Data pattern: holds a piece of simple or
// smart-pointer data alongside the separate key it is sorted by. Comparable
// in spirit to the STL pair.
template <typename Key, typename Data>
struct KDPair {
  KDPair() {}
  KDPair(Key k, Data d) : data(d), key(k) {}

  // Two pairs are equal when their keys are equal; the data is not compared.
  int operator==(const KDPair<Key, Data>& other) const {
    return this->key == other.key;
  }

  // WARNING! Keep data as the first element! KDPairInc and KDPairDec depend
  // on the order of these elements so they can downcast pointers appropriately
  // for use by GenericHeap::Reshuffle.
  Data data;
  Key key;
};
// KDPair flavour whose operator< orders pairs by increasing key. Also offers
// the data-pointer recast needed for use with DoublePtr.
template <typename Key, typename Data>
struct KDPairInc : public KDPair<Key, Data> {
  KDPairInc() {}
  KDPairInc(Key k, Data d) : KDPair<Key, Data>(k, d) {}

  // Smaller keys sort first.
  int operator<(const KDPairInc<Key, Data>& other) const {
    return this->key < other.key;
  }

  // Turns a pointer to the data member back into a pointer to the whole
  // KDPairInc. This is purely a cast of the first element's address to the
  // enclosing struct type.
  static KDPairInc* RecastDataPointer(Data* data_ptr) {
    KDPairInc* whole_pair = reinterpret_cast<KDPairInc*>(data_ptr);
    return whole_pair;
  }
};
// KDPair flavour whose operator< orders pairs by decreasing key. Also offers
// the data-pointer recast needed for use with DoublePtr.
template <typename Key, typename Data>
struct KDPairDec : public KDPair<Key, Data> {
  KDPairDec() {}
  KDPairDec(Key k, Data d) : KDPair<Key, Data>(k, d) {}

  // Larger keys sort first: operator< is implemented with operator> on the
  // key values.
  int operator<(const KDPairDec<Key, Data>& other) const {
    return this->key > other.key;
  }

  // Turns a pointer to the data member back into a pointer to the whole
  // KDPairDec. This is purely a cast of the first element's address to the
  // enclosing struct type.
  static KDPairDec* RecastDataPointer(Data* data_ptr) {
    KDPairDec* whole_pair = reinterpret_cast<KDPairDec*>(data_ptr);
    return whole_pair;
  }
};
// A useful base class to facilitate the common operation of sorting a vector
// of owned pointer data using a separate key. This class owns its data pointer,
// deleting it when it has finished with it, and providing copy constructor and
// operator= that have move semantics so that the data does not get copied and
// only a single instance of KDPtrPair holds a specific data pointer.
template <typename Key, typename Data>
class KDPtrPair {
 public:
  // Creates an empty pair: no data and a value-initialized key, so that the
  // key can be read or compared without touching indeterminate memory.
  KDPtrPair() : data_(NULL), key_() {}
  // Takes ownership of d.
  KDPtrPair(Key k, Data* d) : data_(d), key_(k) {}
  // Copy constructor steals the pointer from src and NULLs it in src, thereby
  // moving the (single) ownership of the data.
  KDPtrPair(KDPtrPair& src) : data_(src.data_), key_(src.key_) {
    src.data_ = NULL;
  }
  // Destructor deletes data, assuming it is the sole owner.
  ~KDPtrPair() {
    delete this->data_;
    this->data_ = NULL;
  }
  // Operator= steals the pointer from src and NULLs it in src, thereby
  // moving the (single) ownership of the data.
  void operator=(KDPtrPair& src) {
    // Self-assignment must be a no-op: without this check the owned data
    // would be deleted and then replaced by NULL.
    if (this == &src) return;
    delete this->data_;
    this->data_ = src.data_;
    src.data_ = NULL;
    this->key_ = src.key_;
  }

  // Pairs are equal when their keys are equal; the data is not compared.
  int operator==(const KDPtrPair<Key, Data>& other) const {
    return key_ == other.key_;
  }

  // Accessors.
  const Key& key() const {
    return key_;
  }
  void set_key(const Key& new_key) {
    key_ = new_key;
  }
  // Returns the owned data (possibly NULL) without giving up ownership.
  const Data* data() const {
    return data_;
  }
  // Sets the data pointer, taking ownership of the data.
  void set_data(Data* new_data) {
    // Re-setting the pointer that is already owned must not delete it, or
    // data_ would be left dangling and be deleted a second time later.
    if (new_data != data_) delete data_;
    data_ = new_data;
  }
  // Relinquishes ownership of the data pointer (setting it to NULL).
  Data* extract_data() {
    Data* result = data_;
    data_ = NULL;
    return result;
  }

 private:
  // Data members are private to keep deletion of data_ encapsulated.
  Data* data_;
  Key key_;
};
// KDPtrPair flavour whose operator< orders pairs by increasing key.
template <typename Key, typename Data>
struct KDPtrPairInc : public KDPtrPair<Key, Data> {
  // The base class has ownership-moving copy operations taking non-const
  // references, so every constructor and operator= has to be spelled out
  // again here and forwarded to the base.
  KDPtrPairInc() : KDPtrPair<Key, Data>() {}
  KDPtrPairInc(Key k, Data* d) : KDPtrPair<Key, Data>(k, d) {}
  KDPtrPairInc(KDPtrPairInc& src) : KDPtrPair<Key, Data>(src) {}
  void operator=(KDPtrPairInc& src) {
    this->KDPtrPair<Key, Data>::operator=(src);
  }

  // Smaller keys sort first.
  int operator<(const KDPtrPairInc<Key, Data>& other) const {
    return this->key() < other.key();
  }
};
// KDPtrPair flavour whose operator< orders pairs by decreasing key.
template <typename Key, typename Data>
struct KDPtrPairDec : public KDPtrPair<Key, Data> {
  // The base class has ownership-moving copy operations taking non-const
  // references, so every constructor and operator= has to be spelled out
  // again here and forwarded to the base.
  KDPtrPairDec() : KDPtrPair<Key, Data>() {}
  KDPtrPairDec(Key k, Data* d) : KDPtrPair<Key, Data>(k, d) {}
  KDPtrPairDec(KDPtrPairDec& src) : KDPtrPair<Key, Data>(src) {}
  void operator=(KDPtrPairDec& src) {
    this->KDPtrPair<Key, Data>::operator=(src);
  }

  // Larger keys sort first: operator< is implemented with operator> on the
  // key values.
  int operator<(const KDPtrPairDec<Key, Data>& other) const {
    return this->key() > other.key();
  }
};
// Specialization for a pair of ints in increasing order: both the key and the
// data are ints, and operator< compares the keys in ascending order.
typedef KDPairInc<int, int> IntKDPair;
// Vector of IntKDPair. Adds no members of its own to GenericVector<IntKDPair>.
class KDVector : public GenericVector<IntKDPair> {
// TODO(rays) Add some code to manipulate a KDVector. For now there
// is nothing and this class is effectively a specialization typedef.
};
} // namespace tesseract
#endif // TESSERACT_CCUTIL_KDPAIR_H_

125
ccutil/object_cache.h Normal file
View File

@ -0,0 +1,125 @@
///////////////////////////////////////////////////////////////////////
// File: object_cache.h
// Description: A string indexed object cache.
// Author: David Eger
// Created: Fri Jan 27 12:08:00 PST 2012
//
// (C) Copyright 2012, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCUTIL_OBJECT_CACHE_H_
#define TESSERACT_CCUTIL_OBJECT_CACHE_H_
#include "ccutil.h"
#include "errcode.h"
#include "genericvector.h"
#include "tesscallback.h"
namespace tesseract {
// A simple object cache which maps a string to an object of type T.
// Usually, these are expensive objects that are loaded from disk.
// Reference counting is performed, so every Get() needs to be followed later
// by a Free(). Actual deletion is accomplished by DeleteUnusedObjects().
template<typename T>
class ObjectCache {
public:
ObjectCache() {}
~ObjectCache() {
mu_.Lock();
for (int i = 0; i < cache_.size(); i++) {
if (cache_[i].count > 0) {
tprintf("ObjectCache(%p)::~ObjectCache(): WARNING! LEAK! object %p "
"still has count %d (id %s)\n",
this, cache_[i].object, cache_[i].count,
cache_[i].id.string());
} else {
delete cache_[i].object;
cache_[i].object = NULL;
}
}
mu_.Unlock();
}
// Return a pointer to the object identified by id.
// If we haven't yet loaded the object, use loader to load it.
// If loader fails to load it, record a NULL entry in the cache
// and return NULL -- further attempts to load will fail (even
// with a different loader) until DeleteUnusedObjects() is called.
// We delete the given loader.
T *Get(STRING id,
TessResultCallback<T *> *loader) {
T *retval = NULL;
mu_.Lock();
for (int i = 0; i < cache_.size(); i++) {
if (id == cache_[i].id) {
retval = cache_[i].object;
if (cache_[i].object != NULL) {
cache_[i].count++;
}
mu_.Unlock();
delete loader;
return retval;
}
}
cache_.push_back(ReferenceCount());
ReferenceCount &rc = cache_.back();
rc.id = id;
retval = rc.object = loader->Run();
rc.count = (retval != NULL) ? 1 : 0;
mu_.Unlock();
return retval;
}
// Decrement the count for t.
// Return whether we knew about the given pointer.
bool Free(T *t) {
if (t == NULL) return false;
mu_.Lock();
for (int i = 0; i < cache_.size(); i++) {
if (cache_[i].object == t) {
--cache_[i].count;
mu_.Unlock();
return true;
}
}
mu_.Unlock();
return false;
}
void DeleteUnusedObjects() {
mu_.Lock();
for (int i = cache_.size() - 1; i >= 0; i--) {
if (cache_[i].count <= 0) {
delete cache_[i].object;
cache_.remove(i);
}
}
mu_.Unlock();
}
private:
struct ReferenceCount {
STRING id; // A unique ID to identify the object (think path on disk)
T *object; // A copy of the object in memory. Can be delete'd.
int count; // A count of the number of active users of this object.
};
CCUtilMutex mu_;
GenericVector<ReferenceCount> cache_;
};
} // namespace tesseract
#endif // TESSERACT_CCUTIL_OBJECT_CACHE_H_

View File

@ -207,4 +207,25 @@ void ParamUtils::PrintParams(FILE *fp, const ParamsVectors *member_params) {
}
}
// Resets all parameters back to default values.
// The global params are always reset; the params in member_params are reset
// as well when member_params is not NULL.
void ParamUtils::ResetToDefaults(ParamsVectors* member_params) {
  // Pass 0 handles the global params, pass 1 (if any) the member params.
  const int num_iterations = (member_params == NULL) ? 1 : 2;
  for (int v = 0; v < num_iterations; ++v) {
    ParamsVectors *vec = (v == 0) ? GlobalParams() : member_params;
    // Each loop declares its own index, so no loop shadows a function-scope
    // variable.
    for (int i = 0; i < vec->int_params.size(); ++i) {
      vec->int_params[i]->ResetToDefault();
    }
    for (int i = 0; i < vec->bool_params.size(); ++i) {
      vec->bool_params[i]->ResetToDefault();
    }
    for (int i = 0; i < vec->string_params.size(); ++i) {
      vec->string_params[i]->ResetToDefault();
    }
    for (int i = 0; i < vec->double_params.size(); ++i) {
      vec->double_params[i]->ResetToDefault();
    }
  }
}
} // namespace tesseract

View File

@ -104,6 +104,9 @@ class ParamUtils {
// Print parameters to the given file.
static void PrintParams(FILE *fp, const ParamsVectors *member_params);
// Resets all parameters back to default values.
static void ResetToDefaults(ParamsVectors* member_params);
};
// Definition of various parameter types.
@ -142,15 +145,20 @@ class IntParam : public Param {
IntParam(inT32 value, const char *name, const char *comment, bool init,
ParamsVectors *vec) : Param(name, comment, init) {
value_ = value;
default_ = value;
params_vec_ = &(vec->int_params);
vec->int_params.push_back(this);
}
~IntParam() { ParamUtils::RemoveParam<IntParam>(this, params_vec_); }
operator inT32() const { return value_; }
void set_value(inT32 value) { value_ = value; }
void ResetToDefault() {
value_ = default_;
}
private:
inT32 value_;
inT32 default_;
// Pointer to the vector that contains this param (not owned by this class).
GenericVector<IntParam *> *params_vec_;
};
@ -160,15 +168,20 @@ class BoolParam : public Param {
BoolParam(bool value, const char *name, const char *comment, bool init,
ParamsVectors *vec) : Param(name, comment, init) {
value_ = value;
default_ = value;
params_vec_ = &(vec->bool_params);
vec->bool_params.push_back(this);
}
~BoolParam() { ParamUtils::RemoveParam<BoolParam>(this, params_vec_); }
operator BOOL8() const { return value_; }
void set_value(BOOL8 value) { value_ = value; }
void ResetToDefault() {
value_ = default_;
}
private:
BOOL8 value_;
BOOL8 default_;
// Pointer to the vector that contains this param (not owned by this class).
GenericVector<BoolParam *> *params_vec_;
};
@ -179,17 +192,23 @@ class StringParam : public Param {
const char *comment, bool init,
ParamsVectors *vec) : Param(name, comment, init) {
value_ = value;
default_ = value;
params_vec_ = &(vec->string_params);
vec->string_params.push_back(this);
}
~StringParam() { ParamUtils::RemoveParam<StringParam>(this, params_vec_); }
operator STRING &() { return value_; }
const char *string() const { return value_.string(); }
const char *c_str() const { return value_.string(); }
bool empty() { return value_.length() <= 0; }
void set_value(const STRING &value) { value_ = value; }
void ResetToDefault() {
value_ = default_;
}
private:
STRING value_;
STRING default_;
// Pointer to the vector that contains this param (not owned by this class).
GenericVector<StringParam *> *params_vec_;
};
@ -199,15 +218,20 @@ class DoubleParam : public Param {
DoubleParam(double value, const char *name, const char *comment,
bool init, ParamsVectors *vec) : Param(name, comment, init) {
value_ = value;
default_ = value;
params_vec_ = &(vec->double_params);
vec->double_params.push_back(this);
}
~DoubleParam() { ParamUtils::RemoveParam<DoubleParam>(this, params_vec_); }
operator double() const { return value_; }
void set_value(double value) { value_ = value; }
void ResetToDefault() {
value_ = default_;
}
private:
double value_;
double default_;
// Pointer to the vector that contains this param (not owned by this class).
GenericVector<DoubleParam *> *params_vec_;
};

View File

@ -20,16 +20,12 @@
#ifndef TESSERACT_CCUTIL_PLATFORM_H__
#define TESSERACT_CCUTIL_PLATFORM_H__
#include <string.h>
#define DLLSYM
#ifdef _WIN32
#ifdef __GNUC__
#define ultoa _ultoa
#ifndef __MINGW32__
typedef struct _BLOB {
unsigned int cbSize;
char *pBlobData;
} BLOB, *LPBLOB;
#endif /* __MINGW32__ */
#endif /* __GNUC__ */
#define SIGNED
#define snprintf _snprintf
@ -71,4 +67,12 @@ typedef struct _BLOB {
#endif
#endif
#if defined(_WIN32) || defined(__CYGWIN__)
#define _TESS_FILE_BASENAME_ \
(strrchr(__FILE__, '\\') ? strrchr(__FILE__, '\\') + 1 : __FILE__)
#else // Unices
#define _TESS_FILE_BASENAME_ \
(strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#endif
#endif // TESSERACT_CCUTIL_PLATFORM_H__

View File

@ -56,7 +56,9 @@ class SortHelper {
}
// Constructor takes a hint of the array size, but it need not be accurate.
explicit SortHelper(int sizehint) : counts_(sizehint) {}
explicit SortHelper(int sizehint) {
counts_.reserve(sizehint);
}
// Add a value that may be a duplicate of an existing value.
// Uses a linear search.

View File

@ -24,8 +24,11 @@
#include <assert.h>
// Size of buffer needed to host the decimal representation of the maximum
// possible length of an int (in 64 bits, being -<20 digits>.
// possible length of an int (in 64 bits), being -<20 digits>.
const int kMaxIntSize = 22;
// Size of buffer needed to host the decimal representation of the maximum
// possible length of a %.8g being -0.12345678e+999<nul> = 15.
const int kMaxDoubleSize = 15;
/**********************************************************************
* STRING_HEADER provides metadata about the allocated buffer,
@ -163,6 +166,10 @@ const char* STRING::string() const {
return GetCStr();
}
const char* STRING::c_str() const {
return string();
}
/******
* The STRING_IS_PROTECTED interface adds additional support to migrate
* code that needs to modify the STRING in ways not otherwise supported
@ -220,6 +227,8 @@ void STRING::erase_range(inT32 index, int len) {
#else
void STRING::truncate_at(inT32 index) {
ASSERT_HOST(index >= 0);
FixHeader();
char* this_cstr = ensure_cstr(index + 1);
this_cstr[index] = '\0';
GetHeader()->used_ = index + 1;
@ -339,6 +348,16 @@ void STRING::add_str_int(const char* str, int number) {
num_buffer[kMaxIntSize - 1] = '\0';
*this += num_buffer;
}
// Appends the given string and double (as a %.8g) to this.
void STRING::add_str_double(const char* str, double number) {
if (str != NULL)
*this += str;
// Allow space for the maximum possible length of %8g.
char num_buffer[kMaxDoubleSize];
snprintf(num_buffer, kMaxDoubleSize - 1, "%.8g", number);
num_buffer[kMaxDoubleSize - 1] = '\0';
*this += num_buffer;
}
STRING & STRING::operator=(const char* cstr) {
STRING_HEADER* this_header = GetHeader();

View File

@ -55,6 +55,7 @@ class TESS_API STRING
inT32 length() const;
inT32 size() const { return length(); }
const char *string() const;
const char *c_str() const;
inline char* strdup() const {
inT32 len = length() + 1;
@ -94,8 +95,10 @@ class TESS_API STRING
// be ambiguous, and ints usually need a string before or between them
// anyway.
void add_str_int(const char* str, int number);
// Appends the given string and double (as a %.8g) to this.
void add_str_double(const char* str, double number);
// ensure capcaity but keep pointer encapsulated
// ensure capacity but keep pointer encapsulated
inline void ensure(inT32 min_capacity) { ensure_cstr(min_capacity); }
private:

File diff suppressed because it is too large Load Diff

View File

@ -35,6 +35,7 @@ namespace tesseract {
bool TessdataManager::Init(const char *data_file_name, int debug_level) {
int i;
debug_level_ = debug_level;
data_file_name_ = data_file_name;
data_file_ = fopen(data_file_name, "rb");
if (data_file_ == NULL) {
tprintf("Error opening data file %s\n", data_file_name);
@ -244,7 +245,7 @@ bool TessdataManager::ExtractToFile(const char *filename) {
FILE *output_file = fopen(filename, "wb");
if (output_file == NULL) {
tprintf("Error openning %s\n", filename);
tprintf("Error opening %s\n", filename);
exit(1);
}
inT64 begin_offset = ftell(GetDataFilePtr());

View File

@ -21,7 +21,9 @@
#define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
#include <stdio.h>
#include "host.h"
#include "strngs.h"
#include "tprintf.h"
static const char kTrainedDataSuffix[] = "traineddata";
@ -44,7 +46,7 @@ static const char kCubeSystemDawgFileSuffix[] = "cube-word-dawg";
static const char kShapeTableFileSuffix[] = "shapetable";
static const char kBigramDawgFileSuffix[] = "bigram-dawg";
static const char kUnambigDawgFileSuffix[] = "unambig-dawg";
static const char kParamsTrainingModelFileSuffix[] = "params-training-model";
static const char kParamsModelFileSuffix[] = "params-model";
namespace tesseract {
@ -59,13 +61,13 @@ enum TessdataType {
TESSDATA_SYSTEM_DAWG, // 7
TESSDATA_NUMBER_DAWG, // 8
TESSDATA_FREQ_DAWG, // 9
TESSDATA_FIXED_LENGTH_DAWGS, // 10
TESSDATA_FIXED_LENGTH_DAWGS, // 10 // deprecated
TESSDATA_CUBE_UNICHARSET, // 11
TESSDATA_CUBE_SYSTEM_DAWG, // 12
TESSDATA_SHAPE_TABLE, // 13
TESSDATA_BIGRAM_DAWG, // 14
TESSDATA_UNAMBIG_DAWG, // 15
TESSDATA_PARAMS_TRAINING_MODEL, // 16
TESSDATA_PARAMS_MODEL, // 16
TESSDATA_NUM_ENTRIES
};
@ -85,13 +87,13 @@ static const char * const kTessdataFileSuffixes[] = {
kSystemDawgFileSuffix, // 7
kNumberDawgFileSuffix, // 8
kFreqDawgFileSuffix, // 9
kFixedLengthDawgsFileSuffix, // 10
kFixedLengthDawgsFileSuffix, // 10 // deprecated
kCubeUnicharsetFileSuffix, // 11
kCubeSystemDawgFileSuffix, // 12
kShapeTableFileSuffix, // 13
kBigramDawgFileSuffix, // 14
kUnambigDawgFileSuffix, // 15
kParamsTrainingModelFileSuffix, // 16
kParamsModelFileSuffix, // 16
};
/**
@ -109,13 +111,13 @@ static const bool kTessdataFileIsText[] = {
false, // 7
false, // 8
false, // 9
false, // 10
false, // 10 // deprecated
true, // 11
false, // 12
false, // 13
false, // 14
false, // 15
false, // 16
true, // 16
};
/**
@ -146,6 +148,9 @@ class TessdataManager {
*/
bool Init(const char *data_file_name, int debug_level);
// Return the name of the underlying data file.
const STRING &GetDataFileName() const { return data_file_name_; }
/** Returns data file pointer. */
inline FILE *GetDataFilePtr() const { return data_file_; }
@ -279,6 +284,7 @@ class TessdataManager {
* when new tessdata types are introduced.
*/
inT32 actual_tessdata_num_entries_;
STRING data_file_name_; // name of the data file.
FILE *data_file_; ///< pointer to the data file.
int debug_level_;
// True if the bytes need swapping.

View File

@ -24,43 +24,46 @@
#include <stdio.h>
#include <stdarg.h>
#include "strngs.h"
#include "params.h"
#include "tprintf.h"
#include "ccutil.h"
#include "params.h"
#include "strngs.h"
#include "tprintf.h"
#define MAX_MSG_LEN 65536
#define EXTERN
// Since tprintf is protected by a mutex, these parameters can rmain global.
// Since tprintf is protected by a mutex, these parameters can remain global.
DLLSYM STRING_VAR(debug_file, "", "File to send tprintf output to");
DLLSYM INT_VAR(FLAGS_v, 0, "Minimum logging level for tlog() output");
DLLSYM void
tprintf( // Trace printf
const char *format, ... // special message
tprintf_internal( // Trace printf
const int level, // Logging level
const char *format, ... // Message
) {
if (FLAGS_v < level) return;
tesseract::tprintfMutex.Lock();
va_list args; //variable args
static FILE *debugfp = NULL; //debug file
//debug window
inT32 offset = 0; //into message
va_list args; // variable args
static FILE *debugfp = NULL; // debug file
// debug window
inT32 offset = 0; // into message
static char msg[MAX_MSG_LEN + 1];
va_start(args, format); //variable list
va_start(args, format); // variable list
// Format into msg
#ifdef _WIN32
//Format into msg
offset += _vsnprintf (msg + offset, MAX_MSG_LEN - offset, format, args);
offset += _vsnprintf(msg + offset, MAX_MSG_LEN - offset, format, args);
if (strcmp(debug_file.string(), "/dev/null") == 0)
debug_file.set_value("nul");
#else
//Format into msg
offset += vsprintf (msg + offset, format, args);
offset += vsnprintf(msg + offset, MAX_MSG_LEN - offset, format, args);
#endif
va_end(args);
if (debugfp == NULL && strlen (debug_file.string ()) > 0) {
debugfp = fopen (debug_file.string (), "wb");
} else if (debugfp != NULL && strlen (debug_file.string ()) == 0) {
if (debugfp == NULL && strlen(debug_file.string()) > 0) {
debugfp = fopen(debug_file.string(), "wb");
} else if (debugfp != NULL && strlen(debug_file.string()) == 0) {
fclose(debugfp);
debugfp = NULL;
}
@ -70,46 +73,3 @@ const char *format, ... // special message
fprintf(stderr, "%s", msg);
tesseract::tprintfMutex.Unlock();
}
/*************************************************************************
* pause_continue()
* UI for a debugging pause - to see an intermediate state
* Returns TRUE to continue as normal to the next pause in the current mode;
* FALSE to quit the current pausing mode.
*************************************************************************/
DLLSYM BOOL8
//special message
pause_continue (const char *format, ...
) {
va_list args; //variable args
char msg[1000];
STRING str = STRING ("DEBUG PAUSE:\n");
va_start(args, format); //variable list
vsprintf(msg, format, args); //Format into msg
va_end(args);
#ifdef GRAPHICS_DISABLED
// No interaction allowed -> simply go on
return true;
#else
#ifdef __UNIX__
printf ("%s\n", msg);
printf ("Type \"c\" to cancel, anything else to continue: ");
char c = getchar ();
return (c != 'c');
#endif
#ifdef _WIN32
str +=
STRING (msg) + STRING ("\nUse OK to continue, CANCEL to stop pausing");
// return AfxMessageBox( str.string(), MB_OKCANCEL ) == IDOK;
return::MessageBox (NULL, msg, "IMGAPP",
MB_APPLMODAL | MB_OKCANCEL) == IDOK;
#endif
#endif
}

View File

@ -17,19 +17,29 @@
*
**********************************************************************/
#ifndef TPRINTF_H
#define TPRINTF_H
#ifndef TESSERACT_CCUTIL_TPRINTF_H
#define TESSERACT_CCUTIL_TPRINTF_H
#include "params.h"
#include "params.h"
extern DLLSYM STRING_VAR_H (debug_file, "", "File to send tprintf output to");
extern DLLSYM BOOL_VAR_H (debug_window_on, TRUE,
"Send tprintf to window unless file set");
extern DLLSYM STRING_VAR_H(debug_file, "",
"File to send tprintf output to");
extern DLLSYM BOOL_VAR_H(debug_window_on, TRUE,
"Send tprintf to window unless file set");
extern TESS_API void tprintf( // Trace printf
const char *format, ... // special message
);
// special message
DLLSYM BOOL8 pause_continue (const char *format, ...
);
#endif
// Main logging function.
#define tprintf(args...) tprintf_internal(0, args)
// Variant guarded by the numeric logging level parameter FLAGS_v (default 0).
// Code using ParseCommandLineFlags() can control its value using the --v
// commandline argument. Otherwise it must be specified in a config file like
// other params.
#define tlog(level, args...) tprintf_internal(level, args)
#define TLOG_IS_ON(level) (FLAGS_v >= level)
extern TESS_API void tprintf_internal( // Trace printf
const int level, // Logging level
const char *format, ...); // Message
#endif // define TESSERACT_CCUTIL_TPRINTF_H

21370
ccutil/universalambigs.cpp Normal file

File diff suppressed because it is too large Load Diff

26
ccutil/universalambigs.h Normal file
View File

@ -0,0 +1,26 @@
///////////////////////////////////////////////////////////////////////
// File: universalambigs.h
// Description: Data for a universal ambigs file that is useful for
// any language.
// Author: Ray Smith
// Created: Mon Mar 18 11:26:00 PDT 2013
//
// (C) Copyright 2013, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCUTIL_UNIVERSALAMBIGS_H_
#define TESSERACT_CCUTIL_UNIVERSALAMBIGS_H_

namespace tesseract {

// Contents of the universal ambigs file, embedded as a char array.
extern const char kUniversalAmbigsFile[];
// Size of kUniversalAmbigsFile.
extern const int ksizeofUniversalAmbigsFile;

}  // namespace tesseract

#endif  // TESSERACT_CCUTIL_UNIVERSALAMBIGS_H_

View File

@ -7,7 +7,7 @@
# ----------------------------------------
AC_PREREQ(2.50)
AC_INIT([tesseract], [3.02.03], [http://code.google.com/p/tesseract-ocr/issues/list])
AC_INIT([tesseract], [3.03], [http://code.google.com/p/tesseract-ocr/issues/list])
AC_CONFIG_MACRO_DIR([m4])
AC_REVISION($Id: configure.ac,v 1.4 2007/02/02 22:38:17 theraysmith Exp $)
AC_CONFIG_AUX_DIR(config)
@ -18,7 +18,7 @@ AC_PREFIX_DEFAULT(/usr/local)
# documentation.
# TODO(luc) Generate good documentation using doxygen or equivalent
PACKAGE_YEAR=2013
PACKAGE_DATE="07/03"
PACKAGE_DATE="08/13"
AC_DEFINE_UNQUOTED(PACKAGE_NAME,["${PACKAGE_NAME}"],[Name of package])
AC_DEFINE_UNQUOTED(PACKAGE_VERSION,["${PACKAGE_VERSION}"],[Version number])
@ -34,8 +34,8 @@ GENERIC_LIBRARY_NAME=tesseract
# Release versioning
GENERIC_MAJOR_VERSION=3
GENERIC_MINOR_VERSION=2
GENERIC_MICRO_VERSION=3
GENERIC_MINOR_VERSION=3
GENERIC_MICRO_VERSION=0
# API version (often = GENERIC_MAJOR_VERSION.GENERIC_MINOR_VERSION)
GENERIC_API_VERSION=$GENERIC_MAJOR_VERSION.$GENERIC_MINOR_VERSION
@ -248,6 +248,7 @@ AC_HEADER_TIME
AC_HEADER_SYS_WAIT
AC_CHECK_HEADERS(sys/ipc.h sys/shm.h)
AC_CHECK_HEADERS(limits.h malloc.h)
AC_CHECK_HEADERS(allheaders.h)
# Enable use of system-defined bool type if available:
AC_HEADER_STDBOOL
@ -261,6 +262,7 @@ AC_SYS_LARGEFILE
# ----------------------------------------
AC_CHECK_TYPES(wchar_t)
AC_CHECK_TYPES(long long int)
AC_CHECK_TYPES(mbstate_t,,,[#include "wchar.h"])
# ----------------------------------------

View File

@ -65,13 +65,13 @@ CharSet *CharSet::Create(TessdataManager *tessdata_manager,
!tessdata_manager->SeekToStart(TESSDATA_UNICHARSET)) {
fprintf(stderr, "Cube ERROR (CharSet::Create): could not find "
"either cube or tesseract unicharset\n");
return false;
return NULL;
}
FILE *charset_fp = tessdata_manager->GetDataFilePtr();
if (!charset_fp) {
fprintf(stderr, "Cube ERROR (CharSet::Create): could not load "
"a unicharset\n");
return false;
return NULL;
}
// If we found a cube unicharset separate from tesseract's, load it and
@ -90,7 +90,7 @@ CharSet *CharSet::Create(TessdataManager *tessdata_manager,
}
if (!loaded) {
delete char_set;
return false;
return NULL;
}
char_set->init_ = true;

View File

@ -234,8 +234,8 @@ bool ConvNetCharClassifier::LoadFoldingSets(const string &data_file_path,
fclose(fp);
string fold_sets_str;
if (!CubeUtils::ReadFileToString(fold_file_name.c_str(),
&fold_sets_str)) {
if (!CubeUtils::ReadFileToString(fold_file_name,
&fold_sets_str)) {
return false;
}
@ -327,7 +327,7 @@ bool ConvNetCharClassifier::LoadNets(const string &data_file_path,
fclose(fp);
// load main net
char_net_ = tesseract::NeuralNet::FromFile(char_net_file.c_str());
char_net_ = tesseract::NeuralNet::FromFile(char_net_file);
if (char_net_ == NULL) {
fprintf(stderr, "Cube ERROR (ConvNetCharClassifier::LoadNets): "
"could not load %s\n", char_net_file.c_str());

View File

@ -124,7 +124,7 @@ Pixa *CubeLineSegmenter::CrackLine(Pix *cracked_line_pix,
if (line_con_comps == NULL) {
delete []lines_pixa;
return false;
return NULL;
}
// assign each conn comp to the a line based on its centroid
@ -142,7 +142,7 @@ Pixa *CubeLineSegmenter::CrackLine(Pix *cracked_line_pix,
delete []lines_pixa;
boxaDestroy(&line_con_comps);
pixaDestroy(&line_con_comps_pix);
return false;
return NULL;
}
}
@ -413,14 +413,14 @@ Pix *CubeLineSegmenter::Pixa2Pix(Pixa *pixa, Box **dest_box,
(*dest_box) = boxCreate(min_x, min_y, max_x - min_x, max_y - min_y);
if ((*dest_box) == NULL) {
return false;
return NULL;
}
// create the union pix
Pix *union_pix = pixCreate((*dest_box)->w, (*dest_box)->h, img_->d);
if (union_pix == NULL) {
boxDestroy(dest_box);
return false;
return NULL;
}
// create a pix corresponding to the union of all pixs

View File

@ -165,7 +165,7 @@ WordAltList *CubeObject::Recognize(LangModel *lang_mod, bool word_mode) {
if (deslanted_beam_obj_ == NULL) {
fprintf(stderr, "Cube ERROR (CubeObject::Recognize): could not "
"construct deslanted BeamSearch\n");
return false;
return NULL;
}
}

View File

@ -230,8 +230,8 @@ bool HybridNeuralNetCharClassifier::LoadFoldingSets(
fclose(fp);
string fold_sets_str;
if (!CubeUtils::ReadFileToString(fold_file_name.c_str(),
&fold_sets_str)) {
if (!CubeUtils::ReadFileToString(fold_file_name,
&fold_sets_str)) {
return false;
}
@ -323,7 +323,7 @@ bool HybridNeuralNetCharClassifier::LoadNets(const string &data_file_path,
fclose(fp);
string str;
if (!CubeUtils::ReadFileToString(hybrid_net_file.c_str(), &str)) {
if (!CubeUtils::ReadFileToString(hybrid_net_file, &str)) {
return false;
}
@ -348,7 +348,7 @@ bool HybridNeuralNetCharClassifier::LoadNets(const string &data_file_path,
}
// load the net
string net_file_name = data_file_path + tokens_vec[0];
nets_[net_idx] = tesseract::NeuralNet::FromFile(net_file_name.c_str());
nets_[net_idx] = tesseract::NeuralNet::FromFile(net_file_name);
if (nets_[net_idx] == NULL) {
return false;
}

View File

@ -107,7 +107,7 @@ int TessLangModEdge::CreateChildren(CubeRecoContext *cntxt,
LangModEdge **edge_array) {
int edge_cnt = 0;
NodeChildVector vec;
dawg->unichar_ids_of(parent_node, &vec); // find all children of the parent
dawg->unichar_ids_of(parent_node, &vec, false); // find all children
for (int i = 0; i < vec.size(); ++i) {
const NodeChild &child = vec[i];
if (child.unichar_id == INVALID_UNICHAR_ID) continue;

View File

@ -74,7 +74,7 @@ LangModEdge **WordListLangModel::GetEdges(CharAltList *alt_list,
// initialize if necessary
if (init_ == false) {
if (Init() == false) {
return false;
return NULL;
}
}
@ -92,7 +92,7 @@ LangModEdge **WordListLangModel::GetEdges(CharAltList *alt_list,
// advance node
edge_ref = dawg_->next_node(edge_ref);
if (edge_ref == 0) {
return 0;
return NULL;
}
}

View File

@ -8,7 +8,7 @@ endif
noinst_HEADERS = \
bitvec.h callcpp.h const.h cutil.h cutil_class.h danerror.h efio.h \
emalloc.h freelist.h globals.h listio.h \
oldheap.h oldlist.h structures.h tessarray.h
oldlist.h structures.h
if !USING_MULTIPLELIBS
noinst_LTLIBRARIES = libtesseract_cutil.la
@ -22,7 +22,7 @@ endif
libtesseract_cutil_la_SOURCES = \
bitvec.cpp callcpp.cpp cutil.cpp cutil_class.cpp danerror.cpp efio.cpp \
emalloc.cpp freelist.cpp listio.cpp oldheap.cpp \
oldlist.cpp structures.cpp tessarray.cpp
emalloc.cpp freelist.cpp listio.cpp \
oldlist.cpp structures.cpp

View File

@ -73,27 +73,6 @@ void FreeBitVector(BIT_VECTOR BitVector) {
} /* FreeBitVector */
/**
* hamming_distance(array1,array2,length) computes the hamming distance
* between two bit strings.
*/
/*--------------------------------------------------------------------------*/
int hamming_distance(uinT32* array1, uinT32* array2, int length) {
  // The obsolete 'register' storage class has been dropped: it is no longer
  // valid in C++17 and modern compilers ignore it anyway.
  int dist = 0;  // total number of differing bits seen so far
  // A non-positive length compares nothing and yields 0.
  for (int word = 0; word < length; ++word) {
    uinT32 diff = array1[word] ^ array2[word];  // set bits mark differences
    while (diff) {
      diff &= diff - 1;  // clear the lowest set bit
      ++dist;
    }
  }
  return dist;
}
/*---------------------------------------------------------------------------*/
/**
* Allocate and return a new bit vector large enough to

View File

@ -70,8 +70,6 @@ BIT_VECTOR ExpandBitVector(BIT_VECTOR Vector, int NewNumBits);
void FreeBitVector(BIT_VECTOR BitVector);
int hamming_distance(uinT32* array1, uinT32* array2, int length);
BIT_VECTOR NewBitVector(int NumBits);
#endif

View File

@ -53,5 +53,5 @@ void DoError(int Error, const char *Message) {
tprintf("\nError: %s!\n", Message);
}
signal_termination_handler(Error);
err_exit();
} /* DoError */

View File

@ -46,7 +46,6 @@ LIST read_list(const char *filename) {
FILE *infile;
char s[CHARS_PER_LINE];
LIST list;
char *chopAt250();
if ((infile = open_file (filename, "r")) == NULL)
return (NIL_LIST);

View File

@ -1,334 +0,0 @@
/******************************************************************************
** Filename: heap.c
** Purpose: Routines for managing heaps (smallest at root)
** Author: Dan Johnson
** History: 3/13/89, DSJ, Created.
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
******************************************************************************/
/*-----------------------------------------------------------------------------
Include Files and Type Defines
-----------------------------------------------------------------------------*/
#include "oldheap.h"
#include "freelist.h"
#include "danerror.h"
#include "emalloc.h"
#include <stdio.h>
/* Index arithmetic for a 1-based, array-backed binary heap.
   '+' binds tighter than '<<', so the shift in RIGHTSON must be
   parenthesised; otherwise (N)<<1 + 1 evaluates as (N) << 2. */
#define FATHER(N) ((N)>>1)
#define LEFTSON(N) ((N)<<1)
#define RIGHTSON(N) (((N)<<1) + 1)
/*-----------------------------------------------------------------------------
Public Code
-----------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/**
* This routine creates and initializes a new heap data
* structure containing Size elements. In actuality, Size + 1
* elements are allocated. The first element, element 0, is
* unused, this makes the index arithmetic easier.
*
* Globals:
* - None
*
* @param Size maximum number of entries in the heap
* @return Pointer to the new heap.
* @note Exceptions: None
* @note History: 3/13/89, DSJ, Created.
*/
HEAP *MakeHeap(int Size) {
  /* HEAP already embeds one HEAPENTRY, so adding Size more gives Size + 1
     slots; slot 0 is left unused. */
  HEAP *heap = (HEAP *) Emalloc (sizeof (HEAP) + Size * sizeof (HEAPENTRY));
  heap->FirstFree = 1;           /* slot 1 is the first usable position */
  heap->Size = Size;
  return heap;
}                                /* MakeHeap */
/*---------------------------------------------------------------------------*/
/**
* This routine removes the top item on the heap and places
* its contents into Key and Data.
*
* Globals:
* - None
*
* @param Heap ptr to heap whose top is to be removed and returned
* @param Key place to put key of top heap item
* @param out_ptr place to put data of top heap item
*
* @return OK if top entry returned, EMPTY if heap is empty
* @note Exceptions: None
* @note History: 5/10/91, DSJ, Created (Modified from GetTopOfHeap).
*/
int HeapPop(HEAP *Heap, FLOAT32 *Key, void *out_ptr) {
  /* FirstFree <= 1 means no slot is in use. */
  if (Heap->FirstFree <= 1)
    return (EMPTY);

  HEAPENTRY *entries = Heap->Entry;
  void **data_out = (void **) out_ptr;

  /* Hand the root (smallest key) back to the caller. */
  *Key = entries[1].Key;
  *data_out = entries[1].Data;

  /* Detach the last entry and re-seat it, starting from the hole at the
     root and moving down while a son has a smaller key. */
  const inT32 last = --Heap->FirstFree;
  const FLOAT32 moved_key = entries[last].Key;
  inT32 hole = 1;
  for (;;) {
    inT32 child = LEFTSON (hole);
    if (child >= last)
      break;                     /* hole has no left son inside the heap */
    /* pick the son with the smaller key; child + 1 is at most 'last',
       whose slot still holds the detached entry */
    if (entries[child].Key > entries[child + 1].Key)
      child++;
    if (!(moved_key > entries[child].Key))
      break;                     /* detached entry belongs in the hole */
    entries[hole].Key = entries[child].Key;
    entries[hole].Data = entries[child].Data;
    hole = child;
  }
  entries[hole].Key = moved_key;
  entries[hole].Data = entries[last].Data;
  return (TESS_HEAP_OK);
}                                /* HeapPop */
/**
* HeapPopWorst
*
* Remove the item with the largest key from the heap. The candidates
* examined are the root and the entries in the back half of the array
* (indices above FATHER(last index)).
*
* @param Heap ptr to heap from which the largest-key item is removed
* @param Key place to put the key of the removed item
* @param out_ptr place to put the data pointer of the removed item
*
* @return TESS_HEAP_OK if an entry was returned, EMPTY if the heap is empty
*/
int HeapPopWorst(HEAP *Heap, FLOAT32 *Key, void *out_ptr) {
inT32 Index; /*current index */
inT32 Hole;
FLOAT32 HoleKey;
inT32 Father;
void *HoleData;
void **Data = (void **) out_ptr;
if (Heap->FirstFree <= 1)
return (EMPTY);
/* start with the root as the candidate for the largest key */
HoleKey = Heap->Entry[1].Key;
Hole = 1;
/* after the decrement FirstFree is the index of the last used entry */
Heap->FirstFree--;
for (Index = Heap->FirstFree, Father = FATHER (Index); Index > Father;
Index--)
if (Heap->Entry[Index].Key > HoleKey) {
/*find biggest */
HoleKey = Heap->Entry[Index].Key;
Hole = Index;
}
/* report the largest entry found to the caller */
*Key = HoleKey;
*Data = Heap->Entry[Hole].Data;
/* overwrite the removed entry with the last entry of the heap */
HoleKey = Heap->Entry[Heap->FirstFree].Key;
Heap->Entry[Hole].Key = HoleKey;
HoleData = Heap->Entry[Heap->FirstFree].Data;
Heap->Entry[Hole].Data = HoleData;
/* now sift last entry to its rightful place */
Father = FATHER (Hole); /*father of hole */
/* move the relocated entry up while its father has a larger key */
while (Hole > 1 && Heap->Entry[Father].Key > HoleKey) {
/*swap entries */
Heap->Entry[Hole].Key = Heap->Entry[Father].Key;
Heap->Entry[Hole].Data = Heap->Entry[Father].Data;
Heap->Entry[Father].Data = HoleData;
Heap->Entry[Father].Key = HoleKey;
Hole = Father;
Father = FATHER (Hole);
}
return (TESS_HEAP_OK);
} /* HeapPopWorst */
// Pushes data onto the heap only if there is free space left.
// Returns true if data was added to the heap, false if the heap was full.
bool HeapPushCheckSize(HEAP *Heap, FLOAT32 Key, void *Data) {
  /* There is room as long as the next free slot is within Size. */
  const bool has_room = !(Heap->FirstFree > Heap->Size);
  if (has_room)
    HeapPush(Heap, Key, Data);
  return has_room;
}
/*---------------------------------------------------------------------------*/
/**
* This routine stores Data into Heap and associates it
* with Key. The heap is
* maintained in such a way that the item with the lowest key
* is always at the top of the heap.
*
* Globals:
* - None
*
* @param Heap ptr to heap to store new item in
* @param Key numeric key associated with new item
* @param Data ptr to data contents of new item
*
* @note Exceptions:
* - HEAPFULL error if heap size is exceeded
*
* @note History: 5/10/91, DSJ, Created (Modified version of HeapStore).
*/
void HeapPush(HEAP *Heap, FLOAT32 Key, void *Data) {
  /* Report overflow before touching the heap. */
  if (Heap->FirstFree > Heap->Size)
    DoError (HEAPFULL, "Heap size exceeded");

  HEAPENTRY *entries = Heap->Entry;
  /* Claim the next free slot, then pull larger-keyed ancestors down into
     it until the new key fits. */
  inT32 slot = Heap->FirstFree++;
  while (slot != 1) {
    const inT32 parent = FATHER (slot);
    if (!(entries[parent].Key > Key))
      break;                     /* parent is not larger: slot is final */
    entries[slot].Key = entries[parent].Key;
    entries[slot].Data = entries[parent].Data;
    slot = parent;
  }
  entries[slot].Key = Key;
  entries[slot].Data = Data;
}                                /* HeapPush */
/*---------------------------------------------------------------------------*/
/**
* This routine stores Entry into Heap. The heap is
* maintained in such a way that the item with the lowest key
* is always at the top of the heap.
*
* Globals:
* - None
*
* @param Heap ptr to heap to store new item in
* @param Entry ptr to item to be stored in Heap
* @note Exceptions:
* - HEAPFULL error if heap size is exceeded
* @note History: 3/13/89, DSJ, Created.
*/
void HeapStore(HEAP *Heap, HEAPENTRY *Entry) {
  /* Same overflow check and sift-up as HeapPush, with the key and data
     taken from *Entry, so simply forward to it. */
  HeapPush(Heap, Entry->Key, Entry->Data);
}                                /* HeapStore */
/*---------------------------------------------------------------------------*/
/**
* This routine removes the top item on the heap and copies its
* contents into Entry.
*
* @param Heap ptr to heap whose top is to be removed and returned
* @param Entry ptr to heap entry to be filled with top entry on Heap
*
* Globals:
* - None
*
* @return OK if top entry returned, EMPTY if heap is empty
* @note Exceptions: None
* @note History: 3/13/89, DSJ, Created.
*/
int GetTopOfHeap(HEAP *Heap, HEAPENTRY *Entry) {
  /* Same removal and sift-down as HeapPop, with the results written into
     the two fields of *Entry. *Entry is left untouched when the heap is
     empty. */
  return HeapPop(Heap, &Entry->Key, &Entry->Data);
}                                /* GetTopOfHeap */
/*---------------------------------------------------------------------------*/
/**
* This routine is similar to FreeHeap in that it
* deallocates the memory consumed by the heap. However, it
* also calls Deallocator for each item in the heap so that
* this data is also deallocated.
*
* @param Heap heap whose data is to be freed
* @param destructor function to be used to deallocate data
*
* Globals:
* - None
*
* @note Exceptions: none
* @note History: Tue May 15 08:52:04 1990, DSJ, Created.
*/
void FreeHeapData(HEAP *Heap, void_dest destructor) {
  HEAPENTRY top;
  /* Drain the heap, handing each entry's data pointer to the destructor. */
  for (;;) {
    if (GetTopOfHeap (Heap, &top) == EMPTY)
      break;
    destructor (top.Data);
  }
  /* Finally release the heap storage itself. */
  FreeHeap(Heap);
}                                /* FreeHeapData */

View File

@ -1,80 +0,0 @@
/******************************************************************************
** Filename: heap.h
** Purpose: Definition of heap access routines.
** Author: Dan Johnson
** History: 3/13/89, DSJ, Created.
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
******************************************************************************/
#ifndef HEAP_H
#define HEAP_H
/*-----------------------------------------------------------------------------
Include Files and Type Defines
-----------------------------------------------------------------------------*/
#include "host.h"
#include "cutil.h"
/* Error code passed to DoError when a push is attempted on a full heap. */
#define HEAPFULL 3000
/* Status returned by the pop routines when the heap has no entries. */
#define EMPTY -1
/* Status returned by the pop routines when an entry was removed. */
#define TESS_HEAP_OK 0
/* One heap element: a sort key plus a caller-supplied data pointer. */
struct HEAPENTRY {
FLOAT32 Key; /* ordering key; the entry with the lowest key is kept at the top */
void *Data; /* caller-supplied pointer stored alongside the key */
};
/* Array-backed heap with the smallest key at index 1. MakeHeap allocates
   Size extra HEAPENTRY slots directly after the struct, so Entry really has
   Size + 1 elements; element 0 is unused. */
struct HEAP {
inT32 Size; /* maximum number of entries the heap can hold */
inT32 FirstFree; /* index of the next unused slot; 1 when the heap is empty */
HEAPENTRY Entry[1]; /* first element of the over-allocated entry array */
};
/*-----------------------------------------------------------------------------
Macros
-----------------------------------------------------------------------------*/
/* Every use of the macro argument is parenthesised so that expression
   arguments (e.g. base + 1) expand correctly. */
#define FreeHeap(H) memfree(H)
#define MaxSizeOfHeap(H) ((H)->Size)
#define SizeOfHeap(H) ((H)->FirstFree - 1)
#define InitHeap(H) ((H)->FirstFree = 1)
#define HeapFull(H) ((H)->FirstFree > (H)->Size)
#define HeapEmpty(H) ((H)->FirstFree <= 1)
/* macros for accessing elements in heap by index. The indices vary from
0 to SizeOfHeap-1. No bounds checking is done. Elements accessed in
this manner are in random order relative to the Key values. These
macros should never be used as the LHS of an assignment statement as this
will corrupt the heap.*/
#define HeapKeyFor(H,E) ((H)->Entry[(E)+1].Key)
#define HeapDataFor(H,E) ((H)->Entry[(E)+1].Data)
/*-----------------------------------------------------------------------------
Public Function Prototypes
-----------------------------------------------------------------------------*/
/* Allocates a heap able to hold Size entries. */
HEAP *MakeHeap(int Size);
/* Removes the lowest-key entry; returns EMPTY if the heap has no entries. */
int HeapPop(HEAP *Heap, FLOAT32 *Key, void *out_ptr);
/* Removes the highest-key entry; returns EMPTY if the heap has no entries. */
int HeapPopWorst(HEAP *Heap, FLOAT32 *Key, void *out_ptr);
/* Adds Data under Key; calls DoError with HEAPFULL if the heap is full. */
void HeapPush(HEAP *Heap, FLOAT32 Key, void *Data);
/* Adds *Entry; calls DoError with HEAPFULL if the heap is full. */
void HeapStore(HEAP *Heap, HEAPENTRY *Entry);
/* Removes the lowest-key entry into *Entry; returns EMPTY if none. */
int GetTopOfHeap(HEAP *Heap, HEAPENTRY *Entry);
/* Calls destructor on every entry's Data, then frees the heap. */
void FreeHeapData(HEAP *Heap, void_dest destructor);
/* Like HeapPush, but returns false instead of pushing when the heap is full. */
bool HeapPushCheckSize(HEAP *Heap, FLOAT32 Key, void *Data);
#endif

View File

@ -1,115 +0,0 @@
/* -*-C-*-
################################################################################
#
# File: array.c
# Description: Dynamic Array of Strings
# Author: Mark Seaman, Software Productivity
# Created: Thu Jul 23 13:24:09 1987
# Modified: Wed Mar 6 15:18:33 1991 (Mark Seaman) marks@hpgrlt
# Language: C
# Package: N/A
# Status: Reusable Software Component
#
# (c) Copyright 1987, Hewlett-Packard Company.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
#
################################################################################
This file contains the implementations of a set of routines for manipulating
dynamic arrays of strings. For the interface definitions and documentation
of these routines see the file "tessarray.h".
***************************************************************************/
#include "tessarray.h"
#include "callcpp.h"
#include "freelist.h"
#include <stdio.h>
#include <string.h>
#ifdef _WIN32
#include <process.h>
#endif
#include <ctype.h>
#if MAC_OR_DOS
#include <stdlib.h>
#endif
/**********************************************************************
* array_insert
*
* Insert a data element into a particular spot in the array. Move all
* the elements in the array (past that spot) down one to make room for
* the new element.
**********************************************************************/
ARRAY array_insert(ARRAY array, int index, void *value) {
  /* Grow by one slot; array_push may move the array, so keep its result. */
  array = array_push (array, NULL);

  /* Shift everything after 'index' one slot towards the end, working
     backwards from the new last slot. */
  int slot = array_count (array) - 1;
  while (slot > index) {
    array_value (array, slot) = array_value (array, slot - 1);
    slot--;
  }

  array_value (array, index) = value;
  return (array);
}
/**********************************************************************
* array_new
*
* Create a new array with a certain number of elements. If the number
* of elements requested is 0 then the default number will be used.
**********************************************************************/
ARRAY array_new(int num) {
  /* A request for 0 elements means "use the default size". */
  if (num == 0)
    num = DEFAULT_SIZE;

  /* struct array_record already contains two slots, hence num - 2 extra. */
  ARRAY fresh = (ARRAY) memalloc ((num - 2) * sizeof (char *) +
                                  sizeof (struct array_record));
  if (!fresh) {
    cprintf ("error: Out of memory in array_new\n");
    exit (1);                    //?err_exit ();
  }

  array_limit (fresh) = num;
  array_count (fresh) = 0;
  /* Clear every slot. */
  for (int slot = 0; slot < num; slot++)
    array_value (fresh, slot) = (char *) 0;
  return (fresh);
}
/**********************************************************************
* array_push
*
* Add a new element onto the top of the array. If there is not room
* more room is made by "realloc"ing the array. This means that the
* new array location may change. All previous references to its old
* location may no longer be valid.
**********************************************************************/
ARRAY array_push(ARRAY array, void *value) {
  const bool is_full = array_count (array) == array_limit (array);
  if (is_full) {
    /* Double the capacity. struct array_record already contains two slots,
       so both sizes count (limit - 2) extra pointers. */
    array = (ARRAY) memrealloc (array,
                                (array_limit (array) * 2 - 2) *
                                sizeof (char *) +
                                sizeof (struct array_record),
                                (array_limit (array) - 2) *
                                sizeof (char *) +
                                sizeof (struct array_record));
    if (!array) {
      cprintf ("error: Out of memory in array_push\n");
      exit (1);                  //?err_exit ();
    }
    array_limit (array) *= 2;
  }

  /* Append: bump the count first, then write the new last slot. */
  array_count (array)++;
  array_top (array) = value;
  return (array);
}

View File

@ -1,166 +0,0 @@
/* -*-C-*-
********************************************************************************
*
* File: array.h (Formerly array.h)
* Description: Dynamic Array of String
* Author: Mark Seaman, SW Productivity
* Created: Fri Oct 16 14:37:00 1987
* Modified: Mon Sep 24 14:15:59 1990 (Mark Seaman) marks@hpgrlt
* Language: C
* Package: N/A
* Status: Reusable Software Component
*
* (c) Copyright 1987, Hewlett-Packard Company.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
*****************************************************************************
This file contains a set of general purpose dynamic array of string routines.
These routines can be used in a wide variety of ways to provide several
different popular data structures. A new "das" can be created by declaring
a variable of type 'DAS'
******************************************************************************/
#ifndef TESSARRAY_H
#define TESSARRAY_H
/*
----------------------------------------------------------------------
I n c l u d e s
----------------------------------------------------------------------
*/
#include <stdio.h>
/*
----------------------------------------------------------------------
T y p e s
----------------------------------------------------------------------
*/
/* Header of a growable array of pointers. array_new and array_push allocate
   (limit - 2) extra pointer slots directly after the struct, so base holds
   'limit' elements in total. */
typedef struct array_record
{
size_t limit; /* capacity: number of slots currently allocated */
size_t top; /* number of slots currently in use */
void *base[2]; /* first two slots of the element storage */
} *ARRAY;
typedef void (*voidProc) ();
typedef int (*intProc) ();
/*
----------------------------------------------------------------------
M a c r o s
----------------------------------------------------------------------
*/
#define DEFAULT_SIZE 2
/**********************************************************************
* array_count
*
* Return the value of the number of elements currently in the array.
**********************************************************************/
#define array_count(a) \
((a)->top)
/**********************************************************************
* array_free
*
* Free the memory allocated to this array.
**********************************************************************/
#define array_free \
memfree
/**********************************************************************
* array_index
*
* Check to make sure that the index value is valid. Return the
* value of the nth element currently in the array, or 0 if the index
* is not below the current element count. The index argument is
* parenthesised so that expression arguments expand correctly.
**********************************************************************/
#define array_index(a,i) \
(((i)<array_count(a)) ? (a)->base[(i)] : 0)
/**********************************************************************
* array_limit
*
* Return the maximum number of elements that could be currently held
* in this array without further expansion.
**********************************************************************/
#define array_limit(a) \
((a)->limit)
/**********************************************************************
* array_loop
*
* Iterate through each of the array elements. Each value can then be
* accessed by:
* array_index (a, x)
**********************************************************************/
#define array_loop(a,x) \
for (x=0; x < array_count (a); x++)
/**********************************************************************
* array_top
*
* Return the last element that was pushed on this array.
**********************************************************************/
#define array_top(a) \
((a)->base[array_count (a) - 1])
/**********************************************************************
* array_value
*
* Return the nth element of the array. Don't do range checking.
**********************************************************************/
#define array_value(a,i) \
((a)->base[i])
/*----------------------------------------------------------------------
F u n c t i o n s
----------------------------------------------------------------------*/
ARRAY array_insert(ARRAY array, int index, void *value);
ARRAY array_new(int num);
ARRAY array_push(ARRAY array, void *value);
/*
#if defined(__STDC__) || defined(__cplusplus)
# define _ARGS(s) s
#else
# define _ARGS(s) ()
#endif*/
/* array.c
ARRAY array_insert
_ARGS((ARRAY array,
int index,
char *value));
ARRAY array_new
_ARGS((int num));
ARRAY array_push
_ARGS((ARRAY array,
char *value));
#undef _ARGS
*/
#endif

View File

@ -7,8 +7,8 @@ AM_CPPFLAGS += -DTESS_EXPORTS \
endif
noinst_HEADERS = \
dawg.h dict.h matchdefs.h \
permute.h states.h stopper.h trie.h
dawg.h dawg_cache.h dict.h matchdefs.h \
stopper.h trie.h
if !USING_MULTIPLELIBS
noinst_LTLIBRARIES = libtesseract_dict.la
@ -25,7 +25,7 @@ endif
libtesseract_dict_la_SOURCES = \
context.cpp \
dawg.cpp dict.cpp hyphen.cpp \
permdawg.cpp permute.cpp states.cpp stopper.cpp trie.cpp
dawg.cpp dawg_cache.cpp dict.cpp hyphen.cpp \
permdawg.cpp stopper.cpp trie.cpp

View File

@ -38,6 +38,7 @@
#include "freelist.h"
#include "helpers.h"
#include "strngs.h"
#include "tesscallback.h"
#include "tprintf.h"
/*----------------------------------------------------------------------
@ -45,25 +46,29 @@
----------------------------------------------------------------------*/
namespace tesseract {
bool Dawg::word_in_dawg(const WERD_CHOICE &word) const {
if (word.length() == 0) return false;
bool Dawg::prefix_in_dawg(const WERD_CHOICE &word,
bool requires_complete) const {
if (word.length() == 0) return !requires_complete;
NODE_REF node = 0;
int end_index = word.length() - 1;
for (int i = 0; i <= end_index; i++) {
if (debug_level_ > 1) {
tprintf("word_in_dawg: exploring node " REFFORMAT ":\n", node);
print_node(node, MAX_NODE_EDGES_DISPLAY);
tprintf("\n");
for (int i = 0; i < end_index; i++) {
EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false);
if (edge == NO_EDGE) {
return false;
}
EDGE_REF edge = edge_char_of(node, word.unichar_id(i), i == end_index);
if (edge != NO_EDGE) {
node = next_node(edge);
if (node == 0) node = NO_EDGE;
} else {
if ((node = next_node(edge)) == 0) {
// This only happens if all words following this edge terminate --
// there are no larger words. See Trie::add_word_to_dawg()
return false;
}
}
return true;
// Now check the last character.
return edge_char_of(node, word.unichar_id(end_index), requires_complete) !=
NO_EDGE;
}
bool Dawg::word_in_dawg(const WERD_CHOICE &word) const {
return prefix_in_dawg(word, true);
}
int Dawg::check_for_words(const char *filename,
@ -99,23 +104,36 @@ int Dawg::check_for_words(const char *filename,
}
void Dawg::iterate_words(const UNICHARSET &unicharset,
TessCallback1<const char *> *cb) const {
TessCallback1<const WERD_CHOICE *> *cb) const {
WERD_CHOICE word(&unicharset);
iterate_words_rec(word, 0, cb);
}
void CallWithUTF8(TessCallback1<const char *> *cb, const WERD_CHOICE *wc) {
STRING s;
wc->string_and_lengths(&s, NULL);
cb->Run(s.string());
}
void Dawg::iterate_words(const UNICHARSET &unicharset,
TessCallback1<const char *> *cb) const {
TessCallback1<const WERD_CHOICE *> *shim =
NewPermanentTessCallback(CallWithUTF8, cb);
WERD_CHOICE word(&unicharset);
iterate_words_rec(word, 0, shim);
delete shim;
}
void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far,
NODE_REF to_explore,
TessCallback1<const char *> *cb) const {
TessCallback1<const WERD_CHOICE *> *cb) const {
NodeChildVector children;
this->unichar_ids_of(to_explore, &children);
this->unichar_ids_of(to_explore, &children, false);
for (int i = 0; i < children.size(); i++) {
WERD_CHOICE next_word(word_so_far);
next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0);
if (this->end_of_word(children[i].edge_ref)) {
STRING s;
next_word.string_and_lengths(&s, NULL);
cb->Run(s.string());
cb->Run(&next_word);
}
NODE_REF next = next_node(children[i].edge_ref);
if (next != 0) {
@ -132,7 +150,7 @@ bool Dawg::match_words(WERD_CHOICE *word, inT32 index,
if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) {
bool any_matched = false;
NodeChildVector vec;
this->unichar_ids_of(node, &vec);
this->unichar_ids_of(node, &vec, false);
for (int i = 0; i < vec.size(); ++i) {
word->set_unichar_id(vec[i].unichar_id, index);
if (match_words(word, index, node, wildcard))

View File

@ -91,10 +91,6 @@ enum DawgType {
#define NUM_FLAG_BITS 3
#define REFFORMAT "%lld"
// Set kBeginningDawgsType[i] to true if a Dawg of
// DawgType i can contain the beginning of a word.
static const bool kBeginningDawgsType[] = { 1, 1, 1, 1 };
static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = {
{ 0, 1, 1, 0 }, // for DAWG_TYPE_PUNCTUATION
{ 1, 0, 0, 0 }, // for DAWG_TYPE_WORD
@ -137,12 +133,21 @@ class Dawg {
/// Returns true if the given word is in the Dawg.
bool word_in_dawg(const WERD_CHOICE &word) const;
// Returns true if the given word prefix is not contraindicated by the dawg.
// If requires_complete is true, then the exact complete word must be present.
bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const;
/// Checks the Dawg for the words that are listed in the requested file.
/// Returns the number of words in the given file missing from the Dawg.
int check_for_words(const char *filename,
const UNICHARSET &unicharset,
bool enable_wildcard) const;
// For each word in the Dawg, call the given (permanent) callback with the
// text (UTF-8) version of the word.
void iterate_words(const UNICHARSET &unicharset,
TessCallback1<const WERD_CHOICE *> *cb) const;
// For each word in the Dawg, call the given (permanent) callback with the
// text (UTF-8) version of the word.
void iterate_words(const UNICHARSET &unicharset,
@ -156,7 +161,8 @@ class Dawg {
/// Fills the given NodeChildVector with all the unichar ids (and the
/// corresponding EDGE_REFs) for which there is an edge out of this node.
virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const = 0;
virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
bool word_end) const = 0;
/// Returns the next node visited by following the edge
/// indicated by the given EDGE_REF.
@ -277,7 +283,7 @@ class Dawg {
// Recursively iterate over all words in a dawg (see public iterate_words).
void iterate_words_rec(const WERD_CHOICE &word_so_far,
NODE_REF to_explore,
TessCallback1<const char *> *cb) const;
TessCallback1<const WERD_CHOICE *> *cb) const;
// Member Variables.
DawgType type_;
@ -299,22 +305,71 @@ class Dawg {
};
//
/// DawgInfo struct and DawgInfoVector class are used for
/// storing information about the current Dawg search state.
// DawgPosition keeps track of where we are in the primary dawg we're searching
// as well as where we may be in the "punctuation dawg" which may provide
// surrounding context.
//
struct DawgInfo {
DawgInfo() : dawg_index(-1), ref(NO_EDGE) {}
DawgInfo(int i, EDGE_REF r) : dawg_index(i), ref(r) {}
bool operator==(const DawgInfo &other) {
return (this->dawg_index == other.dawg_index && this->ref == other.ref);
// Example:
// punctuation dawg -- space is the "pattern character"
// " " // no punctuation
// "' '" // leading and trailing apostrophes
// " '" // trailing apostrophe
// word dawg:
// "cat"
// "cab"
// "cat's"
//
// DawgPosition(dawg_index, dawg_ref, punc_index, punc_ref, rtp)
//
// DawgPosition(-1, NO_EDGE, p, pe, false)
// We're in the punctuation dawg, no other dawg has been started.
// (1) If there's a pattern edge as a punc dawg child of us,
// for each punc-following dawg starting with ch, produce:
// Result: DawgPosition(k, w, p', false)
// (2) If there's a valid continuation in the punc dawg, produce:
// Result: DawgPosition(-k, NO_EDGE, p', false)
//
// DawgPosition(k, w, -1, NO_EDGE, false)
// We're in dawg k. Going back to punctuation dawg is not an option.
// Follow ch in dawg k.
//
// DawgPosition(k, w, p, pe, false)
// We're in dawg k. Continue in dawg k and/or go back to the punc dawg.
// If ending, check that the punctuation dawg is also ok to end here.
//
// DawgPosition(k, w, p, pe, true)
// We're back in the punctuation dawg. Continuing there is the only option.
struct DawgPosition {
// Default position: not in any word dawg and not in the punctuation dawg.
// punc_index is explicitly set to -1 (the "no punctuation dawg" value, the
// same sentinel used for dawg_index) so that operator== never compares an
// indeterminate member.
DawgPosition()
    : dawg_index(-1), dawg_ref(NO_EDGE),
      punc_index(-1), punc_ref(NO_EDGE),
      back_to_punc(false) {}
// Builds a position from explicit search state:
//   dawg_idx / dawgref   - index of, and edge within, the main dawg.
//   punc_idx / puncref   - index of, and edge within, the punctuation dawg.
//   backtopunc           - true once the search has returned to the
//                          punctuation dawg at the end of the word.
// Note that the int indices are stored in inT8 members.
DawgPosition(int dawg_idx, EDGE_REF dawgref,
             int punc_idx, EDGE_REF puncref,
             bool backtopunc)
    : dawg_index(dawg_idx),
      dawg_ref(dawgref),
      punc_index(punc_idx),
      punc_ref(puncref),
      back_to_punc(backtopunc) {}
int dawg_index;
EDGE_REF ref;
bool operator==(const DawgPosition &other) {
return dawg_index == other.dawg_index &&
dawg_ref == other.dawg_ref &&
punc_index == other.punc_index &&
punc_ref == other.punc_ref &&
back_to_punc == other.back_to_punc;
}
inT8 dawg_index;
EDGE_REF dawg_ref;
inT8 punc_index;
EDGE_REF punc_ref;
// Have we returned to the punc dawg at the end of the word?
bool back_to_punc;
};
class DawgInfoVector : public GenericVector<DawgInfo> {
class DawgPositionVector : public GenericVector<DawgPosition> {
public:
/// Overload destructor, since clear() does not delete data_[] any more.
~DawgInfoVector() {
~DawgPositionVector() {
if (size_reserved_ > 0) {
delete[] data_;
size_used_ = 0;
@ -327,15 +382,17 @@ class DawgInfoVector : public GenericVector<DawgInfo> {
/// Adds an entry for the given dawg_index with the given node to the vec.
/// Returns false if the same entry already exists in the vector,
/// true otherwise.
inline bool add_unique(const DawgInfo &new_info, bool debug,
inline bool add_unique(const DawgPosition &new_pos,
bool debug,
const char *debug_msg) {
for (int i = 0; i < size_used_; ++i) {
if (data_[i] == new_info) return false;
if (data_[i] == new_pos) return false;
}
push_back(new_info);
push_back(new_pos);
if (debug) {
tprintf("%s[%d, " REFFORMAT "]\n", debug_msg,
new_info.dawg_index, new_info.ref);
tprintf("%s[%d, " REFFORMAT "] [punc: " REFFORMAT "%s]\n",
debug_msg, new_pos.dawg_index, new_pos.dawg_ref,
new_pos.punc_ref, new_pos.back_to_punc ? " returned" : "");
}
return true;
}
@ -385,12 +442,15 @@ class SquishedDawg : public Dawg {
/// Fills the given NodeChildVector with all the unichar ids (and the
/// corresponding EDGE_REFs) for which there is an edge out of this node.
void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const {
void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
bool word_end) const {
EDGE_REF edge = node;
if (!edge_occupied(edge) || edge == NO_EDGE) return;
assert(forward_edge(edge)); // we don't expect any backward edges to
do { // be present when this funciton is called
vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
if (!word_end || end_of_word_from_edge_rec(edges_[edge])) {
vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
}
} while (!last_edge(edge++));
}

102
dict/dawg_cache.cpp Normal file
View File

@ -0,0 +1,102 @@
///////////////////////////////////////////////////////////////////////
// File: dawg_cache.cpp
// Description: A class that knows about loading and caching dawgs.
// Author: David Eger
// Created: Fri Jan 27 12:08:00 PST 2012
//
// (C) Copyright 2012, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "dawg_cache.h"
#include "dawg.h"
#include "object_cache.h"
#include "strngs.h"
#include "tessdatamanager.h"
namespace tesseract {
// Captures the arguments needed to load a single dawg so that the load can
// be performed later through the parameterless Load() member.
struct DawgLoader {
  DawgLoader(const STRING &lang,
             const char *data_file_name,
             TessdataType tessdata_dawg_type,
             int dawg_debug_level)
      : lang_(lang),
        data_file_name_(data_file_name),
        tessdata_dawg_type_(tessdata_dawg_type),
        dawg_debug_level_(dawg_debug_level) {}

  // Loads the dawg described by the members below.
  // Returns NULL on failure.
  Dawg *Load();

  STRING lang_;                      // Language passed on to the dawg.
  const char *data_file_name_;       // Pointer is stored as-is, not copied.
  TessdataType tessdata_dawg_type_;  // Which tessdata entry to load.
  int dawg_debug_level_;             // Debug level for loader and dawg.
};
// Returns the dawg of type tessdata_dawg_type stored in data_file_name,
// obtained through the dawgs_ cache.
// NOTE(review): the loader lives on this function's stack, so the callback
// handed to dawgs_.Get() is presumably only run during that call -- confirm
// against ObjectCache::Get.
Dawg *DawgCache::GetSquishedDawg(const STRING &lang,
                                 const char *data_file_name,
                                 TessdataType tessdata_dawg_type,
                                 int debug_level) {
  // Cache key: the data file name followed by the suffix of this dawg type.
  STRING cache_key = data_file_name;
  cache_key += kTessdataFileSuffixes[tessdata_dawg_type];

  DawgLoader dawg_loader(lang, data_file_name, tessdata_dawg_type,
                         debug_level);
  return dawgs_.Get(cache_key,
                    NewTessCallback(&dawg_loader, &DawgLoader::Load));
}
// Opens data_file_name_, seeks to the entry for tessdata_dawg_type_ and
// builds a SquishedDawg from it.
// Returns the newly allocated dawg, or NULL if the data file cannot be
// initialized, the requested entry cannot be located, or the tessdata type
// is not one of the dawg types handled below.
Dawg *DawgLoader::Load() {
  TessdataManager data_loader;
  if (!data_loader.Init(data_file_name_, dawg_debug_level_)) {
    return NULL;
  }
  if (!data_loader.SeekToStart(tessdata_dawg_type_)) {
    // Init() succeeded, so finish with the manager before bailing out, just
    // as the unsupported-type path below does.
    data_loader.End();
    return NULL;
  }
  FILE *fp = data_loader.GetDataFilePtr();

  // Map the tessdata entry type to the dawg/permuter types recorded in the
  // resulting SquishedDawg.
  DawgType dawg_type;
  PermuterType perm_type;
  switch (tessdata_dawg_type_) {
    case TESSDATA_PUNC_DAWG:
      dawg_type = DAWG_TYPE_PUNCTUATION;
      perm_type = PUNC_PERM;
      break;
    case TESSDATA_SYSTEM_DAWG:
      dawg_type = DAWG_TYPE_WORD;
      perm_type = SYSTEM_DAWG_PERM;
      break;
    case TESSDATA_NUMBER_DAWG:
      dawg_type = DAWG_TYPE_NUMBER;
      perm_type = NUMBER_PERM;
      break;
    case TESSDATA_BIGRAM_DAWG:
      dawg_type = DAWG_TYPE_WORD;  // doesn't actually matter
      perm_type = COMPOUND_PERM;   // doesn't actually matter
      break;
    case TESSDATA_UNAMBIG_DAWG:
      dawg_type = DAWG_TYPE_WORD;
      perm_type = SYSTEM_DAWG_PERM;
      break;
    case TESSDATA_FREQ_DAWG:
      dawg_type = DAWG_TYPE_WORD;
      perm_type = FREQ_DAWG_PERM;
      break;
    default:
      // Not a dawg entry type: nothing to load.
      data_loader.End();
      return NULL;
  }

  SquishedDawg *retval =
      new SquishedDawg(fp, dawg_type, lang_, perm_type, dawg_debug_level_);
  data_loader.End();
  return retval;
}
} // namespace tesseract

56
dict/dawg_cache.h Normal file
View File

@ -0,0 +1,56 @@
///////////////////////////////////////////////////////////////////////
// File: dawg_cache.h
// Description: A class that knows about loading and caching dawgs.
// Author: David Eger
// Created: Fri Jan 27 12:08:00 PST 2012
//
// (C) Copyright 2012, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_DICT_DAWG_CACHE_H_
#define TESSERACT_DICT_DAWG_CACHE_H_
#include "dawg.h"
#include "object_cache.h"
#include "strngs.h"
#include "tessdatamanager.h"
namespace tesseract {
// Hands out dawgs loaded from tessdata files, keeping them in an
// ObjectCache<Dawg> so they can be tracked and released through this class.
class DawgCache {
 public:
  // Returns the dawg of type tessdata_dawg_type found in data_file_name.
  // lang and debug_level are forwarded to the dawg loader.
  Dawg *GetSquishedDawg(const STRING &lang,
                        const char *data_file_name,
                        TessdataType tessdata_dawg_type,
                        int debug_level);

  // Releases one reference to a dawg managed by this cache; the dawg may be
  // deleted once its count reaches zero.
  // Returns false if the dawg is not known to this cache.
  bool FreeDawg(Dawg *dawg) { return dawgs_.Free(dawg); }

  // Deletes every cached dawg that is currently unused.
  void DeleteUnusedDawgs() { dawgs_.DeleteUnusedObjects(); }

 private:
  ObjectCache<Dawg> dawgs_;
};
} // namespace tesseract
#endif // TESSERACT_DICT_DAWG_CACHE_H_

Some files were not shown because too many files have changed in this diff Show More