mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-27 12:49:35 +08:00
Major refactor of beam search, elimination of dead code, misc bug fixes, updates to Makefile.am, Changelog etc.
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@878 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
2c909702c9
commit
4d514d5a60
12
ChangeLog
12
ChangeLog
@ -1,3 +1,15 @@
|
||||
2013-09-20 v3.03
|
||||
* Added Renderer to API to allow document-level processing and output
|
||||
of document formats, like hOCR, PDF.
|
||||
* Major refactor of word-level recognition, beam search, eliminating dead code.
|
||||
* Refactored classifier to make it easier to add new ones.
|
||||
* Generalized feature extractor to allow feature extraction from greyscale.
|
||||
* Improved sub/superscript treatment.
|
||||
* Improved baseline fit.
|
||||
* Added set_unicharset_properties to training tools.
|
||||
* Many bug fixes.
|
||||
|
||||
|
||||
2012-02-01 - v3.02
|
||||
* Moved ResultIterator/PageIterator to ccmain.
|
||||
* Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic.
|
||||
|
@ -9,7 +9,7 @@ if VISIBILITY
|
||||
AM_CPPFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden
|
||||
endif
|
||||
|
||||
include_HEADERS = apitypes.h baseapi.h capi.h
|
||||
include_HEADERS = apitypes.h baseapi.h capi.h renderer.h
|
||||
lib_LTLIBRARIES =
|
||||
|
||||
if !USING_MULTIPLELIBS
|
||||
@ -35,7 +35,7 @@ libtesseract_api_la_CPPFLAGS = $(AM_CPPFLAGS)
|
||||
if VISIBILITY
|
||||
libtesseract_api_la_CPPFLAGS += -DTESS_EXPORTS
|
||||
endif
|
||||
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp
|
||||
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp
|
||||
|
||||
lib_LTLIBRARIES += libtesseract.la
|
||||
libtesseract_la_LDFLAGS =
|
||||
|
14
api/capi.cpp
14
api/capi.cpp
@ -2,6 +2,8 @@
|
||||
# define TESS_CAPI_INCLUDE_BASEAPI
|
||||
#endif
|
||||
#include "capi.h"
|
||||
#include "genericvector.h"
|
||||
#include "strngs.h"
|
||||
|
||||
TESS_API const char* TESS_CALL TessVersion()
|
||||
{
|
||||
@ -382,10 +384,10 @@ TESS_API BOOL TESS_CALL TessBaseAPIDetectOS(TessBaseAPI* handle, OSResults* resu
|
||||
return handle->DetectOS(results) ? TRUE : FALSE;
|
||||
}
|
||||
|
||||
TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, const DENORM* denorm, INT_FEATURE_ARRAY int_features,
|
||||
TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, INT_FEATURE_STRUCT* int_features,
|
||||
int* num_features, int* FeatureOutlineIndex)
|
||||
{
|
||||
handle->GetFeaturesForBlob(blob, *denorm, int_features, num_features, FeatureOutlineIndex);
|
||||
handle->GetFeaturesForBlob(blob, int_features, num_features, FeatureOutlineIndex);
|
||||
}
|
||||
|
||||
TESS_API ROW* TESS_CALL TessFindRowForBox(BLOCK_LIST* blocks, int left, int top, int right, int bottom)
|
||||
@ -393,10 +395,10 @@ TESS_API ROW* TESS_CALL TessFindRowForBox(BLOCK_LIST* blocks, int left, int top,
|
||||
return TessBaseAPI::FindRowForBox(blocks, left, top, right, bottom);
|
||||
}
|
||||
|
||||
TESS_API void TESS_CALL TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob, const DENORM* denorm, int num_max_matches,
|
||||
TESS_API void TESS_CALL TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob, int num_max_matches,
|
||||
int* unichar_ids, float* ratings, int* num_matches_returned)
|
||||
{
|
||||
handle->RunAdaptiveClassifier(blob, *denorm, num_max_matches, unichar_ids, ratings, num_matches_returned);
|
||||
handle->RunAdaptiveClassifier(blob, num_max_matches, unichar_ids, ratings, num_matches_returned);
|
||||
}
|
||||
|
||||
TESS_API const char* TESS_CALL TessBaseAPIGetUnichar(TessBaseAPI* handle, int unichar_id)
|
||||
@ -424,9 +426,9 @@ TESS_API TBLOB* TESS_CALL TessMakeTBLOB(struct Pix *pix)
|
||||
return TessBaseAPI::MakeTBLOB(pix);
|
||||
}
|
||||
|
||||
TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB *tblob, ROW *row, BOOL numeric_mode, DENORM *denorm)
|
||||
TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB *tblob, ROW *row, BOOL numeric_mode)
|
||||
{
|
||||
TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode != FALSE, denorm);
|
||||
TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode != FALSE);
|
||||
}
|
||||
|
||||
TESS_API TessOcrEngineMode TESS_CALL TessBaseAPIOem(const TessBaseAPI* handle)
|
||||
|
@ -205,11 +205,11 @@ TESS_API void TESS_CALL TessBaseAPISetProbabilityInContextFunc(TessBaseAPI* han
|
||||
TESS_API void TESS_CALL TessBaseAPISetFillLatticeFunc(TessBaseAPI* handle, TessFillLatticeFunc f);
|
||||
TESS_API BOOL TESS_CALL TessBaseAPIDetectOS(TessBaseAPI* handle, OSResults* results);
|
||||
|
||||
TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, const DENORM* denorm, INT_FEATURE_ARRAY int_features,
|
||||
TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, INT_FEATURE_STRUCT* int_features,
|
||||
int* num_features, int* FeatureOutlineIndex);
|
||||
|
||||
TESS_API ROW* TESS_CALL TessFindRowForBox(BLOCK_LIST* blocks, int left, int top, int right, int bottom);
|
||||
TESS_API void TESS_CALL TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob, const DENORM* denorm, int num_max_matches,
|
||||
TESS_API void TESS_CALL TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob, int num_max_matches,
|
||||
int* unichar_ids, float* ratings, int* num_matches_returned);
|
||||
#endif
|
||||
|
||||
@ -226,7 +226,7 @@ TESS_API int TESS_CALL TessBaseAPINumDawgs(const TessBaseAPI* handle);
|
||||
TESS_API ROW* TESS_CALL TessMakeTessOCRRow(float baseline, float xheight, float descender, float ascender);
|
||||
TESS_API TBLOB*
|
||||
TESS_CALL TessMakeTBLOB(Pix *pix);
|
||||
TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB *tblob, ROW *row, BOOL numeric_mode, DENORM *denorm);
|
||||
TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB *tblob, ROW *row, BOOL numeric_mode);
|
||||
|
||||
TESS_API TessOcrEngineMode
|
||||
TESS_CALL TessBaseAPIOem(const TessBaseAPI* handle);
|
||||
|
@ -19,7 +19,7 @@ noinst_HEADERS = \
|
||||
equationdetect.h fixspace.h imgscale.h mutableiterator.h osdetect.h \
|
||||
output.h paragraphs.h paragraphs_internal.h paramsd.h pgedit.h \
|
||||
reject.h scaleimg.h tessbox.h tessedit.h tesseractclass.h \
|
||||
tesseract_cube_combiner.h tessvars.h tfacep.h tfacepp.h werdit.h
|
||||
tesseract_cube_combiner.h tessvars.h werdit.h
|
||||
|
||||
if !USING_MULTIPLELIBS
|
||||
noinst_LTLIBRARIES = libtesseract_main.la
|
||||
@ -46,7 +46,7 @@ libtesseract_main_la_SOURCES = \
|
||||
imgscale.cpp ltrresultiterator.cpp \
|
||||
osdetect.cpp output.cpp pageiterator.cpp pagesegmain.cpp \
|
||||
pagewalk.cpp paragraphs.cpp paramsd.cpp pgedit.cpp recogtraining.cpp \
|
||||
reject.cpp resultiterator.cpp scaleimg.cpp \
|
||||
reject.cpp resultiterator.cpp scaleimg.cpp superscript.cpp \
|
||||
tesseract_cube_combiner.cpp \
|
||||
tessbox.cpp tessedit.cpp tesseractclass.cpp tessvars.cpp \
|
||||
tfacepp.cpp thresholder.cpp \
|
||||
|
@ -114,27 +114,12 @@ BOOL8 Tesseract::word_adaptable( //should we adapt?
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
// if (flags.bit (CHECK_AMBIG_WERD) && test_ambig_word (word))
|
||||
if (flags.bit (CHECK_AMBIG_WERD) &&
|
||||
!getDict().NoDangerousAmbig(word->best_choice, NULL, false, NULL, NULL)) {
|
||||
word->best_choice->dangerous_ambig_found()) {
|
||||
if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
// Do not adapt to words that are composed from fragments if
|
||||
// tessedit_adapt_to_char_fragments is false.
|
||||
if (!tessedit_adapt_to_char_fragments) {
|
||||
const char *fragment_lengths = word->best_choice->fragment_lengths();
|
||||
if (fragment_lengths != NULL && *fragment_lengths != '\0') {
|
||||
for (int i = 0; i < word->best_choice->length(); ++i) {
|
||||
if (fragment_lengths[i] > 1) {
|
||||
if (tessedit_adaption_debug) tprintf("won't adapt to fragments\n");
|
||||
return false; // found a character composed from fragments
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (tessedit_adaption_debug) {
|
||||
tprintf("returning status %d\n", status);
|
||||
}
|
||||
|
@ -235,21 +235,6 @@ PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
|
||||
return page_res;
|
||||
}
|
||||
|
||||
// Helper to make a WERD_CHOICE from the BLOB_CHOICE_LIST_VECTOR using only
|
||||
// the top choices. Avoids problems with very long words.
|
||||
static void MakeWordChoice(const BLOB_CHOICE_LIST_VECTOR& char_choices,
|
||||
const UNICHARSET& unicharset,
|
||||
WERD_CHOICE* word_choice) {
|
||||
*word_choice = WERD_CHOICE(&unicharset); // clear the word choice.
|
||||
word_choice->make_bad();
|
||||
for (int i = 0; i < char_choices.size(); ++i) {
|
||||
BLOB_CHOICE_IT it(char_choices[i]);
|
||||
BLOB_CHOICE* bc = it.data();
|
||||
word_choice->append_unichar_id(bc->unichar_id(), 1,
|
||||
bc->rating(), bc->certainty());
|
||||
}
|
||||
}
|
||||
|
||||
// Tests the chopper by exhaustively running chop_one_blob.
|
||||
// The word_res will contain filled chopped_word, seam_array, denorm,
|
||||
// box_word and best_state for the maximally chopped word.
|
||||
@ -257,7 +242,8 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
|
||||
BLOCK* block, ROW* row,
|
||||
WERD_RES* word_res) {
|
||||
if (!word_res->SetupForTessRecognition(unicharset, this, BestPix(), false,
|
||||
this->textord_use_cjk_fp_model,
|
||||
textord_use_cjk_fp_model,
|
||||
poly_allow_detailed_fx,
|
||||
row, block)) {
|
||||
word_res->CloneChoppedToRebuild();
|
||||
return;
|
||||
@ -266,13 +252,10 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
|
||||
tprintf("Maximally chopping word at:");
|
||||
word_res->word->bounding_box().print();
|
||||
}
|
||||
blob_match_table.init_match_table();
|
||||
BLOB_CHOICE_LIST *match_result;
|
||||
BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
|
||||
ASSERT_HOST(word_res->chopped_word->blobs != NULL);
|
||||
GenericVector<BLOB_CHOICE*> blob_choices;
|
||||
ASSERT_HOST(!word_res->chopped_word->blobs.empty());
|
||||
float rating = static_cast<float>(MAX_INT8);
|
||||
for (TBLOB* blob = word_res->chopped_word->blobs; blob != NULL;
|
||||
blob = blob->next) {
|
||||
for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
|
||||
// The rating and certainty are not quite arbitrary. Since
|
||||
// select_blob_to_chop uses the worst certainty to choose, they all have
|
||||
// to be different, so starting with MAX_INT8, subtract 1/8 for each blob
|
||||
@ -281,32 +264,33 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
|
||||
// produced, however much chopping is required. The chops are thus only
|
||||
// limited by the ability of the chopper to find suitable chop points,
|
||||
// and not by the value of the certainties.
|
||||
match_result = fake_classify_blob(0, rating, -rating);
|
||||
modify_blob_choice(match_result, 0);
|
||||
ASSERT_HOST(!match_result->empty());
|
||||
*char_choices += match_result;
|
||||
BLOB_CHOICE* choice =
|
||||
new BLOB_CHOICE(0, rating, -rating, -1, -1, 0, 0, 0, 0, BCC_FAKE);
|
||||
blob_choices.push_back(choice);
|
||||
rating -= 0.125f;
|
||||
}
|
||||
inT32 blob_number;
|
||||
const double e = exp(1.0); // The base of natural logs.
|
||||
int blob_number;
|
||||
int right_chop_index = 0;
|
||||
if (!assume_fixed_pitch_char_segment) {
|
||||
// We only chop if the language is not fixed pitch like CJK.
|
||||
if (prioritize_division) {
|
||||
while (chop_one_blob2(boxes, word_res, &word_res->seam_array));
|
||||
} else {
|
||||
while (chop_one_blob(word_res->chopped_word, char_choices,
|
||||
&blob_number, &word_res->seam_array,
|
||||
&right_chop_index));
|
||||
SEAM* seam = NULL;
|
||||
while ((seam = chop_one_blob(boxes, blob_choices, word_res,
|
||||
&blob_number)) != NULL) {
|
||||
word_res->InsertSeam(blob_number, seam);
|
||||
BLOB_CHOICE* left_choice = blob_choices[blob_number];
|
||||
rating = left_choice->rating() / e;
|
||||
left_choice->set_rating(rating);
|
||||
left_choice->set_certainty(-rating);
|
||||
// combine confidence w/ serial #
|
||||
BLOB_CHOICE* right_choice = new BLOB_CHOICE(++right_chop_index,
|
||||
rating - 0.125f, -rating,
|
||||
-1, -1, 0, 0, 0, 0, BCC_FAKE);
|
||||
blob_choices.insert(right_choice, blob_number + 1);
|
||||
}
|
||||
}
|
||||
MakeWordChoice(*char_choices, unicharset, word_res->best_choice);
|
||||
MakeWordChoice(*char_choices, unicharset, word_res->raw_choice);
|
||||
word_res->CloneChoppedToRebuild();
|
||||
blob_match_table.end_match_table();
|
||||
if (char_choices != NULL) {
|
||||
char_choices->delete_data_pointers();
|
||||
delete char_choices;
|
||||
}
|
||||
word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
|
||||
}
|
||||
|
||||
// Helper to compute the dispute resolution metric.
|
||||
@ -558,7 +542,6 @@ bool Tesseract::ConvertStringToUnichars(const char* utf8,
|
||||
// substitutions ARE used.
|
||||
bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
|
||||
WERD_RES* word_res) {
|
||||
blob_match_table.init_match_table();
|
||||
// Classify all required combinations of blobs and save results in choices.
|
||||
int word_length = word_res->box_word->length();
|
||||
GenericVector<BLOB_CHOICE_LIST*>* choices =
|
||||
@ -566,8 +549,8 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
|
||||
for (int i = 0; i < word_length; ++i) {
|
||||
for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
|
||||
BLOB_CHOICE_LIST* match_result = classify_piece(
|
||||
word_res->chopped_word->blobs, word_res->denorm, word_res->seam_array,
|
||||
i, i + j - 1, word_res->blamer_bundle);
|
||||
word_res->seam_array, i, i + j - 1, "Applybox",
|
||||
word_res->chopped_word, word_res->blamer_bundle);
|
||||
if (applybox_debug > 2) {
|
||||
tprintf("%d+%d:", i, j);
|
||||
print_ratings_list("Segment:", match_result, unicharset);
|
||||
@ -583,7 +566,6 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
|
||||
float best_rating = 0.0f;
|
||||
SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
|
||||
&search_segmentation, &best_rating, &word_res->best_state);
|
||||
blob_match_table.end_match_table();
|
||||
for (int i = 0; i < word_length; ++i)
|
||||
choices[i].delete_data_pointers();
|
||||
delete [] choices;
|
||||
@ -591,9 +573,8 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
|
||||
// Build the original segmentation and if it is the same length as the
|
||||
// truth, assume it will do.
|
||||
int blob_count = 1;
|
||||
for (int s = 0; s < array_count(word_res->seam_array); ++s) {
|
||||
SEAM* seam =
|
||||
reinterpret_cast<SEAM*>(array_value(word_res->seam_array, s));
|
||||
for (int s = 0; s < word_res->seam_array.size(); ++s) {
|
||||
SEAM* seam = word_res->seam_array[s];
|
||||
if (seam->split1 == NULL) {
|
||||
word_res->best_state.push_back(blob_count);
|
||||
blob_count = 1;
|
||||
@ -707,21 +688,25 @@ void Tesseract::TidyUp(PAGE_RES* page_res) {
|
||||
WERD_RES* word_res;
|
||||
for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
|
||||
int ok_in_word = 0;
|
||||
BLOB_CHOICE_LIST_VECTOR char_choices;
|
||||
for (int i = word_res->correct_text.size() - 1; i >= 0; i--) {
|
||||
if (word_res->correct_text[i].length() > 0) {
|
||||
int blob_count = word_res->correct_text.size();
|
||||
WERD_CHOICE* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
|
||||
word_choice->set_permuter(TOP_CHOICE_PERM);
|
||||
for (int c = 0; c < blob_count; ++c) {
|
||||
if (word_res->correct_text[c].length() > 0) {
|
||||
++ok_in_word;
|
||||
}
|
||||
// Since we only need a fake word_res->best_choice, the actual
|
||||
// unichar_ids do not matter. Which is fortunate, since TidyUp()
|
||||
// can be called while training Tesseract, at the stage where
|
||||
// unicharset is not meaningful yet.
|
||||
char_choices += fake_classify_blob(INVALID_UNICHAR_ID, 1.0, -1.0);
|
||||
word_choice->append_unichar_id_space_allocated(
|
||||
INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
|
||||
}
|
||||
if (ok_in_word > 0) {
|
||||
ok_blob_count += ok_in_word;
|
||||
bad_blob_count += word_res->correct_text.size() - ok_in_word;
|
||||
MakeWordChoice(char_choices, unicharset, word_res->best_choice);
|
||||
word_res->LogNewRawChoice(word_choice);
|
||||
word_res->LogNewCookedChoice(1, false, word_choice);
|
||||
} else {
|
||||
++unlabelled_words;
|
||||
if (applybox_debug > 0) {
|
||||
@ -730,7 +715,6 @@ void Tesseract::TidyUp(PAGE_RES* page_res) {
|
||||
}
|
||||
pr_it.DeleteCurrentWord();
|
||||
}
|
||||
char_choices.delete_data_pointers();
|
||||
}
|
||||
pr_it.restart_page();
|
||||
for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
|
||||
@ -772,11 +756,13 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
|
||||
GenericVector<STRING> tokens;
|
||||
word_res->correct_text[i].split(' ', &tokens);
|
||||
UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
|
||||
choice->append_unichar_id_space_allocated(char_id, 1, 0.0f, 0.0f);
|
||||
choice->append_unichar_id_space_allocated(char_id,
|
||||
word_res->best_state[i],
|
||||
0.0f, 0.0f);
|
||||
}
|
||||
if (word_res->best_choice != NULL)
|
||||
delete word_res->best_choice;
|
||||
word_res->best_choice = choice;
|
||||
word_res->ClearWordChoices();
|
||||
word_res->LogNewRawChoice(choice);
|
||||
word_res->LogNewCookedChoice(1, false, choice);
|
||||
}
|
||||
}
|
||||
|
||||
@ -787,7 +773,7 @@ void Tesseract::ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res) {
|
||||
int word_count = 0;
|
||||
for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
|
||||
word_res = pr_it.forward()) {
|
||||
LearnWord(filename.string(), NULL, word_res);
|
||||
LearnWord(filename.string(), word_res);
|
||||
++word_count;
|
||||
}
|
||||
tprintf("Generated training data for %d words\n", word_count);
|
||||
|
@ -29,7 +29,6 @@
|
||||
#include "ocrclass.h"
|
||||
#include "werdit.h"
|
||||
#include "drawfx.h"
|
||||
#include "tfacep.h"
|
||||
#include "tessbox.h"
|
||||
#include "tessvars.h"
|
||||
#include "pgedit.h"
|
||||
@ -55,6 +54,9 @@
|
||||
const char* const kBackUpConfigFile = "tempconfigdata.config";
|
||||
// Multiple of x-height to make a repeated word have spaces in it.
|
||||
const double kRepcharGapThreshold = 0.5;
|
||||
// Min believable x-height for any text when refitting as a fraction of
|
||||
// original x-height
|
||||
const double kMinRefitXHeightFraction = 0.5;
|
||||
|
||||
|
||||
/**
|
||||
@ -293,9 +295,9 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
|
||||
// Update misadaption log (we only need to do it on pass 1, since
|
||||
// adaption only happens on this pass).
|
||||
if (page_res_it.word()->blamer_bundle != NULL &&
|
||||
page_res_it.word()->blamer_bundle->misadaption_debug.length() > 0) {
|
||||
page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
|
||||
page_res->misadaption_log.push_back(
|
||||
page_res_it.word()->blamer_bundle->misadaption_debug);
|
||||
page_res_it.word()->blamer_bundle->misadaption_debug());
|
||||
}
|
||||
|
||||
page_res_it.forward();
|
||||
@ -308,7 +310,8 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
|
||||
page_res_it.restart_page();
|
||||
word_index = 0;
|
||||
most_recently_used_ = this;
|
||||
while (!tessedit_test_adaption && page_res_it.word() != NULL) {
|
||||
while (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption &&
|
||||
page_res_it.word() != NULL) {
|
||||
set_global_loc_code(LOC_PASS2);
|
||||
word_index++;
|
||||
if (monitor != NULL) {
|
||||
@ -382,17 +385,6 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res,
|
||||
blamer_pass(page_res);
|
||||
}
|
||||
|
||||
if (!save_blob_choices) {
|
||||
// We aren't saving the blob choices so get rid of them now.
|
||||
// set_blob_choices() does a deep clear.
|
||||
page_res_it.restart_page();
|
||||
while (page_res_it.word() != NULL) {
|
||||
WERD_RES* word = page_res_it.word();
|
||||
word->best_choice->set_blob_choices(NULL);
|
||||
page_res_it.forward();
|
||||
}
|
||||
}
|
||||
|
||||
// Write results pass.
|
||||
set_global_loc_code(LOC_WRITE_RESULTS);
|
||||
// This is now redundant, but retained commented so show how to obtain
|
||||
@ -436,39 +428,21 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
|
||||
continue;
|
||||
}
|
||||
// Two words sharing the same language model, excellent!
|
||||
if (w->alt_choices.empty()) {
|
||||
if (tessedit_bigram_debug) {
|
||||
tprintf("Alt choices not set up for word choice: %s\n",
|
||||
w->best_choice->unichar_string().string());
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (w_prev->alt_choices.empty()) {
|
||||
if (tessedit_bigram_debug) {
|
||||
tprintf("Alt choices not set up for word choice: %s\n",
|
||||
w_prev->best_choice->unichar_string().string());
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// We saved alternate choices, excellent!
|
||||
GenericVector<WERD_CHOICE *> overrides_word1;
|
||||
GenericVector<GenericVector<int> *> overrides_word1_state;
|
||||
GenericVector<WERD_CHOICE *> overrides_word2;
|
||||
GenericVector<GenericVector<int> *> overrides_word2_state;
|
||||
|
||||
STRING orig_w1_str = w_prev->best_choice->unichar_string();
|
||||
STRING orig_w2_str = w->best_choice->unichar_string();
|
||||
WERD_CHOICE prev_best(w->uch_set);
|
||||
{
|
||||
int w1start, w1end;
|
||||
w_prev->WithoutFootnoteSpan(&w1start, &w1end);
|
||||
w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
|
||||
prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
|
||||
}
|
||||
WERD_CHOICE this_best(w->uch_set);
|
||||
{
|
||||
int w2start, w2end;
|
||||
w->WithoutFootnoteSpan(&w2start, &w2end);
|
||||
w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
|
||||
this_best = w->best_choice->shallow_copy(w2start, w2end);
|
||||
}
|
||||
|
||||
@ -484,37 +458,36 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
|
||||
orig_w1_str.string(), orig_w2_str.string());
|
||||
}
|
||||
if (tessedit_bigram_debug > 1) {
|
||||
if (w_prev->alt_choices.size() > 1) {
|
||||
print_word_alternates_list(w_prev->best_choice, &w_prev->alt_choices);
|
||||
if (!w_prev->best_choices.singleton()) {
|
||||
w_prev->PrintBestChoices();
|
||||
}
|
||||
if (w->alt_choices.size() > 1) {
|
||||
print_word_alternates_list(w->best_choice, &w->alt_choices);
|
||||
if (!w->best_choices.singleton()) {
|
||||
w->PrintBestChoices();
|
||||
}
|
||||
}
|
||||
float best_rating = 0.0;
|
||||
int best_idx = 0;
|
||||
for (int i = 0; i < w_prev->alt_choices.size(); i++) {
|
||||
WERD_CHOICE *p1 = w_prev->alt_choices.get(i);
|
||||
WERD_CHOICE_IT prev_it(&w_prev->best_choices);
|
||||
for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
|
||||
WERD_CHOICE *p1 = prev_it.data();
|
||||
WERD_CHOICE strip1(w->uch_set);
|
||||
{
|
||||
int p1start, p1end;
|
||||
w_prev->WithoutFootnoteSpan(*p1, w_prev->alt_states.get(i),
|
||||
&p1start, &p1end);
|
||||
p1->GetNonSuperscriptSpan(&p1start, &p1end);
|
||||
strip1 = p1->shallow_copy(p1start, p1end);
|
||||
}
|
||||
for (int j = 0; j < w->alt_choices.size(); j++) {
|
||||
WERD_CHOICE *p2 = w->alt_choices.get(j);
|
||||
WERD_CHOICE_IT w_it(&w->best_choices);
|
||||
for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
|
||||
WERD_CHOICE *p2 = w_it.data();
|
||||
WERD_CHOICE strip2(w->uch_set);
|
||||
{
|
||||
int p2start, p2end;
|
||||
w->WithoutFootnoteSpan(*p2, w->alt_states.get(j), &p2start, &p2end);
|
||||
p2->GetNonSuperscriptSpan(&p2start, &p2end);
|
||||
strip2 = p2->shallow_copy(p2start, p2end);
|
||||
}
|
||||
if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
|
||||
overrides_word1.push_back(p1);
|
||||
overrides_word1_state.push_back(&w_prev->alt_states.get(i));
|
||||
overrides_word2.push_back(p2);
|
||||
overrides_word2_state.push_back(&w->alt_states.get(j));
|
||||
if (overrides_word1.size() == 1 ||
|
||||
p1->rating() + p2->rating() < best_rating) {
|
||||
best_rating = p1->rating() + p2->rating();
|
||||
@ -538,12 +511,10 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
|
||||
STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
|
||||
STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
|
||||
if (new_w1_str != orig_w1_str) {
|
||||
w_prev->ReplaceBestChoice(*overrides_word1[best_idx],
|
||||
*overrides_word1_state[best_idx]);
|
||||
w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
|
||||
}
|
||||
if (new_w2_str != orig_w2_str) {
|
||||
w->ReplaceBestChoice(*overrides_word2[best_idx],
|
||||
*overrides_word2_state[best_idx]);
|
||||
w->ReplaceBestChoice(overrides_word2[best_idx]);
|
||||
}
|
||||
if (tessedit_bigram_debug > 0) {
|
||||
STRING choices_description;
|
||||
@ -684,34 +655,8 @@ void Tesseract::blamer_pass(PAGE_RES* page_res) {
|
||||
for (page_res_it.restart_page(); page_res_it.word() != NULL;
|
||||
page_res_it.forward()) {
|
||||
WERD_RES *word = page_res_it.word();
|
||||
if (word->blamer_bundle == NULL) {
|
||||
word->blamer_bundle = new BlamerBundle();
|
||||
word->blamer_bundle->incorrect_result_reason = IRR_PAGE_LAYOUT;
|
||||
word->blamer_bundle->debug = word->blamer_bundle->IncorrectReason();
|
||||
word->blamer_bundle->debug += " to blame";
|
||||
} else if (word->blamer_bundle->incorrect_result_reason ==
|
||||
IRR_NO_TRUTH) {
|
||||
word->blamer_bundle->SetBlame(IRR_NO_TRUTH, "Rejected truth",
|
||||
word->best_choice, wordrec_debug_blamer);
|
||||
} else {
|
||||
bool correct = ChoiceIsCorrect(*word->uch_set, word->best_choice,
|
||||
word->blamer_bundle->truth_text);
|
||||
IncorrectResultReason irr =
|
||||
word->blamer_bundle->incorrect_result_reason;
|
||||
if (irr == IRR_CORRECT && !correct) {
|
||||
STRING debug = "Choice is incorrect after recognition";
|
||||
word->blamer_bundle->SetBlame(IRR_UNKNOWN, debug,
|
||||
word->best_choice,
|
||||
wordrec_debug_blamer);
|
||||
} else if (irr != IRR_CORRECT && correct) {
|
||||
if (wordrec_debug_blamer) {
|
||||
tprintf("Corrected %s\n", word->blamer_bundle->debug.string());
|
||||
}
|
||||
word->blamer_bundle->incorrect_result_reason = IRR_CORRECT;
|
||||
word->blamer_bundle->debug = "";
|
||||
}
|
||||
}
|
||||
page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason]++;
|
||||
BlamerBundle::LastChanceBlame(wordrec_debug_blamer, word);
|
||||
page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason()]++;
|
||||
}
|
||||
tprintf("Blame reasons:\n");
|
||||
for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
|
||||
@ -730,7 +675,9 @@ void Tesseract::blamer_pass(PAGE_RES* page_res) {
|
||||
// Helper returns true if the new_word is better than the word, using a
|
||||
// simple test of better certainty AND rating (to reduce false positives
|
||||
// from cube) or a dictionary vs non-dictionary word.
|
||||
static bool NewWordBetter(const WERD_RES& word, const WERD_RES& new_word) {
|
||||
static bool NewWordBetter(const WERD_RES& word, const WERD_RES& new_word,
|
||||
double rating_ratio,
|
||||
double certainty_margin) {
|
||||
if (new_word.best_choice == NULL) {
|
||||
return false; // New one no good.
|
||||
}
|
||||
@ -742,7 +689,11 @@ static bool NewWordBetter(const WERD_RES& word, const WERD_RES& new_word) {
|
||||
return true; // New word has better confidence.
|
||||
}
|
||||
if (!Dict::valid_word_permuter(word.best_choice->permuter(), false) &&
|
||||
Dict::valid_word_permuter(new_word.best_choice->permuter(), false)) {
|
||||
Dict::valid_word_permuter(new_word.best_choice->permuter(), false) &&
|
||||
new_word.best_choice->rating() <
|
||||
word.best_choice->rating() * rating_ratio &&
|
||||
new_word.best_choice->certainty() >
|
||||
word.best_choice->certainty() - certainty_margin) {
|
||||
return true; // New word is from a dictionary.
|
||||
}
|
||||
return false; // New word is no better.
|
||||
@ -764,7 +715,9 @@ bool Tesseract::RetryWithLanguage(WERD_RES *word, BLOCK* block, ROW *row,
|
||||
// (to reduce false positives from cube) or a dictionary vs non-dictionary
|
||||
// word.
|
||||
(this->*recognizer)(block, row, &lang_word);
|
||||
bool new_is_better = NewWordBetter(*word, lang_word);
|
||||
bool new_is_better = NewWordBetter(*word, lang_word,
|
||||
classify_max_rating_ratio,
|
||||
classify_max_certainty_margin);
|
||||
if (classify_debug_level || cube_debug_level) {
|
||||
if (lang_word.best_choice == NULL) {
|
||||
tprintf("New result %s better:%s\n",
|
||||
@ -793,6 +746,7 @@ void Tesseract::classify_word_and_language(WordRecognizer recognizer,
|
||||
BLOCK* block,
|
||||
ROW *row,
|
||||
WERD_RES *word) {
|
||||
clock_t start_t = clock();
|
||||
if (classify_debug_level || cube_debug_level) {
|
||||
tprintf("Processing word with lang %s at:",
|
||||
most_recently_used_->lang.string());
|
||||
@ -811,12 +765,15 @@ void Tesseract::classify_word_and_language(WordRecognizer recognizer,
|
||||
if (!word->tess_failed && word->tess_accepted)
|
||||
result_type = "Accepted";
|
||||
if (classify_debug_level || cube_debug_level) {
|
||||
tprintf("%s result: %s r=%g, c=%g, accepted=%d, adaptable=%d\n",
|
||||
tprintf("%s result: %s r=%.4g, c=%.4g, accepted=%d, adaptable=%d"
|
||||
" xht=[%g,%g]\n",
|
||||
result_type,
|
||||
word->best_choice->unichar_string().string(),
|
||||
word->best_choice->rating(),
|
||||
word->best_choice->certainty(),
|
||||
word->tess_accepted, word->tess_would_adapt);
|
||||
word->tess_accepted, word->tess_would_adapt,
|
||||
word->best_choice->min_x_height(),
|
||||
word->best_choice->max_x_height());
|
||||
}
|
||||
if (word->tess_failed || !word->tess_accepted) {
|
||||
// Try all the other languages to see if they are any better.
|
||||
@ -846,6 +803,12 @@ void Tesseract::classify_word_and_language(WordRecognizer recognizer,
|
||||
}
|
||||
}
|
||||
}
|
||||
clock_t ocr_t = clock();
|
||||
if (tessedit_timing_debug) {
|
||||
tprintf("%s (ocr took %.2f sec)\n",
|
||||
word->best_choice->unichar_string().string(),
|
||||
static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -860,92 +823,25 @@ void Tesseract::classify_word_pass1(BLOCK* block, ROW *row, WERD_RES *word) {
|
||||
cube_word_pass1(block, row, word);
|
||||
return;
|
||||
}
|
||||
match_word_pass_n(1, word, row, block);
|
||||
if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
|
||||
word->tess_would_adapt = AdaptableWord(word);
|
||||
bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
|
||||
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST();
|
||||
BOOL8 adapt_ok;
|
||||
const char *rejmap;
|
||||
inT16 index;
|
||||
STRING mapstr = "";
|
||||
|
||||
check_debug_pt(word, 0);
|
||||
if (word->SetupForTessRecognition(unicharset, this, BestPix(),
|
||||
classify_bln_numeric_mode,
|
||||
this->textord_use_cjk_fp_model,
|
||||
row, block))
|
||||
tess_segment_pass1(word, blob_choices);
|
||||
if (!word->tess_failed) {
|
||||
/*
|
||||
The adaption step used to be here. It has been moved to after
|
||||
make_reject_map so that we know whether the word will be accepted in the
|
||||
first pass or not. This move will PREVENT adaption to words containing
|
||||
double quotes because the word will not be identical to what tess thinks
|
||||
its best choice is. (See CurrentBestChoiceIs in
|
||||
stopper.cpp which is used by AdaptableWord in
|
||||
adaptmatch.cpp)
|
||||
*/
|
||||
|
||||
if (!word->word->flag(W_REP_CHAR)) {
|
||||
// TODO(daria) delete these hacks when replaced by more generic code.
|
||||
// Convert '' (double single) to " (single double).
|
||||
word->fix_quotes(blob_choices);
|
||||
if (tessedit_fix_hyphens) // turn -- to -
|
||||
word->fix_hyphens(blob_choices);
|
||||
|
||||
word->tess_accepted = tess_acceptable_word(word->best_choice,
|
||||
word->raw_choice);
|
||||
|
||||
word->tess_would_adapt = word->best_choice && word->raw_choice &&
|
||||
AdaptableWord(word->rebuild_word,
|
||||
*word->best_choice,
|
||||
*word->raw_choice);
|
||||
// Also sets word->done flag
|
||||
make_reject_map(word, blob_choices, row, 1);
|
||||
|
||||
adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
|
||||
|
||||
if (adapt_ok || tessedit_tess_adapt_to_rejmap) {
|
||||
if (!tessedit_tess_adapt_to_rejmap) {
|
||||
rejmap = NULL;
|
||||
} else {
|
||||
ASSERT_HOST(word->reject_map.length() ==
|
||||
word->best_choice->length());
|
||||
|
||||
for (index = 0; index < word->reject_map.length(); index++) {
|
||||
if (adapt_ok || word->reject_map[index].accepted())
|
||||
mapstr += '1';
|
||||
else
|
||||
mapstr += '0';
|
||||
}
|
||||
rejmap = mapstr.string();
|
||||
}
|
||||
// Send word to adaptive classifier for training.
|
||||
word->BestChoiceToCorrectText();
|
||||
set_word_fonts(word, blob_choices);
|
||||
LearnWord(NULL, rejmap, word);
|
||||
// Mark misadaptions if running blamer.
|
||||
if (word->blamer_bundle != NULL &&
|
||||
word->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH &&
|
||||
!ChoiceIsCorrect(*word->uch_set, word->best_choice,
|
||||
word->blamer_bundle->truth_text)) {
|
||||
word->blamer_bundle->misadaption_debug ="misadapt to word (";
|
||||
word->blamer_bundle->misadaption_debug +=
|
||||
word->best_choice->permuter_name();
|
||||
word->blamer_bundle->misadaption_debug += "): ";
|
||||
word->blamer_bundle->FillDebugString(
|
||||
"", word->best_choice, &(word->blamer_bundle->misadaption_debug));
|
||||
if (wordrec_debug_blamer) {
|
||||
tprintf("%s\n", word->blamer_bundle->misadaption_debug.string());
|
||||
}
|
||||
}
|
||||
if (adapt_ok) {
|
||||
// Send word to adaptive classifier for training.
|
||||
word->BestChoiceToCorrectText();
|
||||
LearnWord(NULL, word);
|
||||
// Mark misadaptions if running blamer.
|
||||
if (word->blamer_bundle != NULL) {
|
||||
word->blamer_bundle->SetMisAdaptionDebug(word->best_choice,
|
||||
wordrec_debug_blamer);
|
||||
}
|
||||
|
||||
if (tessedit_enable_doc_dict)
|
||||
tess_add_doc_word(word->best_choice);
|
||||
}
|
||||
}
|
||||
|
||||
// Save best choices in the WERD_CHOICE if needed
|
||||
word->best_choice->set_blob_choices(blob_choices);
|
||||
if (tessedit_enable_doc_dict && !word->IsAmbiguous())
|
||||
tess_add_doc_word(word->best_choice);
|
||||
}
|
||||
}
|
||||
|
||||
// Helper to report the result of the xheight fix.
|
||||
@ -976,7 +872,7 @@ bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
|
||||
if (original_misfits == 0)
|
||||
return false;
|
||||
float new_x_ht = ComputeCompatibleXheight(word);
|
||||
if (new_x_ht > 0.0f) {
|
||||
if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
|
||||
WERD_RES new_x_ht_word(word->word);
|
||||
if (word->blamer_bundle != NULL) {
|
||||
new_x_ht_word.blamer_bundle = new BlamerBundle();
|
||||
@ -984,7 +880,7 @@ bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
|
||||
}
|
||||
new_x_ht_word.x_height = new_x_ht;
|
||||
new_x_ht_word.caps_height = 0.0;
|
||||
match_word_pass2(&new_x_ht_word, row, block);
|
||||
match_word_pass_n(2, &new_x_ht_word, row, block);
|
||||
if (!new_x_ht_word.tess_failed) {
|
||||
int new_misfits = CountMisfitTops(&new_x_ht_word);
|
||||
if (debug_x_ht_level >= 1) {
|
||||
@ -1026,26 +922,24 @@ void Tesseract::classify_word_pass2(BLOCK* block, ROW *row, WERD_RES *word) {
|
||||
tessedit_ocr_engine_mode != OEM_TESSERACT_CUBE_COMBINED)
|
||||
return;
|
||||
|
||||
bool done_this_pass = false;
|
||||
set_global_subloc_code(SUBLOC_NORM);
|
||||
check_debug_pt(word, 30);
|
||||
if (!word->done || tessedit_training_tess) {
|
||||
word->caps_height = 0.0;
|
||||
if (word->x_height == 0.0f)
|
||||
word->x_height = row->x_height();
|
||||
match_word_pass2(word, row, block);
|
||||
done_this_pass = TRUE;
|
||||
match_word_pass_n(2, word, row, block);
|
||||
check_debug_pt(word, 40);
|
||||
}
|
||||
|
||||
SubAndSuperscriptFix(word);
|
||||
|
||||
if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
|
||||
bool accept_new_xht = false;
|
||||
if (unicharset.top_bottom_useful() && unicharset.script_has_xheight()) {
|
||||
if (unicharset.top_bottom_useful() && unicharset.script_has_xheight() &&
|
||||
block->classify_rotation().y() == 0.0f) {
|
||||
// Use the tops and bottoms since they are available.
|
||||
accept_new_xht = TrainedXheightFix(word, block, row);
|
||||
TrainedXheightFix(word, block, row);
|
||||
}
|
||||
if (accept_new_xht)
|
||||
done_this_pass = true;
|
||||
// Test for small caps. Word capheight must be close to block xheight,
|
||||
// and word must contain no lower case letters, and at least one upper case.
|
||||
double small_cap_xheight = block->x_height() * kXHeightCapRatio;
|
||||
@ -1092,60 +986,38 @@ void Tesseract::classify_word_pass2(BLOCK* block, ROW *row, WERD_RES *word) {
|
||||
* Baseline normalize the word and pass it to Tess.
|
||||
*/
|
||||
|
||||
void Tesseract::match_word_pass2(WERD_RES *word, //word to do
|
||||
ROW *row,
|
||||
BLOCK* block) {
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST();
|
||||
|
||||
void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word,
|
||||
ROW *row, BLOCK* block) {
|
||||
if (word->SetupForTessRecognition(unicharset, this, BestPix(),
|
||||
classify_bln_numeric_mode,
|
||||
this->textord_use_cjk_fp_model,
|
||||
textord_use_cjk_fp_model,
|
||||
poly_allow_detailed_fx,
|
||||
row, block))
|
||||
tess_segment_pass2(word, blob_choices);
|
||||
tess_segment_pass_n(pass_n, word);
|
||||
|
||||
if (!word->tess_failed) {
|
||||
if (!word->word->flag (W_REP_CHAR)) {
|
||||
word->fix_quotes(blob_choices);
|
||||
word->fix_quotes();
|
||||
if (tessedit_fix_hyphens)
|
||||
word->fix_hyphens(blob_choices);
|
||||
word->fix_hyphens();
|
||||
/* Dont trust fix_quotes! - though I think I've fixed the bug */
|
||||
if (word->best_choice->length() != word->box_word->length() ||
|
||||
word->best_choice->length() != blob_choices->length()) {
|
||||
if (word->best_choice->length() != word->box_word->length()) {
|
||||
tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
|
||||
" #Blobs=%d; #Choices=%d\n",
|
||||
" #Blobs=%d\n",
|
||||
word->best_choice->debug_string().string(),
|
||||
word->best_choice->length(),
|
||||
word->box_word->length(), blob_choices->length());
|
||||
word->box_word->length());
|
||||
|
||||
}
|
||||
word->tess_accepted = tess_acceptable_word(word->best_choice,
|
||||
word->raw_choice);
|
||||
word->tess_accepted = tess_acceptable_word(word);
|
||||
|
||||
make_reject_map (word, blob_choices, row, 2);
|
||||
// Also sets word->done flag
|
||||
make_reject_map(word, row, pass_n);
|
||||
}
|
||||
}
|
||||
set_word_fonts(word);
|
||||
|
||||
// Save best choices in the WERD_CHOICE if needed
|
||||
word->best_choice->set_blob_choices(blob_choices);
|
||||
set_word_fonts(word, blob_choices);
|
||||
|
||||
assert (word->raw_choice != NULL);
|
||||
}
|
||||
|
||||
// Helper to find the BLOB_CHOICE in the bc_list that matches the given
|
||||
// unichar_id, or NULL if there is no match.
|
||||
static BLOB_CHOICE* FindMatchingChoice(UNICHAR_ID char_id,
|
||||
BLOB_CHOICE_LIST* bc_list) {
|
||||
// Find the corresponding best BLOB_CHOICE.
|
||||
BLOB_CHOICE_IT choice_it(bc_list);
|
||||
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
|
||||
choice_it.forward()) {
|
||||
BLOB_CHOICE* choice = choice_it.data();
|
||||
if (choice->unichar_id() == char_id) {
|
||||
return choice;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
ASSERT_HOST(word->raw_choice != NULL);
|
||||
}
|
||||
|
||||
// Helper to return the best rated BLOB_CHOICE in the whole word that matches
|
||||
@ -1154,9 +1026,9 @@ static BLOB_CHOICE* FindBestMatchingChoice(UNICHAR_ID char_id,
|
||||
WERD_RES* word_res) {
|
||||
// Find the corresponding best BLOB_CHOICE from any position in the word_res.
|
||||
BLOB_CHOICE* best_choice = NULL;
|
||||
BLOB_CHOICE_LIST_C_IT bc_it(word_res->best_choice->blob_choices());
|
||||
for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
|
||||
BLOB_CHOICE* choice = FindMatchingChoice(char_id, bc_it.data());
|
||||
for (int i = 0; i < word_res->best_choice->length(); ++i) {
|
||||
BLOB_CHOICE* choice = FindMatchingChoice(char_id,
|
||||
word_res->GetBlobChoices(i));
|
||||
if (choice != NULL) {
|
||||
if (best_choice == NULL || choice->rating() < best_choice->rating())
|
||||
best_choice = choice;
|
||||
@ -1171,12 +1043,11 @@ static BLOB_CHOICE* FindBestMatchingChoice(UNICHAR_ID char_id,
|
||||
static void CorrectRepcharChoices(BLOB_CHOICE* blob_choice,
|
||||
WERD_RES* word_res) {
|
||||
WERD_CHOICE* word = word_res->best_choice;
|
||||
BLOB_CHOICE_LIST_C_IT bc_it(word->blob_choices());
|
||||
for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
|
||||
for (int i = 0; i < word_res->best_choice->length(); ++i) {
|
||||
BLOB_CHOICE* choice = FindMatchingChoice(blob_choice->unichar_id(),
|
||||
bc_it.data());
|
||||
word_res->GetBlobChoices(i));
|
||||
if (choice == NULL) {
|
||||
BLOB_CHOICE_IT choice_it(bc_it.data());
|
||||
BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i));
|
||||
choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice));
|
||||
}
|
||||
}
|
||||
@ -1267,7 +1138,8 @@ void Tesseract::ExplodeRepeatedWord(BLOB_CHOICE* best_choice,
|
||||
// Setup the single char WERD_RES
|
||||
if (rep_word->SetupForTessRecognition(*word_res->uch_set, this, BestPix(),
|
||||
false,
|
||||
this->textord_use_cjk_fp_model,
|
||||
textord_use_cjk_fp_model,
|
||||
poly_allow_detailed_fx,
|
||||
page_res_it->row()->row,
|
||||
page_res_it->block()->block)) {
|
||||
rep_word->CloneChoppedToRebuild();
|
||||
@ -1494,16 +1366,14 @@ static void find_modal_font( //good chars in word
|
||||
*
|
||||
* Get the fonts for the word.
|
||||
*/
|
||||
void Tesseract::set_word_fonts(WERD_RES *word,
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
||||
if (blob_choices == NULL) return;
|
||||
void Tesseract::set_word_fonts(WERD_RES *word) {
|
||||
// Don't try to set the word fonts for a cube word, as the configs
|
||||
// will be meaningless.
|
||||
if (word->chopped_word == NULL) return;
|
||||
ASSERT_HOST(word->best_choice != NULL);
|
||||
|
||||
inT32 index; // char id index
|
||||
// character iterator
|
||||
BLOB_CHOICE_LIST_C_IT char_it = blob_choices;
|
||||
BLOB_CHOICE_IT choice_it; // choice iterator
|
||||
int fontinfo_size = get_fontinfo_table().size();
|
||||
int fontset_size = get_fontset_table().size();
|
||||
@ -1516,10 +1386,9 @@ void Tesseract::set_word_fonts(WERD_RES *word,
|
||||
word->best_choice_fontinfo_ids.clear();
|
||||
}
|
||||
// Compute the modal font for the word
|
||||
for (char_it.mark_cycle_pt(), index = 0;
|
||||
!char_it.cycled_list(); ++index, char_it.forward()) {
|
||||
for (index = 0; index < word->best_choice->length(); ++index) {
|
||||
UNICHAR_ID word_ch_id = word->best_choice->unichar_id(index);
|
||||
choice_it.set_to_list(char_it.data());
|
||||
choice_it.set_to_list(word->GetBlobChoices(index));
|
||||
if (tessedit_debug_fonts) {
|
||||
tprintf("Examining fonts in %s\n",
|
||||
word->best_choice->debug_string().string());
|
||||
|
@ -144,54 +144,6 @@ bool Tesseract::create_cube_box_word(Boxa *char_boxes,
|
||||
return true;
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
* create_werd_choice
|
||||
*
|
||||
**********************************************************************/
|
||||
static WERD_CHOICE *create_werd_choice(
|
||||
CharSamp** char_samples,
|
||||
int num_chars,
|
||||
const char* str,
|
||||
float certainty,
|
||||
const UNICHARSET &unicharset,
|
||||
CharSet* cube_char_set
|
||||
) {
|
||||
// Insert unichar ids into WERD_CHOICE
|
||||
WERD_CHOICE *werd_choice = new WERD_CHOICE(&unicharset, num_chars);
|
||||
// within a word, cube recognizes the word in reading order.
|
||||
werd_choice->set_unichars_in_script_order(true);
|
||||
ASSERT_HOST(werd_choice != NULL);
|
||||
UNICHAR_ID uch_id;
|
||||
for (int i = 0; i < num_chars; ++i) {
|
||||
uch_id = cube_char_set->UnicharID(char_samples[i]->StrLabel());
|
||||
if (uch_id != INVALID_UNICHAR_ID)
|
||||
werd_choice->append_unichar_id_space_allocated(
|
||||
uch_id, 1, 0.0, certainty);
|
||||
}
|
||||
|
||||
BLOB_CHOICE *blob_choice;
|
||||
BLOB_CHOICE_LIST *choices_list;
|
||||
BLOB_CHOICE_IT choices_list_it;
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices = new BLOB_CHOICE_LIST_CLIST();
|
||||
BLOB_CHOICE_LIST_C_IT blob_choices_it;
|
||||
blob_choices_it.set_to_list(blob_choices);
|
||||
|
||||
for (int i = 0; i < werd_choice->length(); ++i) {
|
||||
// Create new BLOB_CHOICE_LIST for this unichar
|
||||
choices_list = new BLOB_CHOICE_LIST();
|
||||
choices_list_it.set_to_list(choices_list);
|
||||
// Add a single BLOB_CHOICE to the list
|
||||
blob_choice = new BLOB_CHOICE(werd_choice->unichar_id(i),
|
||||
0.0, certainty, -1, -1, 0, 0, 0, false);
|
||||
choices_list_it.add_after_then_move(blob_choice);
|
||||
// Add list to the clist
|
||||
blob_choices_it.add_to_end(choices_list);
|
||||
}
|
||||
werd_choice->set_certainty(certainty);
|
||||
werd_choice->set_blob_choices(blob_choices);
|
||||
return werd_choice;
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
* init_cube_objects
|
||||
*
|
||||
@ -419,29 +371,32 @@ bool Tesseract::cube_recognize(CubeObject *cube_obj, BLOCK* block,
|
||||
return false;
|
||||
}
|
||||
|
||||
// Create cube's best choice.
|
||||
WERD_CHOICE* cube_werd_choice = create_werd_choice(
|
||||
char_samples, num_chars, cube_best_str.c_str(), cube_certainty,
|
||||
unicharset, cube_cntxt_->CharacterSet());
|
||||
delete []char_samples;
|
||||
// Fill tesseract result's fields with cube results
|
||||
fill_werd_res(cube_box_word, cube_best_str.c_str(), word);
|
||||
|
||||
if (!cube_werd_choice) {
|
||||
if (cube_debug_level > 0) {
|
||||
tprintf("Cube WARNING (Tesseract::cube_recognize): Could not "
|
||||
"create cube WERD_CHOICE\n");
|
||||
}
|
||||
word->SetupFake(unicharset);
|
||||
return false;
|
||||
// Create cube's best choice.
|
||||
BLOB_CHOICE** choices = new BLOB_CHOICE*[num_chars];
|
||||
for (int i = 0; i < num_chars; ++i) {
|
||||
UNICHAR_ID uch_id =
|
||||
cube_cntxt_->CharacterSet()->UnicharID(char_samples[i]->StrLabel());
|
||||
choices[i] = new BLOB_CHOICE(uch_id, 0.0, cube_certainty, -1, -1,
|
||||
0, 0, 0, 0, BCC_STATIC_CLASSIFIER);
|
||||
}
|
||||
word->FakeClassifyWord(num_chars, choices);
|
||||
// within a word, cube recognizes the word in reading order.
|
||||
word->best_choice->set_unichars_in_script_order(true);
|
||||
delete [] choices;
|
||||
delete [] char_samples;
|
||||
|
||||
// Some sanity checks
|
||||
ASSERT_HOST(word->best_choice->length() == word->reject_map.length());
|
||||
|
||||
if (cube_debug_level || classify_debug_level) {
|
||||
tprintf("Cube result: %s r=%g, c=%g\n",
|
||||
cube_werd_choice->unichar_string().string(),
|
||||
cube_werd_choice->rating(),
|
||||
cube_werd_choice->certainty());
|
||||
word->best_choice->unichar_string().string(),
|
||||
word->best_choice->rating(),
|
||||
word->best_choice->certainty());
|
||||
}
|
||||
|
||||
// Fill tesseract result's fields with cube results
|
||||
fill_werd_res(cube_box_word, cube_werd_choice, cube_best_str.c_str(), word);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -452,13 +407,8 @@ bool Tesseract::cube_recognize(CubeObject *cube_obj, BLOCK* block,
|
||||
*
|
||||
**********************************************************************/
|
||||
void Tesseract::fill_werd_res(const BoxWord& cube_box_word,
|
||||
WERD_CHOICE* cube_werd_choice,
|
||||
const char* cube_best_str,
|
||||
WERD_RES* tess_werd_res) {
|
||||
// Replace tesseract results's best choice with cube's
|
||||
tess_werd_res->best_choice = cube_werd_choice;
|
||||
tess_werd_res->raw_choice = new WERD_CHOICE(*cube_werd_choice);
|
||||
|
||||
delete tess_werd_res->box_word;
|
||||
tess_werd_res->box_word = new BoxWord(cube_box_word);
|
||||
tess_werd_res->box_word->ClipToOriginalWord(tess_werd_res->denorm.block(),
|
||||
@ -466,23 +416,13 @@ void Tesseract::fill_werd_res(const BoxWord& cube_box_word,
|
||||
// Fill text and remaining fields
|
||||
tess_werd_res->word->set_text(cube_best_str);
|
||||
tess_werd_res->tess_failed = FALSE;
|
||||
tess_werd_res->tess_accepted =
|
||||
tess_acceptable_word(tess_werd_res->best_choice,
|
||||
tess_werd_res->raw_choice);
|
||||
tess_werd_res->tess_accepted = tess_acceptable_word(tess_werd_res);
|
||||
// There is no output word, so we can' call AdaptableWord, but then I don't
|
||||
// think we need to. Fudge the result with accepted.
|
||||
tess_werd_res->tess_would_adapt = tess_werd_res->tess_accepted;
|
||||
|
||||
// Initialize the reject_map and set it to done, i.e., ignore all of
|
||||
// tesseract's tests for rejection
|
||||
tess_werd_res->reject_map.initialise(cube_werd_choice->length());
|
||||
// Set word to done, i.e., ignore all of tesseract's tests for rejection
|
||||
tess_werd_res->done = tess_werd_res->tess_accepted;
|
||||
|
||||
// Some sanity checks
|
||||
ASSERT_HOST(tess_werd_res->best_choice->length() ==
|
||||
tess_werd_res->best_choice->blob_choices()->length());
|
||||
ASSERT_HOST(tess_werd_res->best_choice->length() ==
|
||||
tess_werd_res->reject_map.length());
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
||||
|
@ -23,7 +23,6 @@
|
||||
|
||||
#include <ctype.h>
|
||||
#include "docqual.h"
|
||||
#include "tfacep.h"
|
||||
#include "reject.h"
|
||||
#include "tesscallback.h"
|
||||
#include "tessvars.h"
|
||||
@ -66,7 +65,7 @@ struct DocQualCallbacks {
|
||||
*************************************************************************/
|
||||
inT16 Tesseract::word_blob_quality(WERD_RES *word, ROW *row) {
|
||||
if (word->bln_boxes == NULL ||
|
||||
word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
|
||||
word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
|
||||
return 0;
|
||||
|
||||
DocQualCallbacks cb(word);
|
||||
@ -81,8 +80,8 @@ inT16 Tesseract::word_outline_errs(WERD_RES *word) {
|
||||
inT16 err_count = 0;
|
||||
|
||||
if (word->rebuild_word != NULL) {
|
||||
TBLOB* blob = word->rebuild_word->blobs;
|
||||
for (; blob != NULL; blob = blob->next) {
|
||||
for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
|
||||
TBLOB* blob = word->rebuild_word->blobs[b];
|
||||
err_count += count_outline_errs(word->best_choice->unichar_string()[i],
|
||||
blob->NumOutlines());
|
||||
i++;
|
||||
@ -101,7 +100,7 @@ void Tesseract::word_char_quality(WERD_RES *word,
|
||||
inT16 *match_count,
|
||||
inT16 *accepted_match_count) {
|
||||
if (word->bln_boxes == NULL ||
|
||||
word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
|
||||
word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
|
||||
return;
|
||||
|
||||
DocQualCallbacks cb(word);
|
||||
@ -118,7 +117,7 @@ void Tesseract::word_char_quality(WERD_RES *word,
|
||||
*************************************************************************/
|
||||
void Tesseract::unrej_good_chs(WERD_RES *word, ROW *row) {
|
||||
if (word->bln_boxes == NULL ||
|
||||
word->rebuild_word == NULL || word->rebuild_word->blobs == NULL)
|
||||
word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
|
||||
return;
|
||||
|
||||
DocQualCallbacks cb(word);
|
||||
@ -990,7 +989,8 @@ BOOL8 Tesseract::noise_outlines(TWERD *word) {
|
||||
inT16 max_dimension;
|
||||
float small_limit = kBlnXHeight * crunch_small_outlines_size;
|
||||
|
||||
for (TBLOB* blob = word->blobs; blob != NULL; blob = blob->next) {
|
||||
for (int b = 0; b < word->NumBlobs(); ++b) {
|
||||
TBLOB* blob = word->blobs[b];
|
||||
for (TESSLINE* ol = blob->outlines; ol != NULL; ol = ol->next) {
|
||||
outline_count++;
|
||||
box = ol->bounding_box();
|
||||
@ -1002,6 +1002,7 @@ BOOL8 Tesseract::noise_outlines(TWERD *word) {
|
||||
small_outline_count++;
|
||||
}
|
||||
}
|
||||
return (small_outline_count >= outline_count);
|
||||
return small_outline_count >= outline_count;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
||||
|
@ -19,7 +19,7 @@
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#pragma warning(disable:4244) // Conversion warnings
|
||||
#include "mathfix.h"
|
||||
#include <mathfix.h>
|
||||
#endif
|
||||
|
||||
#ifdef __MINGW32__
|
||||
@ -173,21 +173,21 @@ void EquationDetect::IdentifySpecialText(
|
||||
|
||||
BLOB_CHOICE_LIST ratings_equ, ratings_lang;
|
||||
C_BLOB* blob = blobnbox->cblob();
|
||||
TBLOB* tblob = TBLOB::PolygonalCopy(blob);
|
||||
// TODO(joeliu/rays) Fix this. We may have to normalize separately for
|
||||
// each classifier here, as they may require different PolygonalCopy.
|
||||
TBLOB* tblob = TBLOB::PolygonalCopy(false, blob);
|
||||
const TBOX& box = tblob->bounding_box();
|
||||
|
||||
// Normalize the blob. Set the origin to the place we want to be the
|
||||
// bottom-middle, and scaling is to make the height the x-height.
|
||||
float scaling = static_cast<float>(kBlnXHeight) / box.height();
|
||||
DENORM denorm;
|
||||
float x_orig = (box.left() + box.right()) / 2.0f, y_orig = box.bottom();
|
||||
denorm.SetupNormalization(NULL, NULL, NULL, NULL, NULL, 0,
|
||||
x_orig, y_orig, scaling, scaling,
|
||||
0.0f, static_cast<float>(kBlnBaselineOffset));
|
||||
TBLOB* normed_blob = new TBLOB(*tblob);
|
||||
normed_blob->Normalize(denorm);
|
||||
equ_tesseract_->AdaptiveClassifier(normed_blob, denorm, &ratings_equ, NULL);
|
||||
lang_tesseract_->AdaptiveClassifier(normed_blob, denorm, &ratings_lang, NULL);
|
||||
normed_blob->Normalize(NULL, NULL, NULL, x_orig, y_orig, scaling, scaling,
|
||||
0.0f, static_cast<float>(kBlnBaselineOffset),
|
||||
false, NULL);
|
||||
equ_tesseract_->AdaptiveClassifier(normed_blob, &ratings_equ, NULL);
|
||||
lang_tesseract_->AdaptiveClassifier(normed_blob, &ratings_lang, NULL);
|
||||
delete normed_blob;
|
||||
delete tblob;
|
||||
|
||||
|
@ -35,6 +35,7 @@
|
||||
#define MAXSPACING 128 /*max expected spacing in pix */
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
/**
|
||||
* @name fix_fuzzy_spaces()
|
||||
* Walk over the page finding sequences of words joined by fuzzy spaces. Extract
|
||||
@ -183,7 +184,7 @@ void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
|
||||
for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
|
||||
src_wd = src_it.data();
|
||||
if (!src_wd->combination) {
|
||||
new_wd = new WERD_RES(*src_wd);
|
||||
new_wd = WERD_RES::deep_copy(src_wd);
|
||||
new_wd->combination = FALSE;
|
||||
new_wd->part_of_combo = FALSE;
|
||||
new_it.add_after_then_move(new_wd);
|
||||
@ -502,86 +503,6 @@ void Tesseract::dump_words(WERD_RES_LIST &perm, inT16 score,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @name uniformly_spaced()
|
||||
* Return true if one of the following are true:
|
||||
* - All inter-char gaps are the same width
|
||||
* - The largest gap is no larger than twice the mean/median of the others
|
||||
* - The largest gap is < normalised_max_nonspace
|
||||
* **** REMEMBER - WE'RE NOW WORKING WITH A BLN WERD !!!
|
||||
*/
|
||||
BOOL8 Tesseract::uniformly_spaced(WERD_RES *word) {
|
||||
TBOX box;
|
||||
inT16 prev_right = -MAX_INT16;
|
||||
inT16 gap;
|
||||
inT16 max_gap = -MAX_INT16;
|
||||
inT16 max_gap_count = 0;
|
||||
STATS gap_stats(0, MAXSPACING);
|
||||
BOOL8 result;
|
||||
const ROW *row = word->denorm.row();
|
||||
float max_non_space;
|
||||
float normalised_max_nonspace;
|
||||
inT16 i = 0;
|
||||
inT16 offset = 0;
|
||||
STRING punct_chars = "\"`',.:;";
|
||||
|
||||
for (TBLOB* blob = word->rebuild_word->blobs; blob != NULL;
|
||||
blob = blob->next) {
|
||||
box = blob->bounding_box();
|
||||
if ((prev_right > -MAX_INT16) &&
|
||||
(!punct_chars.contains(
|
||||
word->best_choice->unichar_string()
|
||||
[offset - word->best_choice->unichar_lengths()[i - 1]]) &&
|
||||
!punct_chars.contains(
|
||||
word->best_choice->unichar_string()[offset]))) {
|
||||
gap = box.left() - prev_right;
|
||||
if (gap < max_gap) {
|
||||
gap_stats.add(gap, 1);
|
||||
} else if (gap == max_gap) {
|
||||
max_gap_count++;
|
||||
} else {
|
||||
if (max_gap_count > 0)
|
||||
gap_stats.add(max_gap, max_gap_count);
|
||||
max_gap = gap;
|
||||
max_gap_count = 1;
|
||||
}
|
||||
}
|
||||
prev_right = box.right();
|
||||
offset += word->best_choice->unichar_lengths()[i++];
|
||||
}
|
||||
|
||||
max_non_space = (row->space() + 3 * row->kern()) / 4;
|
||||
normalised_max_nonspace = max_non_space * kBlnXHeight / row->x_height();
|
||||
|
||||
result = (
|
||||
gap_stats.get_total() == 0 ||
|
||||
max_gap <= normalised_max_nonspace ||
|
||||
(gap_stats.get_total() > 2 && max_gap <= 2 * gap_stats.median()) ||
|
||||
(gap_stats.get_total() <= 2 && max_gap <= 2 * gap_stats.mean()));
|
||||
#ifndef SECURE_NAMES
|
||||
if ((debug_fix_space_level > 1)) {
|
||||
if (result) {
|
||||
tprintf(
|
||||
"ACCEPT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d "
|
||||
"total=%d mean=%f median=%f\n",
|
||||
word->best_choice->unichar_string().string(), normalised_max_nonspace,
|
||||
max_gap, max_gap_count, gap_stats.get_total(), gap_stats.mean(),
|
||||
gap_stats.median());
|
||||
} else {
|
||||
tprintf(
|
||||
"REJECT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d "
|
||||
"total=%d mean=%f median=%f\n",
|
||||
word->best_choice->unichar_string().string(), normalised_max_nonspace,
|
||||
max_gap, max_gap_count, gap_stats.get_total(), gap_stats.mean(),
|
||||
gap_stats.median());
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
BOOL8 Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
|
||||
if (word->done)
|
||||
return TRUE;
|
||||
@ -655,7 +576,6 @@ void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
|
||||
WERD_RES_LIST current_perm;
|
||||
WERD_RES_IT current_perm_it(¤t_perm);
|
||||
WERD_RES *old_word_res;
|
||||
WERD_RES *new_word_res;
|
||||
inT16 current_score;
|
||||
BOOL8 improved = FALSE;
|
||||
|
||||
@ -663,12 +583,12 @@ void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
|
||||
|
||||
dump_words(best_perm, best_score, 1, improved);
|
||||
|
||||
new_word_res = new WERD_RES;
|
||||
old_word_res = best_perm_it.data();
|
||||
// Even deep_copy doesn't copy the underlying WERD unless its combination
|
||||
// flag is true!.
|
||||
old_word_res->combination = TRUE; // Kludge to force deep copy
|
||||
*new_word_res = *old_word_res; // deep copy
|
||||
current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
|
||||
old_word_res->combination = FALSE; // Undo kludge
|
||||
current_perm_it.add_to_end(new_word_res);
|
||||
|
||||
break_noisiest_blob_word(current_perm);
|
||||
|
||||
@ -774,7 +694,6 @@ inT16 Tesseract::worst_noise_blob(WERD_RES *word_res,
|
||||
if (word_res->rebuild_word == NULL)
|
||||
return -1; // Can't handle cube words.
|
||||
|
||||
TBLOB* blob = word_res->rebuild_word->blobs;
|
||||
// Normalised.
|
||||
int blob_count = word_res->box_word->length();
|
||||
ASSERT_HOST(blob_count <= 512);
|
||||
@ -789,7 +708,8 @@ inT16 Tesseract::worst_noise_blob(WERD_RES *word_res,
|
||||
word_res->best_choice->unichar_string().string());
|
||||
#endif
|
||||
|
||||
for (i = 0; i < blob_count && blob != NULL; i++, blob = blob->next) {
|
||||
for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
|
||||
TBLOB* blob = word_res->rebuild_word->blobs[i];
|
||||
if (word_res->reject_map[i].accepted())
|
||||
noise_score[i] = non_noise_limit;
|
||||
else
|
||||
@ -929,10 +849,10 @@ inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
|
||||
word->best_choice->permuter() == FREQ_DAWG_PERM ||
|
||||
word->best_choice->permuter() == USER_DAWG_PERM ||
|
||||
safe_dict_word(word) > 0) {
|
||||
TBLOB* blob = word->rebuild_word->blobs;
|
||||
int num_blobs = word->rebuild_word->NumBlobs();
|
||||
UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
|
||||
for (i = 0; i < word->best_choice->length() && blob != NULL;
|
||||
++i, blob = blob->next) {
|
||||
for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
|
||||
TBLOB* blob = word->rebuild_word->blobs[i];
|
||||
if (word->best_choice->unichar_id(i) == space ||
|
||||
blob_noise_score(blob) < small_limit) {
|
||||
score -= 1; // penalise possibly erroneous non-space
|
||||
|
@ -62,9 +62,9 @@ const int kMaxCharTopRange = 48;
|
||||
// Returns the number of misfit blob tops in this word.
|
||||
int Tesseract::CountMisfitTops(WERD_RES *word_res) {
|
||||
int bad_blobs = 0;
|
||||
TBLOB* blob = word_res->rebuild_word->blobs;
|
||||
int blob_id = 0;
|
||||
for (; blob != NULL; blob = blob->next, ++blob_id) {
|
||||
int num_blobs = word_res->rebuild_word->NumBlobs();
|
||||
for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
|
||||
TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
|
||||
UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
|
||||
if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
|
||||
int top = blob->bounding_box().top();
|
||||
@ -94,9 +94,9 @@ int Tesseract::CountMisfitTops(WERD_RES *word_res) {
|
||||
// See comment above for overall algorithm.
|
||||
float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res) {
|
||||
STATS top_stats(0, MAX_UINT8);
|
||||
TBLOB* blob = word_res->rebuild_word->blobs;
|
||||
int blob_id = 0;
|
||||
for (; blob != NULL; blob = blob->next, ++blob_id) {
|
||||
int num_blobs = word_res->rebuild_word->NumBlobs();
|
||||
for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
|
||||
TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
|
||||
UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
|
||||
if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
|
||||
int top = blob->bounding_box().top();
|
||||
|
@ -33,7 +33,7 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "errcode.h"
|
||||
#include "globaloc.h" // For err_exit.
|
||||
|
||||
#define f(xc, yc) ((xc - factor*yc)*(xc - factor*yc))
|
||||
|
||||
|
@ -132,23 +132,7 @@ float LTRResultIterator::Confidence(PageIteratorLevel level) const {
|
||||
++certainty_count;
|
||||
break;
|
||||
case RIL_SYMBOL:
|
||||
BLOB_CHOICE_LIST_CLIST* choices = best_choice->blob_choices();
|
||||
if (choices != NULL) {
|
||||
BLOB_CHOICE_LIST_C_IT blob_choices_it(choices);
|
||||
for (int blob = 0; blob < blob_index_; ++blob)
|
||||
blob_choices_it.forward();
|
||||
BLOB_CHOICE_IT choice_it(blob_choices_it.data());
|
||||
for (choice_it.mark_cycle_pt();
|
||||
!choice_it.cycled_list();
|
||||
choice_it.forward()) {
|
||||
if (choice_it.data()->unichar_id() ==
|
||||
best_choice->unichar_id(blob_index_))
|
||||
break;
|
||||
}
|
||||
mean_certainty += choice_it.data()->certainty();
|
||||
} else {
|
||||
mean_certainty += best_choice->certainty();
|
||||
}
|
||||
mean_certainty += best_choice->certainty(blob_index_);
|
||||
++certainty_count;
|
||||
}
|
||||
if (certainty_count > 0) {
|
||||
@ -237,55 +221,83 @@ bool LTRResultIterator::WordIsNumeric() const {
|
||||
|
||||
// Returns true if the word contains blamer information.
|
||||
bool LTRResultIterator::HasBlamerInfo() const {
|
||||
return (it_->word() != NULL && it_->word()->blamer_bundle != NULL &&
|
||||
(it_->word()->blamer_bundle->debug.length() > 0 ||
|
||||
it_->word()->blamer_bundle->misadaption_debug.length() > 0));
|
||||
return it_->word() != NULL && it_->word()->blamer_bundle != NULL &&
|
||||
it_->word()->blamer_bundle->HasDebugInfo();
|
||||
}
|
||||
|
||||
// Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
|
||||
// of the current word.
|
||||
void *LTRResultIterator::GetParamsTrainingBundle() const {
|
||||
const void *LTRResultIterator::GetParamsTrainingBundle() const {
|
||||
return (it_->word() != NULL && it_->word()->blamer_bundle != NULL) ?
|
||||
&(it_->word()->blamer_bundle->params_training_bundle) : NULL;
|
||||
&(it_->word()->blamer_bundle->params_training_bundle()) : NULL;
|
||||
}
|
||||
|
||||
// Returns the pointer to the string with blamer information for this word.
|
||||
// Assumes that the word's blamer_bundle is not NULL.
|
||||
const char *LTRResultIterator::GetBlamerDebug() const {
|
||||
return it_->word()->blamer_bundle->debug.string();
|
||||
return it_->word()->blamer_bundle->debug().string();
|
||||
}
|
||||
|
||||
// Returns the pointer to the string with misadaption information for this word.
|
||||
// Assumes that the word's blamer_bundle is not NULL.
|
||||
const char *LTRResultIterator::GetBlamerMisadaptionDebug() const {
|
||||
return it_->word()->blamer_bundle->misadaption_debug.string();
|
||||
return it_->word()->blamer_bundle->misadaption_debug().string();
|
||||
}
|
||||
|
||||
// Returns true if a truth string was recorded for the current word.
|
||||
bool LTRResultIterator::HasTruthString() const {
|
||||
if (it_->word() == NULL) return false; // Already at the end!
|
||||
if (it_->word()->blamer_bundle == NULL ||
|
||||
it_->word()->blamer_bundle->NoTruth()) {
|
||||
return false; // no truth information for this word
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns true if the given string is equivalent to the truth string for
|
||||
// the current word.
|
||||
bool LTRResultIterator::EquivalentToTruth(const char *str) const {
|
||||
if (!HasTruthString()) return false;
|
||||
ASSERT_HOST(it_->word()->uch_set != NULL);
|
||||
WERD_CHOICE str_wd(str, *(it_->word()->uch_set));
|
||||
return it_->word()->blamer_bundle->ChoiceIsCorrect(&str_wd);
|
||||
}
|
||||
|
||||
// Returns the null terminated UTF-8 encoded truth string for the current word.
|
||||
// Use delete [] to free after use.
|
||||
char* LTRResultIterator::WordTruthUTF8Text() const {
|
||||
if (it_->word() == NULL) return NULL; // Already at the end!
|
||||
if (it_->word()->blamer_bundle == NULL ||
|
||||
it_->word()->blamer_bundle->incorrect_result_reason == IRR_NO_TRUTH) {
|
||||
return NULL; // no truth information for this word
|
||||
}
|
||||
const GenericVector<STRING> &truth_vec =
|
||||
it_->word()->blamer_bundle->truth_text;
|
||||
STRING truth_text;
|
||||
for (int i = 0; i < truth_vec.size(); ++i) truth_text += truth_vec[i];
|
||||
if (!HasTruthString()) return NULL;
|
||||
STRING truth_text = it_->word()->blamer_bundle->TruthString();
|
||||
int length = truth_text.length() + 1;
|
||||
char* result = new char[length];
|
||||
strncpy(result, truth_text.string(), length);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Returns the null terminated UTF-8 encoded normalized OCR string for the
|
||||
// current word. Use delete [] to free after use.
|
||||
char* LTRResultIterator::WordNormedUTF8Text() const {
|
||||
if (it_->word() == NULL) return NULL; // Already at the end!
|
||||
STRING ocr_text;
|
||||
WERD_CHOICE* best_choice = it_->word()->best_choice;
|
||||
const UNICHARSET *unicharset = it_->word()->uch_set;
|
||||
ASSERT_HOST(best_choice != NULL);
|
||||
for (int i = 0; i < best_choice->length(); ++i) {
|
||||
ocr_text += unicharset->get_normed_unichar(best_choice->unichar_id(i));
|
||||
}
|
||||
int length = ocr_text.length() + 1;
|
||||
char* result = new char[length];
|
||||
strncpy(result, ocr_text.string(), length);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Returns a pointer to serialized choice lattice.
|
||||
// Fills lattice_size with the number of bytes in lattice data.
|
||||
const char *LTRResultIterator::WordLattice(int *lattice_size) const {
|
||||
if (it_->word() == NULL) return NULL; // Already at the end!
|
||||
if (it_->word()->blamer_bundle == NULL) return NULL;
|
||||
*lattice_size = it_->word()->blamer_bundle->lattice_size;
|
||||
return it_->word()->blamer_bundle->lattice_data;
|
||||
*lattice_size = it_->word()->blamer_bundle->lattice_size();
|
||||
return it_->word()->blamer_bundle->lattice_data();
|
||||
}
|
||||
|
||||
// Returns true if the current symbol is a superscript.
|
||||
@ -293,7 +305,8 @@ const char *LTRResultIterator::WordLattice(int *lattice_size) const {
|
||||
// this will return the attributes of the first symbol in that word.
|
||||
bool LTRResultIterator::SymbolIsSuperscript() const {
|
||||
if (cblob_it_ == NULL && it_->word() != NULL)
|
||||
return it_->word()->box_word->BlobPosition(blob_index_) == SP_SUPERSCRIPT;
|
||||
return it_->word()->best_choice->BlobPosition(blob_index_) ==
|
||||
SP_SUPERSCRIPT;
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -302,7 +315,7 @@ bool LTRResultIterator::SymbolIsSuperscript() const {
|
||||
// this will return the attributes of the first symbol in that word.
|
||||
bool LTRResultIterator::SymbolIsSubscript() const {
|
||||
if (cblob_it_ == NULL && it_->word() != NULL)
|
||||
return it_->word()->box_word->BlobPosition(blob_index_) == SP_SUBSCRIPT;
|
||||
return it_->word()->best_choice->BlobPosition(blob_index_) == SP_SUBSCRIPT;
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -311,7 +324,7 @@ bool LTRResultIterator::SymbolIsSubscript() const {
|
||||
// this will return the attributes of the first symbol in that word.
|
||||
bool LTRResultIterator::SymbolIsDropcap() const {
|
||||
if (cblob_it_ == NULL && it_->word() != NULL)
|
||||
return it_->word()->box_word->BlobPosition(blob_index_) == SP_DROPCAP;
|
||||
return it_->word()->best_choice->BlobPosition(blob_index_) == SP_DROPCAP;
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -319,13 +332,11 @@ ChoiceIterator::ChoiceIterator(const LTRResultIterator& result_it) {
|
||||
ASSERT_HOST(result_it.it_->word() != NULL);
|
||||
word_res_ = result_it.it_->word();
|
||||
PAGE_RES_IT res_it(*result_it.it_);
|
||||
WERD_CHOICE* best_choice = word_res_->best_choice;
|
||||
BLOB_CHOICE_LIST_CLIST* choices = best_choice->blob_choices();
|
||||
BLOB_CHOICE_LIST* choices = NULL;
|
||||
if (word_res_->ratings != NULL)
|
||||
choices = word_res_->GetBlobChoices(result_it.blob_index_);
|
||||
if (choices != NULL && !choices->empty()) {
|
||||
BLOB_CHOICE_LIST_C_IT blob_choices_it(choices);
|
||||
for (int blob = 0; blob < result_it.blob_index_; ++blob)
|
||||
blob_choices_it.forward();
|
||||
choice_it_ = new BLOB_CHOICE_IT(blob_choices_it.data());
|
||||
choice_it_ = new BLOB_CHOICE_IT(choices);
|
||||
choice_it_->mark_cycle_pt();
|
||||
} else {
|
||||
choice_it_ = NULL;
|
||||
|
@ -23,7 +23,7 @@
|
||||
|
||||
#include "platform.h"
|
||||
#include "pageiterator.h"
|
||||
#include "unicharset.h"
|
||||
#include "unichar.h"
|
||||
|
||||
class BLOB_CHOICE_IT;
|
||||
class WERD_RES;
|
||||
@ -128,7 +128,7 @@ class TESS_API LTRResultIterator : public PageIterator {
|
||||
|
||||
// Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
|
||||
// of the current word.
|
||||
void *GetParamsTrainingBundle() const;
|
||||
const void *GetParamsTrainingBundle() const;
|
||||
|
||||
// Returns a pointer to the string with blamer information for this word.
|
||||
// Assumes that the word's blamer_bundle is not NULL.
|
||||
@ -138,10 +138,21 @@ class TESS_API LTRResultIterator : public PageIterator {
|
||||
// Assumes that the word's blamer_bundle is not NULL.
|
||||
const char *GetBlamerMisadaptionDebug() const;
|
||||
|
||||
// Returns true if a truth string was recorded for the current word.
|
||||
bool HasTruthString() const;
|
||||
|
||||
// Returns true if the given string is equivalent to the truth string for
|
||||
// the current word.
|
||||
bool EquivalentToTruth(const char *str) const;
|
||||
|
||||
// Returns a null terminated UTF-8 encoded truth string for the current word.
|
||||
// Use delete [] to free after use.
|
||||
char* WordTruthUTF8Text() const;
|
||||
|
||||
// Returns a null terminated UTF-8 encoded normalized OCR string for the
|
||||
// current word. Use delete [] to free after use.
|
||||
char* WordNormedUTF8Text() const;
|
||||
|
||||
// Returns a pointer to serialized choice lattice.
|
||||
// Fills lattice_size with the number of bytes in lattice data.
|
||||
const char *WordLattice(int *lattice_size) const;
|
||||
|
@ -29,14 +29,12 @@
|
||||
#include <errno.h>
|
||||
#endif
|
||||
#include "helpers.h"
|
||||
#include "tfacep.h"
|
||||
#include "tessvars.h"
|
||||
#include "control.h"
|
||||
#include "secname.h"
|
||||
#include "reject.h"
|
||||
#include "docqual.h"
|
||||
#include "output.h"
|
||||
#include "bestfirst.h"
|
||||
#include "globals.h"
|
||||
#include "tesseractclass.h"
|
||||
|
||||
@ -242,13 +240,7 @@ void Tesseract::write_results(PAGE_RES_IT &page_res_it,
|
||||
(word->best_choice->unichar_id(0) == space)) {
|
||||
/* Prevent adjacent tilde across words - we know that adjacent tildes within
|
||||
words have been removed */
|
||||
word->best_choice->remove_unichar_id(0);
|
||||
if (word->best_choice->blob_choices() != NULL) {
|
||||
BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices());
|
||||
if (!blob_choices_it.empty()) delete blob_choices_it.extract();
|
||||
}
|
||||
word->reject_map.remove_pos (0);
|
||||
word->box_word->DeleteBox(0);
|
||||
word->MergeAdjacentBlobs(0);
|
||||
}
|
||||
if (newline_type ||
|
||||
(word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
|
||||
|
@ -303,16 +303,22 @@ bool PageIterator::BoundingBoxInternal(PageIteratorLevel level,
|
||||
bool PageIterator::BoundingBox(PageIteratorLevel level,
|
||||
int* left, int* top,
|
||||
int* right, int* bottom) const {
|
||||
return BoundingBox(level, 0, left, top, right, bottom);
|
||||
}
|
||||
|
||||
bool PageIterator::BoundingBox(PageIteratorLevel level, const int padding,
|
||||
int* left, int* top,
|
||||
int* right, int* bottom) const {
|
||||
if (!BoundingBoxInternal(level, left, top, right, bottom))
|
||||
return false;
|
||||
// Convert to the coordinate system of the original image.
|
||||
*left = ClipToRange(*left / scale_ + rect_left_,
|
||||
*left = ClipToRange(*left / scale_ + rect_left_ - padding,
|
||||
rect_left_, rect_left_ + rect_width_);
|
||||
*top = ClipToRange(*top / scale_ + rect_top_,
|
||||
*top = ClipToRange(*top / scale_ + rect_top_ - padding,
|
||||
rect_top_, rect_top_ + rect_height_);
|
||||
*right = ClipToRange((*right + scale_ - 1) / scale_ + rect_left_,
|
||||
*right = ClipToRange((*right + scale_ - 1) / scale_ + rect_left_ + padding,
|
||||
*left, rect_left_ + rect_width_);
|
||||
*bottom = ClipToRange((*bottom + scale_ - 1) / scale_ + rect_top_,
|
||||
*bottom = ClipToRange((*bottom + scale_ - 1) / scale_ + rect_top_ + padding,
|
||||
*top, rect_top_ + rect_height_);
|
||||
return true;
|
||||
}
|
||||
@ -546,14 +552,15 @@ void PageIterator::BeginWord(int offset) {
|
||||
// Recognition has been done, so we are using the box_word, which
|
||||
// is already baseline denormalized.
|
||||
word_length_ = word_res->best_choice->length();
|
||||
ASSERT_HOST(word_res->box_word != NULL);
|
||||
if (word_res->box_word->length() != word_length_) {
|
||||
tprintf("Corrupted word! best_choice[len=%d] = %s, box_word[len=%d]: ",
|
||||
word_length_, word_res->best_choice->unichar_string().string(),
|
||||
word_res->box_word->length());
|
||||
word_res->box_word->bounding_box().print();
|
||||
if (word_res->box_word != NULL) {
|
||||
if (word_res->box_word->length() != word_length_) {
|
||||
tprintf("Corrupted word! best_choice[len=%d] = %s, box_word[len=%d]: ",
|
||||
word_length_, word_res->best_choice->unichar_string().string(),
|
||||
word_res->box_word->length());
|
||||
word_res->box_word->bounding_box().print();
|
||||
}
|
||||
ASSERT_HOST(word_res->box_word->length() == word_length_);
|
||||
}
|
||||
ASSERT_HOST(word_res->box_word->length() == word_length_);
|
||||
word_ = NULL;
|
||||
// We will be iterating the box_word.
|
||||
if (cblob_it_ != NULL) {
|
||||
@ -574,4 +581,13 @@ void PageIterator::BeginWord(int offset) {
|
||||
}
|
||||
}
|
||||
|
||||
bool PageIterator::SetWordBlamerBundle(BlamerBundle *blamer_bundle) {
|
||||
if (it_->word() != NULL) {
|
||||
it_->word()->blamer_bundle = blamer_bundle;
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tesseract.
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include "publictypes.h"
|
||||
#include "platform.h"
|
||||
|
||||
class BlamerBundle;
|
||||
class C_BLOB_IT;
|
||||
class PBLOB_IT;
|
||||
class PAGE_RES;
|
||||
@ -189,6 +190,8 @@ class TESS_API PageIterator {
|
||||
*/
|
||||
bool BoundingBox(PageIteratorLevel level,
|
||||
int* left, int* top, int* right, int* bottom) const;
|
||||
bool BoundingBox(PageIteratorLevel level, const int padding,
|
||||
int* left, int* top, int* right, int* bottom) const;
|
||||
/**
|
||||
* Returns the bounding rectangle of the object in a coordinate system of the
|
||||
* working image rectangle having its origin at (rect_left_, rect_top_) with
|
||||
@ -282,6 +285,12 @@ class TESS_API PageIterator {
|
||||
bool *is_crown,
|
||||
int *first_line_indent) const;
|
||||
|
||||
// If the current WERD_RES (it_->word()) is not NULL, sets the BlamerBundle
|
||||
// of the current word to the given pointer (takes ownership of the pointer)
|
||||
// and returns true.
|
||||
// Can only be used when iterating on the word level.
|
||||
bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);
|
||||
|
||||
protected:
|
||||
/**
|
||||
* Sets up the internal data for iterating the blobs of a new word, then
|
||||
|
@ -16,8 +16,8 @@
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
#ifdef _MSC_VER
|
||||
#define __func__ __FUNCTION__
|
||||
#ifdef _MSC_VER
|
||||
#define __func__ __FUNCTION__
|
||||
#endif
|
||||
|
||||
#include <ctype.h>
|
||||
@ -40,11 +40,6 @@
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// The tab vectors for a given line should be ignored if both its tab vectors
|
||||
// are infrequent, specifically, if both tab vectors appear at most once per
|
||||
// kStrayLinePer lines in a block.
|
||||
const int kStrayLinePer = 6;
|
||||
|
||||
// Special "weak" ParagraphModels.
|
||||
const ParagraphModel *kCrownLeft
|
||||
= reinterpret_cast<ParagraphModel *>(0xDEAD111F);
|
||||
@ -727,7 +722,15 @@ void CalculateTabStops(GenericVector<RowScratchRegisters> *rows,
|
||||
// tab stop is frequent.
|
||||
SimpleClusterer lefts(tolerance);
|
||||
SimpleClusterer rights(tolerance);
|
||||
int infrequent_enough_to_ignore = (row_end - row_start) / kStrayLinePer;
|
||||
|
||||
// Outlier elimination. We might want to switch this to test outlier-ness
|
||||
// based on how strange a position an outlier is in instead of or in addition
|
||||
// to how rare it is. These outliers get re-added if we end up having too
|
||||
// few tab stops, to work with, however.
|
||||
int infrequent_enough_to_ignore = 0;
|
||||
if (row_end - row_start >= 8) infrequent_enough_to_ignore = 1;
|
||||
if (row_end - row_start >= 20) infrequent_enough_to_ignore = 2;
|
||||
|
||||
for (int i = row_start; i < row_end; i++) {
|
||||
int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
|
||||
int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
|
||||
@ -739,6 +742,54 @@ void CalculateTabStops(GenericVector<RowScratchRegisters> *rows,
|
||||
}
|
||||
lefts.GetClusters(left_tabs);
|
||||
rights.GetClusters(right_tabs);
|
||||
|
||||
if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
|
||||
(right_tabs->size() == 1 && left_tabs->size() >= 4)) {
|
||||
// One side is really ragged, and the other only has one tab stop,
|
||||
// so those "insignificant outliers" are probably important, actually.
|
||||
// This often happens on a page of an index. Add back in the ones
|
||||
// we omitted in the first pass.
|
||||
for (int i = row_start; i < row_end; i++) {
|
||||
int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
|
||||
int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
|
||||
if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
|
||||
initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {
|
||||
lefts.Add((*rows)[i].lindent_);
|
||||
rights.Add((*rows)[i].rindent_);
|
||||
}
|
||||
}
|
||||
}
|
||||
lefts.GetClusters(left_tabs);
|
||||
rights.GetClusters(right_tabs);
|
||||
|
||||
// If one side is almost a two-indent aligned side, and the other clearly
|
||||
// isn't, try to prune out the least frequent tab stop from that side.
|
||||
if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
|
||||
int to_prune = -1;
|
||||
for (int i = left_tabs->size() - 1; i >= 0; i--) {
|
||||
if (to_prune < 0 ||
|
||||
(*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
|
||||
to_prune = i;
|
||||
}
|
||||
}
|
||||
if (to_prune >= 0 &&
|
||||
(*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
|
||||
left_tabs->remove(to_prune);
|
||||
}
|
||||
}
|
||||
if (right_tabs->size() == 3 && right_tabs->size() >= 4) {
|
||||
int to_prune = -1;
|
||||
for (int i = right_tabs->size() - 1; i >= 0; i--) {
|
||||
if (to_prune < 0 ||
|
||||
(*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
|
||||
to_prune = i;
|
||||
}
|
||||
}
|
||||
if (to_prune >= 0 &&
|
||||
(*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
|
||||
right_tabs->remove(to_prune);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Given a paragraph model mark rows[row_start, row_end) as said model
|
||||
@ -816,6 +867,11 @@ struct GeometricClassifierState {
|
||||
tolerance = InterwordSpace(*r, r_start, r_end);
|
||||
CalculateTabStops(r, r_start, r_end, tolerance,
|
||||
&left_tabs, &right_tabs);
|
||||
if (debug_level >= 3) {
|
||||
tprintf("Geometry: TabStop cluster tolerance = %d; "
|
||||
"%d left tabs; %d right tabs\n",
|
||||
tolerance, left_tabs.size(), right_tabs.size());
|
||||
}
|
||||
ltr = (*r)[r_start].ri_->ltr;
|
||||
}
|
||||
|
||||
@ -1079,16 +1135,18 @@ void GeometricClassify(int debug_level,
|
||||
firsts[s.AlignsideTabIndex(s.row_start)]++;
|
||||
// For each line, if the first word would have fit on the previous
|
||||
// line count it as a likely paragraph start line.
|
||||
bool jam_packed = true;
|
||||
for (int i = s.row_start + 1; i < s.row_end; i++) {
|
||||
if (s.FirstWordWouldHaveFit(i - 1, i)) {
|
||||
firsts[s.AlignsideTabIndex(i)]++;
|
||||
jam_packed = false;
|
||||
}
|
||||
}
|
||||
// Make an extra accounting for the last line of the paragraph just
|
||||
// in case it's the only short line in the block. That is, take its
|
||||
// first word as typical and see if this looks like the *last* line
|
||||
// of a paragraph. If so, mark the *other* indent as probably a first.
|
||||
if (s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
|
||||
if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
|
||||
firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
|
||||
}
|
||||
|
||||
@ -1543,24 +1601,26 @@ void RecomputeMarginsAndClearHypotheses(
|
||||
}
|
||||
}
|
||||
|
||||
// Return the minimum inter-word space in rows[row_start, row_end).
|
||||
// Return the median inter-word space in rows[row_start, row_end).
|
||||
int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
|
||||
int row_start, int row_end) {
|
||||
if (row_end < row_start + 1) return 1;
|
||||
bool legit = false;
|
||||
int natural_space = rows[row_start].ri_->average_interword_space;
|
||||
int word_height = (rows[row_start].ri_->lword_box.height() +
|
||||
rows[row_end - 1].ri_->lword_box.height()) / 2;
|
||||
int word_width = (rows[row_start].ri_->lword_box.width() +
|
||||
rows[row_end - 1].ri_->lword_box.width()) / 2;
|
||||
STATS spacing_widths(0, 5 + word_width);
|
||||
for (int i = row_start; i < row_end; i++) {
|
||||
if (rows[i].ri_->num_words > 1) {
|
||||
if (!legit) {
|
||||
natural_space = rows[i].ri_->average_interword_space;
|
||||
legit = true;
|
||||
} else {
|
||||
if (rows[i].ri_->average_interword_space < natural_space)
|
||||
natural_space = rows[i].ri_->average_interword_space;
|
||||
}
|
||||
spacing_widths.add(rows[i].ri_->average_interword_space, 1);
|
||||
}
|
||||
}
|
||||
return natural_space;
|
||||
int minimum_reasonable_space = word_height / 3;
|
||||
if (minimum_reasonable_space < 2)
|
||||
minimum_reasonable_space = 2;
|
||||
int median = spacing_widths.median();
|
||||
return (median > minimum_reasonable_space)
|
||||
? median : minimum_reasonable_space;
|
||||
}
|
||||
|
||||
// Return whether the first word on the after line can fit in the space at
|
||||
@ -2274,6 +2334,7 @@ void DetectParagraphs(int debug_level,
|
||||
GeometricClassify(debug_level, &rows,
|
||||
leftovers[i].begin, leftovers[i].end, &theory);
|
||||
}
|
||||
|
||||
// Undo any flush models for which there's little evidence.
|
||||
DowngradeWeakestToCrowns(debug_level, &theory, &rows);
|
||||
|
||||
|
@ -23,7 +23,6 @@
|
||||
#include "control.h"
|
||||
#include "cutil.h"
|
||||
#include "host.h"
|
||||
#include "permute.h"
|
||||
#include "ratngs.h"
|
||||
#include "reject.h"
|
||||
#include "stopper.h"
|
||||
@ -38,10 +37,6 @@ FILE *Tesseract::init_recog_training(const STRING &fname) {
|
||||
if (tessedit_ambigs_training) {
|
||||
tessedit_tess_adaption_mode.set_value(0); // turn off adaption
|
||||
tessedit_enable_doc_dict.set_value(0); // turn off document dictionary
|
||||
save_blob_choices.set_value(1); // save individual char choices
|
||||
getDict().save_raw_choices.set_value(1); // save raw choices
|
||||
getDict().permute_only_top.set_value(true); // use only top choice permuter
|
||||
tessedit_ok_mode.set_value(0); // turn off context checking
|
||||
// Explore all segmentations.
|
||||
getDict().stopper_no_acceptable_choices.set_value(1);
|
||||
}
|
||||
@ -156,6 +151,47 @@ void Tesseract::recog_training_segmented(const STRING &fname,
|
||||
examined_words, total_words);
|
||||
}
|
||||
|
||||
// Helper prints the given set of blob choices.
|
||||
static void PrintPath(int length, const BLOB_CHOICE** blob_choices,
|
||||
const UNICHARSET& unicharset,
|
||||
const char *label, FILE *output_file) {
|
||||
float rating = 0.0f;
|
||||
float certainty = 0.0f;
|
||||
for (int i = 0; i < length; ++i) {
|
||||
const BLOB_CHOICE* blob_choice = blob_choices[i];
|
||||
fprintf(output_file, "%s",
|
||||
unicharset.id_to_unichar(blob_choice->unichar_id()));
|
||||
rating += blob_choice->rating();
|
||||
if (certainty > blob_choice->certainty())
|
||||
certainty = blob_choice->certainty();
|
||||
}
|
||||
fprintf(output_file, "\t%s\t%.4f\t%.4f\n",
|
||||
label, rating, certainty);
|
||||
}
|
||||
|
||||
// Helper recursively prints all paths through the ratings matrix, starting
|
||||
// at column col.
|
||||
static void PrintMatrixPaths(int col, int dim,
|
||||
const MATRIX& ratings,
|
||||
int length, const BLOB_CHOICE** blob_choices,
|
||||
const UNICHARSET& unicharset,
|
||||
const char *label, FILE *output_file) {
|
||||
for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) {
|
||||
if (ratings.get(col, row) != NOT_CLASSIFIED) {
|
||||
BLOB_CHOICE_IT bc_it(ratings.get(col, row));
|
||||
for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
|
||||
blob_choices[length] = bc_it.data();
|
||||
if (row + 1 < dim) {
|
||||
PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices,
|
||||
unicharset, label, output_file);
|
||||
} else {
|
||||
PrintPath(length + 1, blob_choices, unicharset, label, output_file);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Runs classify_word_pass1() on the current word. Outputs Tesseract's
|
||||
// raw choice as a result of the classification. For words labeled with a
|
||||
// single unichar also outputs all alternatives from blob_choices of the
|
||||
@ -165,44 +201,25 @@ void Tesseract::ambigs_classify_and_output(WERD_RES *werd_res,
|
||||
BLOCK_RES *block_res,
|
||||
const char *label,
|
||||
FILE *output_file) {
|
||||
int offset;
|
||||
// Classify word.
|
||||
fflush(stdout);
|
||||
classify_word_pass1(block_res->block, row_res->row, werd_res);
|
||||
WERD_CHOICE *best_choice = werd_res->best_choice;
|
||||
ASSERT_HOST(best_choice != NULL);
|
||||
ASSERT_HOST(best_choice->blob_choices() != NULL);
|
||||
|
||||
// Compute the number of unichars in the label.
|
||||
int label_num_unichars = 0;
|
||||
int step = 1; // should be non-zero on the first iteration
|
||||
for (offset = 0; label[offset] != '\0' && step > 0;
|
||||
step = werd_res->uch_set->step(label + offset),
|
||||
offset += step, ++label_num_unichars);
|
||||
if (step == 0) {
|
||||
GenericVector<UNICHAR_ID> encoding;
|
||||
if (!unicharset.encode_string(label, true, &encoding, NULL, NULL)) {
|
||||
tprintf("Not outputting illegal unichar %s\n", label);
|
||||
return;
|
||||
}
|
||||
|
||||
// Output all classifier choices for the unigrams (1->1 classifications).
|
||||
if (label_num_unichars == 1 && best_choice->blob_choices()->length() == 1) {
|
||||
BLOB_CHOICE_LIST_C_IT outer_blob_choice_it;
|
||||
outer_blob_choice_it.set_to_list(best_choice->blob_choices());
|
||||
BLOB_CHOICE_IT blob_choice_it;
|
||||
blob_choice_it.set_to_list(outer_blob_choice_it.data());
|
||||
for (blob_choice_it.mark_cycle_pt();
|
||||
!blob_choice_it.cycled_list();
|
||||
blob_choice_it.forward()) {
|
||||
BLOB_CHOICE *blob_choice = blob_choice_it.data();
|
||||
if (blob_choice->unichar_id() != INVALID_UNICHAR_ID) {
|
||||
fprintf(output_file, "%s\t%s\t%.4f\t%.4f\n",
|
||||
unicharset.id_to_unichar(blob_choice->unichar_id()),
|
||||
label, blob_choice->rating(), blob_choice->certainty());
|
||||
}
|
||||
}
|
||||
}
|
||||
// Output raw choices for many->many and 1->many classifications.
|
||||
getDict().PrintAmbigAlternatives(output_file, label, label_num_unichars);
|
||||
// Dump all paths through the ratings matrix (which is normally small).
|
||||
int dim = werd_res->ratings->dimension();
|
||||
const BLOB_CHOICE** blob_choices = new const BLOB_CHOICE*[dim];
|
||||
PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices,
|
||||
unicharset, label, output_file);
|
||||
delete [] blob_choices;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
||||
|
@ -30,13 +30,13 @@
|
||||
#include "scanutils.h"
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
#include "memry.h"
|
||||
#include "genericvector.h"
|
||||
#include "reject.h"
|
||||
#include "tfacep.h"
|
||||
#include "imgs.h"
|
||||
#include "control.h"
|
||||
#include "docqual.h"
|
||||
#include "secname.h"
|
||||
#include "globaloc.h" // For err_exit.
|
||||
#include "globals.h"
|
||||
#include "helpers.h"
|
||||
|
||||
@ -58,126 +58,26 @@ CLISTIZEH (STRING) CLISTIZE (STRING)
|
||||
*************************************************************************/
|
||||
|
||||
namespace tesseract {
|
||||
void Tesseract::set_done( //set done flag
|
||||
WERD_RES *word,
|
||||
inT16 pass) {
|
||||
/*
|
||||
0: Original heuristic used in Tesseract and Ray's prototype Resaljet
|
||||
*/
|
||||
if (tessedit_ok_mode == 0) {
|
||||
/* NOTE - done even if word contains some or all spaces !!! */
|
||||
word->done = word->tess_accepted;
|
||||
void Tesseract::set_done(WERD_RES *word, inT16 pass) {
|
||||
word->done = word->tess_accepted &&
|
||||
(strchr(word->best_choice->unichar_string().string(), ' ') == NULL);
|
||||
bool word_is_ambig = word->best_choice->dangerous_ambig_found();
|
||||
bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
|
||||
word->best_choice->permuter() == FREQ_DAWG_PERM ||
|
||||
word->best_choice->permuter() == USER_DAWG_PERM;
|
||||
if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
|
||||
one_ell_conflict(word, FALSE)) {
|
||||
if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
|
||||
word->done = FALSE;
|
||||
}
|
||||
/*
|
||||
1: Reject words containing blanks and on pass 1 reject I/l/1 conflicts
|
||||
*/
|
||||
else if (tessedit_ok_mode == 1) {
|
||||
word->done = word->tess_accepted &&
|
||||
(strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
|
||||
|
||||
if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
|
||||
if (word->done && ((!word_from_dict &&
|
||||
word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
|
||||
if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
|
||||
word->done = FALSE;
|
||||
}
|
||||
/*
|
||||
2: as 1 + only accept dict words or numerics in pass 1
|
||||
*/
|
||||
else if (tessedit_ok_mode == 2) {
|
||||
word->done = word->tess_accepted &&
|
||||
(strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
|
||||
|
||||
if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
|
||||
word->done = FALSE;
|
||||
|
||||
if (word->done &&
|
||||
(pass == 1) &&
|
||||
(word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != FREQ_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != USER_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != NUMBER_PERM)) {
|
||||
#ifndef SECURE_NAMES
|
||||
if (tessedit_rejection_debug)
|
||||
tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
|
||||
word->best_choice->unichar_string().string ());
|
||||
#endif
|
||||
word->done = FALSE;
|
||||
}
|
||||
}
|
||||
/*
|
||||
3: as 2 + only accept dict words or numerics in pass 2 as well
|
||||
*/
|
||||
else if (tessedit_ok_mode == 3) {
|
||||
word->done = word->tess_accepted &&
|
||||
(strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
|
||||
|
||||
if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
|
||||
word->done = FALSE;
|
||||
|
||||
if (word->done &&
|
||||
(word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != FREQ_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != USER_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != NUMBER_PERM)) {
|
||||
#ifndef SECURE_NAMES
|
||||
if (tessedit_rejection_debug)
|
||||
tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
|
||||
word->best_choice->unichar_string().string ());
|
||||
#endif
|
||||
word->done = FALSE;
|
||||
}
|
||||
}
|
||||
/*
|
||||
4: as 2 + reject dict ambigs in pass 1
|
||||
*/
|
||||
else if (tessedit_ok_mode == 4) {
|
||||
word->done = word->tess_accepted &&
|
||||
(strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
|
||||
|
||||
if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
|
||||
word->done = FALSE;
|
||||
|
||||
if (word->done &&
|
||||
(pass == 1) &&
|
||||
(((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != FREQ_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != USER_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != NUMBER_PERM)) ||
|
||||
(test_ambig_word (word)))) {
|
||||
#ifndef SECURE_NAMES
|
||||
if (tessedit_rejection_debug)
|
||||
tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
|
||||
word->best_choice->unichar_string().string ());
|
||||
#endif
|
||||
word->done = FALSE;
|
||||
}
|
||||
}
|
||||
/*
|
||||
5: as 3 + reject dict ambigs in both passes
|
||||
*/
|
||||
else if (tessedit_ok_mode == 5) {
|
||||
word->done = word->tess_accepted &&
|
||||
(strchr (word->best_choice->unichar_string().string (), ' ') == NULL);
|
||||
|
||||
if (word->done && (pass == 1) && one_ell_conflict (word, FALSE))
|
||||
word->done = FALSE;
|
||||
|
||||
if (word->done &&
|
||||
(((word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != FREQ_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != USER_DAWG_PERM) &&
|
||||
(word->best_choice->permuter () != NUMBER_PERM)) ||
|
||||
(test_ambig_word (word)))) {
|
||||
#ifndef SECURE_NAMES
|
||||
if (tessedit_rejection_debug)
|
||||
tprintf ("\nVETO Tess accepting poor word \"%s\"\n",
|
||||
word->best_choice->unichar_string().string ());
|
||||
#endif
|
||||
word->done = FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
else {
|
||||
tprintf ("BAD tessedit_ok_mode\n");
|
||||
err_exit();
|
||||
if (tessedit_rejection_debug) {
|
||||
tprintf("set_done(): done=%d\n", word->done);
|
||||
word->best_choice->print("");
|
||||
}
|
||||
}
|
||||
|
||||
@ -189,12 +89,7 @@ void Tesseract::set_done( //set done flag
|
||||
*
|
||||
* Sets a reject map for the word.
|
||||
*************************************************************************/
|
||||
void Tesseract::make_reject_map( //make rej map for wd //detailed results
|
||||
WERD_RES *word,
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices,
|
||||
ROW *row,
|
||||
inT16 pass //1st or 2nd?
|
||||
) {
|
||||
void Tesseract::make_reject_map(WERD_RES *word, ROW *row, inT16 pass) {
|
||||
int i;
|
||||
int offset;
|
||||
|
||||
@ -208,7 +103,7 @@ void Tesseract::make_reject_map( //make rej map for wd //detailed results
|
||||
*/
|
||||
if (tessedit_reject_mode == 0) {
|
||||
if (!word->done)
|
||||
reject_poor_matches(word, blob_choices);
|
||||
reject_poor_matches(word);
|
||||
} else if (tessedit_reject_mode == 5) {
|
||||
/*
|
||||
5: Reject I/1/l from words where there is no strong contextual confirmation;
|
||||
@ -313,45 +208,13 @@ void Tesseract::reject_I_1_L(WERD_RES *word) {
|
||||
} // namespace tesseract
|
||||
|
||||
|
||||
void reject_poor_matches( //detailed results
|
||||
WERD_RES *word,
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
||||
float threshold;
|
||||
inT16 i = 0;
|
||||
inT16 offset = 0;
|
||||
//super iterator
|
||||
BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
|
||||
BLOB_CHOICE_IT choice_it; //real iterator
|
||||
|
||||
#ifndef SECURE_NAMES
|
||||
if (strlen(word->best_choice->unichar_lengths().string()) !=
|
||||
list_it.length()) {
|
||||
tprintf
|
||||
("ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n",
|
||||
word->best_choice->unichar_string().string(),
|
||||
strlen (word->best_choice->unichar_lengths().string()), list_it.length(),
|
||||
word->box_word->length());
|
||||
}
|
||||
#endif
|
||||
ASSERT_HOST (strlen (word->best_choice->unichar_lengths().string ()) ==
|
||||
list_it.length ());
|
||||
ASSERT_HOST(word->box_word->length() == list_it.length());
|
||||
threshold = compute_reject_threshold (blob_choices);
|
||||
|
||||
for (list_it.mark_cycle_pt ();
|
||||
!list_it.cycled_list (); list_it.forward (), i++,
|
||||
offset += word->best_choice->unichar_lengths()[i]) {
|
||||
/* NB - only compares the threshold against the TOP choice char in the
|
||||
choices list for a blob !! - the selected one may be below the threshold
|
||||
*/
|
||||
choice_it.set_to_list (list_it.data ());
|
||||
if ((word->best_choice->unichar_string()[offset] == ' ') ||
|
||||
(choice_it.length () == 0))
|
||||
//rej unrecognised blobs
|
||||
word->reject_map[i].setrej_tess_failure ();
|
||||
else if (choice_it.data ()->certainty () < threshold)
|
||||
//rej poor score blob
|
||||
word->reject_map[i].setrej_poor_match ();
|
||||
void reject_poor_matches(WERD_RES *word) {
|
||||
float threshold = compute_reject_threshold(word->best_choice);
|
||||
for (int i = 0; i < word->best_choice->length(); ++i) {
|
||||
if (word->best_choice->unichar_id(i) == UNICHAR_SPACE)
|
||||
word->reject_map[i].setrej_tess_failure();
|
||||
else if (word->best_choice->certainty(i) < threshold)
|
||||
word->reject_map[i].setrej_poor_match();
|
||||
}
|
||||
}
|
||||
|
||||
@ -364,52 +227,32 @@ void reject_poor_matches( //detailed results
|
||||
* gap in the certainty value.
|
||||
**********************************************************************/
|
||||
|
||||
float compute_reject_threshold( //compute threshold //detailed results
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
||||
inT16 index; //to ratings
|
||||
inT16 blob_count; //no of blobs in word
|
||||
inT16 ok_blob_count = 0; //non TESS rej blobs in word
|
||||
float *ratings; //array of confidences
|
||||
float threshold; //rejection threshold
|
||||
float bestgap; //biggest gap
|
||||
float gapstart; //bottom of gap
|
||||
//super iterator
|
||||
BLOB_CHOICE_LIST_C_IT list_it = blob_choices;
|
||||
BLOB_CHOICE_IT choice_it; //real iterator
|
||||
float compute_reject_threshold(WERD_CHOICE* word) {
|
||||
float threshold; // rejection threshold
|
||||
float bestgap = 0.0f; // biggest gap
|
||||
float gapstart; // bottom of gap
|
||||
// super iterator
|
||||
BLOB_CHOICE_IT choice_it; // real iterator
|
||||
|
||||
blob_count = blob_choices->length ();
|
||||
ratings = (float *) alloc_mem (blob_count * sizeof (float));
|
||||
for (list_it.mark_cycle_pt (), index = 0;
|
||||
!list_it.cycled_list (); list_it.forward (), index++) {
|
||||
choice_it.set_to_list (list_it.data ());
|
||||
if (choice_it.length () > 0) {
|
||||
ratings[ok_blob_count] = choice_it.data ()->certainty ();
|
||||
//get in an array
|
||||
// tprintf("Rating[%d]=%c %g %g\n",
|
||||
// index,choice_it.data()->char_class(),
|
||||
// choice_it.data()->rating(),choice_it.data()->certainty());
|
||||
ok_blob_count++;
|
||||
}
|
||||
int blob_count = word->length();
|
||||
GenericVector<float> ratings;
|
||||
ratings.init_to_size(blob_count, 0.0f);
|
||||
for (int i = 0; i < blob_count; ++i) {
|
||||
ratings[i] = word->certainty(i);
|
||||
}
|
||||
ASSERT_HOST (index == blob_count);
|
||||
qsort (ratings, ok_blob_count, sizeof (float), sort_floats);
|
||||
//sort them
|
||||
bestgap = 0;
|
||||
gapstart = ratings[0] - 1; //all reject if none better
|
||||
if (ok_blob_count >= 3) {
|
||||
for (index = 0; index < ok_blob_count - 1; index++) {
|
||||
ratings.sort();
|
||||
gapstart = ratings[0] - 1; // all reject if none better
|
||||
if (blob_count >= 3) {
|
||||
for (int index = 0; index < blob_count - 1; index++) {
|
||||
if (ratings[index + 1] - ratings[index] > bestgap) {
|
||||
bestgap = ratings[index + 1] - ratings[index];
|
||||
//find biggest
|
||||
// find biggest
|
||||
gapstart = ratings[index];
|
||||
}
|
||||
}
|
||||
}
|
||||
threshold = gapstart + bestgap / 2;
|
||||
// tprintf("First=%g, last=%g, gap=%g, threshold=%g\n",
|
||||
// ratings[0],ratings[index],bestgap,threshold);
|
||||
|
||||
free_mem(ratings);
|
||||
return threshold;
|
||||
}
|
||||
|
||||
@ -680,21 +523,6 @@ BOOL8 Tesseract::word_contains_non_1_digit(const char *word,
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
||||
BOOL8 Tesseract::test_ambig_word( //test for ambiguity
|
||||
WERD_RES *word) {
|
||||
BOOL8 ambig = FALSE;
|
||||
|
||||
if ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
|
||||
(word->best_choice->permuter () == FREQ_DAWG_PERM) ||
|
||||
(word->best_choice->permuter () == USER_DAWG_PERM)) {
|
||||
ambig = !getDict().NoDangerousAmbig(
|
||||
word->best_choice, NULL, false, NULL, NULL);
|
||||
}
|
||||
return ambig;
|
||||
}
|
||||
|
||||
|
||||
/*************************************************************************
|
||||
* dont_allow_1Il()
|
||||
* Dont unreject LONE accepted 1Il conflict set chars
|
||||
@ -786,10 +614,9 @@ inT16 Tesseract::safe_dict_word(const WERD_RES *werd_res) {
|
||||
return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
|
||||
}
|
||||
|
||||
// Note: After running this function word_res->best_choice->blob_choices()
|
||||
// might not contain the right BLOB_CHOICE coresponding to each character
|
||||
// in word_res->best_choice. However, the length of blob_choices and
|
||||
// word_res->best_choice will remain the same.
|
||||
// Note: After running this function word_res->ratings
|
||||
// might not contain the right BLOB_CHOICE corresponding to each character
|
||||
// in word_res->best_choice.
|
||||
void Tesseract::flip_hyphens(WERD_RES *word_res) {
|
||||
WERD_CHOICE *best_choice = word_res->best_choice;
|
||||
int i;
|
||||
@ -801,16 +628,16 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) {
|
||||
if (tessedit_lower_flip_hyphen <= 1)
|
||||
return;
|
||||
|
||||
TBLOB* blob = word_res->rebuild_word->blobs;
|
||||
int num_blobs = word_res->rebuild_word->NumBlobs();
|
||||
UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
|
||||
bool modified = false;
|
||||
for (i = 0; i < best_choice->length() && blob != NULL; ++i,
|
||||
blob = blob->next) {
|
||||
for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
|
||||
TBLOB* blob = word_res->rebuild_word->blobs[i];
|
||||
out_box = blob->bounding_box();
|
||||
if (blob->next == NULL)
|
||||
if (i + 1 == num_blobs)
|
||||
next_left = 9999;
|
||||
else
|
||||
next_left = blob->next->bounding_box().left();
|
||||
next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
|
||||
// Dont touch small or touching blobs - it is too dangerous.
|
||||
if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
|
||||
(out_box.left() > prev_right) && (out_box.right() < next_left)) {
|
||||
@ -846,10 +673,9 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) {
|
||||
}
|
||||
}
|
||||
|
||||
// Note: After running this function word_res->best_choice->blob_choices()
|
||||
// might not contain the right BLOB_CHOICE coresponding to each character
|
||||
// in word_res->best_choice. However, the length of blob_choices and
|
||||
// word_res->best_choice will remain the same.
|
||||
// Note: After running this function word_res->ratings
|
||||
// might not contain the right BLOB_CHOICE corresponding to each character
|
||||
// in word_res->best_choice.
|
||||
void Tesseract::flip_0O(WERD_RES *word_res) {
|
||||
WERD_CHOICE *best_choice = word_res->best_choice;
|
||||
int i;
|
||||
@ -858,9 +684,9 @@ void Tesseract::flip_0O(WERD_RES *word_res) {
|
||||
if (!tessedit_flip_0O)
|
||||
return;
|
||||
|
||||
TBLOB* blob = word_res->rebuild_word->blobs;
|
||||
for (i = 0; i < best_choice->length() && blob != NULL; ++i,
|
||||
blob = blob->next) {
|
||||
int num_blobs = word_res->rebuild_word->NumBlobs();
|
||||
for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
|
||||
TBLOB* blob = word_res->rebuild_word->blobs[i];
|
||||
if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
|
||||
word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
|
||||
out_box = blob->bounding_box();
|
||||
|
@ -24,8 +24,8 @@
|
||||
#include "pageres.h"
|
||||
|
||||
void reject_blanks(WERD_RES *word);
|
||||
void reject_poor_matches(WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices);
|
||||
float compute_reject_threshold(BLOB_CHOICE_LIST_CLIST *blob_choices);
|
||||
void reject_poor_matches(WERD_RES *word);
|
||||
float compute_reject_threshold(WERD_CHOICE* word);
|
||||
BOOL8 word_contains_non_1_digit(const char *word, const char *word_lengths);
|
||||
void dont_allow_1Il(WERD_RES *word);
|
||||
void flip_hyphens(WERD_RES *word);
|
||||
|
@ -24,8 +24,9 @@
|
||||
|
||||
#include "platform.h"
|
||||
#include "ltrresultiterator.h"
|
||||
#include "genericvector.h"
|
||||
|
||||
template <typename T> class GenericVector;
|
||||
template <typename T> class GenericVectorEqEq;
|
||||
class BLOB_CHOICE_IT;
|
||||
class WERD_RES;
|
||||
class STRING;
|
||||
|
@ -31,6 +31,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "fileerr.h"
|
||||
#include "globaloc.h" // For err_exit.
|
||||
#include "tprintf.h"
|
||||
#include "img.h"
|
||||
#include "imgscale.h"
|
||||
|
@ -21,25 +21,22 @@
|
||||
#pragma warning(disable:4244) // Conversion warnings
|
||||
#endif
|
||||
|
||||
#include "tfacep.h"
|
||||
#include "tfacepp.h"
|
||||
#include "tessbox.h"
|
||||
#include "mfoutline.h"
|
||||
#include "tessbox.h"
|
||||
#include "tesseractclass.h"
|
||||
|
||||
#define EXTERN
|
||||
|
||||
/**
|
||||
* @name tess_segment_pass1
|
||||
* @name tess_segment_pass_n
|
||||
*
|
||||
* Segment a word using the pass1 conditions of the tess segmenter.
|
||||
* Segment a word using the pass_n conditions of the tess segmenter.
|
||||
* @param pass_n pass number
|
||||
* @param word word to do
|
||||
* @param blob_choices list of blob lists
|
||||
*/
|
||||
|
||||
namespace tesseract {
|
||||
void Tesseract::tess_segment_pass1(WERD_RES *word,
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
||||
void Tesseract::tess_segment_pass_n(int pass_n, WERD_RES *word) {
|
||||
int saved_enable_assoc = 0;
|
||||
int saved_chop_enable = 0;
|
||||
|
||||
@ -48,46 +45,17 @@ void Tesseract::tess_segment_pass1(WERD_RES *word,
|
||||
saved_chop_enable = chop_enable;
|
||||
wordrec_enable_assoc.set_value(0);
|
||||
chop_enable.set_value(0);
|
||||
if (word->word->flag(W_REP_CHAR))
|
||||
getDict().permute_only_top.set_value(true);
|
||||
}
|
||||
set_pass1();
|
||||
recog_word(word, blob_choices);
|
||||
if (pass_n == 1)
|
||||
set_pass1();
|
||||
else
|
||||
set_pass2();
|
||||
recog_word(word);
|
||||
if (word->best_choice == NULL)
|
||||
word->SetupFake(*word->uch_set);
|
||||
if (word->word->flag(W_DONT_CHOP)) {
|
||||
wordrec_enable_assoc.set_value(saved_enable_assoc);
|
||||
chop_enable.set_value(saved_chop_enable);
|
||||
getDict().permute_only_top.set_value(false);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @name tess_segment_pass2
|
||||
*
|
||||
* Segment a word using the pass2 conditions of the tess segmenter.
|
||||
* @param word word to do
|
||||
* @param blob_choices list of blob lists
|
||||
*/
|
||||
|
||||
void Tesseract::tess_segment_pass2(WERD_RES *word,
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
||||
int saved_enable_assoc = 0;
|
||||
int saved_chop_enable = 0;
|
||||
|
||||
if (word->word->flag(W_DONT_CHOP)) {
|
||||
saved_enable_assoc = wordrec_enable_assoc;
|
||||
saved_chop_enable = chop_enable;
|
||||
wordrec_enable_assoc.set_value(0);
|
||||
chop_enable.set_value(0);
|
||||
if (word->word->flag(W_REP_CHAR))
|
||||
getDict().permute_only_top.set_value(true);
|
||||
}
|
||||
set_pass2();
|
||||
recog_word(word, blob_choices);
|
||||
if (word->word->flag(W_DONT_CHOP)) {
|
||||
wordrec_enable_assoc.set_value(saved_enable_assoc);
|
||||
chop_enable.set_value(saved_chop_enable);
|
||||
getDict().permute_only_top.set_value(false);
|
||||
}
|
||||
}
|
||||
|
||||
@ -98,10 +66,8 @@ void Tesseract::tess_segment_pass2(WERD_RES *word,
|
||||
* @param word_choice after context
|
||||
* @param raw_choice before context
|
||||
*/
|
||||
BOOL8 Tesseract::tess_acceptable_word(
|
||||
WERD_CHOICE *word_choice, // after context
|
||||
WERD_CHOICE *raw_choice) { // before context
|
||||
return getDict().AcceptableResult(*word_choice);
|
||||
bool Tesseract::tess_acceptable_word(WERD_RES* word) {
|
||||
return getDict().AcceptableResult(word);
|
||||
}
|
||||
|
||||
|
||||
|
@ -17,30 +17,17 @@
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
//#include <osfcn.h>
|
||||
//#include <signal.h>
|
||||
//#include <time.h>
|
||||
//#include <unistd.h>
|
||||
#include "tfacep.h" //must be before main.h
|
||||
//#include "fileerr.h"
|
||||
#include "stderr.h"
|
||||
#include "basedir.h"
|
||||
#include "tessvars.h"
|
||||
//#include "debgwin.h"
|
||||
//#include "epapdest.h"
|
||||
#include "control.h"
|
||||
#include "imgs.h"
|
||||
#include "reject.h"
|
||||
#include "pageres.h"
|
||||
//#include "gpapdest.h"
|
||||
#include "nwmain.h"
|
||||
#include "pgedit.h"
|
||||
#include "tprintf.h"
|
||||
//#include "ipeerr.h"
|
||||
//#include "restart.h"
|
||||
#include "tessedit.h"
|
||||
//#include "fontfind.h"
|
||||
#include "permute.h"
|
||||
#include "stopper.h"
|
||||
#include "intmatcher.h"
|
||||
#include "chop.h"
|
||||
@ -190,9 +177,16 @@ bool Tesseract::init_tesseract_lang_data(
|
||||
if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n");
|
||||
right_to_left_ = unicharset.major_right_to_left();
|
||||
|
||||
// Setup initial unichar ambigs table and read universal ambigs.
|
||||
UNICHARSET encoder_unicharset;
|
||||
encoder_unicharset.CopyFrom(unicharset);
|
||||
unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
|
||||
unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
|
||||
|
||||
if (!tessedit_ambigs_training &&
|
||||
tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) {
|
||||
unichar_ambigs.LoadUnicharAmbigs(
|
||||
encoder_unicharset,
|
||||
tessdata_manager.GetDataFilePtr(),
|
||||
tessdata_manager.GetEndOffset(TESSDATA_AMBIGS),
|
||||
ambigs_debug_level, use_ambigs_for_adaption, &unicharset);
|
||||
@ -210,6 +204,23 @@ bool Tesseract::init_tesseract_lang_data(
|
||||
tprintf("Loaded Cube with combiner\n");
|
||||
}
|
||||
|
||||
// Init ParamsModel.
|
||||
// Load pass1 and pass2 weights (for now these two sets are the same, but in
|
||||
// the future separate sets of weights can be generated).
|
||||
for (int p = ParamsModel::PTRAIN_PASS1;
|
||||
p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {
|
||||
language_model_->getParamsModel().SetPass(
|
||||
static_cast<ParamsModel::PassEnum>(p));
|
||||
if (tessdata_manager.SeekToStart(TESSDATA_PARAMS_MODEL)) {
|
||||
if (!language_model_->getParamsModel().LoadFromFp(
|
||||
lang.string(), tessdata_manager.GetDataFilePtr(),
|
||||
tessdata_manager.GetEndOffset(TESSDATA_PARAMS_MODEL))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (tessdata_manager_debug_level) language_model_->getParamsModel().Print();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -323,6 +334,30 @@ int Tesseract::init_tesseract(
|
||||
tprintf("Tesseract couldn't load any languages!\n");
|
||||
return -1; // Couldn't load any language!
|
||||
}
|
||||
if (!sub_langs_.empty()) {
|
||||
// In multilingual mode word ratings have to be directly comparable,
|
||||
// so use the same language model weights for all languages:
|
||||
// use the primary language's params model if
|
||||
// tessedit_use_primary_params_model is set,
|
||||
// otherwise use default language model weights.
|
||||
if (tessedit_use_primary_params_model) {
|
||||
for (int s = 0; s < sub_langs_.size(); ++s) {
|
||||
sub_langs_[s]->language_model_->getParamsModel().Copy(
|
||||
this->language_model_->getParamsModel());
|
||||
}
|
||||
tprintf("Using params model of the primary language\n");
|
||||
if (tessdata_manager_debug_level) {
|
||||
this->language_model_->getParamsModel().Print();
|
||||
}
|
||||
} else {
|
||||
this->language_model_->getParamsModel().Clear();
|
||||
for (int s = 0; s < sub_langs_.size(); ++s) {
|
||||
sub_langs_[s]->language_model_->getParamsModel().Clear();
|
||||
}
|
||||
tprintf("Using default language params\n");
|
||||
}
|
||||
}
|
||||
|
||||
SetupUniversalFontIds();
|
||||
return 0;
|
||||
}
|
||||
@ -420,7 +455,7 @@ int Tesseract::init_tesseract_lm(const char *arg0,
|
||||
if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
|
||||
NULL, 0, NULL, NULL, false))
|
||||
return -1;
|
||||
getDict().Load();
|
||||
getDict().Load(Dict::GlobalDawgCache());
|
||||
tessdata_manager.End();
|
||||
return 0;
|
||||
}
|
||||
|
@ -221,16 +221,16 @@ bool TesseractCubeCombiner::ComputeCombinerFeatures(const string &tess_str,
|
||||
features->push_back(cube_best_bigram_cost);
|
||||
}
|
||||
// case-insensitive string comparison, including punctuation
|
||||
int compare_nocase_punc = CompareStrings(cube_best_str.c_str(),
|
||||
tess_str.c_str(), false, true);
|
||||
int compare_nocase_punc = CompareStrings(cube_best_str,
|
||||
tess_str, false, true);
|
||||
features->push_back(compare_nocase_punc == 0);
|
||||
// case-sensitive string comparison, ignoring punctuation
|
||||
int compare_case_nopunc = CompareStrings(cube_best_str.c_str(),
|
||||
tess_str.c_str(), true, false);
|
||||
int compare_case_nopunc = CompareStrings(cube_best_str,
|
||||
tess_str, true, false);
|
||||
features->push_back(compare_case_nopunc == 0);
|
||||
// case-insensitive string comparison, ignoring punctuation
|
||||
int compare_nocase_nopunc = CompareStrings(cube_best_str.c_str(),
|
||||
tess_str.c_str(), true, true);
|
||||
int compare_nocase_nopunc = CompareStrings(cube_best_str,
|
||||
tess_str, true, true);
|
||||
features->push_back(compare_nocase_nopunc == 0);
|
||||
return true;
|
||||
}
|
||||
|
@ -1,37 +0,0 @@
|
||||
/**********************************************************************
|
||||
* File: tfacep.h (Formerly tfacep.h)
|
||||
* Description: Declarations of C functions and C owned data.
|
||||
* Author: Ray Smith
|
||||
* Created: Mon Apr 27 12:51:28 BST 1992
|
||||
*
|
||||
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef TFACEP_H
|
||||
#define TFACEP_H
|
||||
|
||||
#include "host.h"
|
||||
#include "blobs.h"
|
||||
#include "tessarray.h"
|
||||
#include "oldlist.h"
|
||||
#include "permute.h"
|
||||
#include "blobclass.h"
|
||||
#include "stopper.h"
|
||||
#include "associate.h"
|
||||
#include "chop.h"
|
||||
#include "structures.h"
|
||||
|
||||
typedef void (*TESS_TESTER) (TBLOB *, BOOL8, char *, inT32, LIST);
|
||||
typedef LIST (*TESS_MATCHER) (TBLOB *, TBLOB *, TBLOB *);
|
||||
|
||||
#endif
|
@ -25,19 +25,12 @@
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#ifdef __UNIX__
|
||||
#include <assert.h>
|
||||
#endif
|
||||
#include "errcode.h"
|
||||
#include "ratngs.h"
|
||||
#include "reject.h"
|
||||
#include "werd.h"
|
||||
#include "tfacep.h"
|
||||
#include "tfacepp.h"
|
||||
#include "tessvars.h"
|
||||
#include "globals.h"
|
||||
#include "reject.h"
|
||||
#include "tesseractclass.h"
|
||||
#include "blamer.h"
|
||||
#include "errcode.h"
|
||||
#include "ratngs.h"
|
||||
#include "reject.h"
|
||||
#include "tesseractclass.h"
|
||||
#include "werd.h"
|
||||
|
||||
#define MAX_UNDIVIDED_LENGTH 24
|
||||
|
||||
@ -50,21 +43,30 @@
|
||||
* Convert the output back to editor form.
|
||||
**********************************************************************/
|
||||
namespace tesseract {
|
||||
void Tesseract::recog_word(WERD_RES *word,
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
||||
ASSERT_HOST(word->chopped_word->blobs != NULL);
|
||||
recog_word_recursive(word, blob_choices);
|
||||
void Tesseract::recog_word(WERD_RES *word) {
|
||||
if (wordrec_skip_no_truth_words && (word->blamer_bundle == NULL ||
|
||||
word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH)) {
|
||||
if (classify_debug_level) tprintf("No truth for word - skipping\n");
|
||||
word->tess_failed = true;
|
||||
return;
|
||||
}
|
||||
ASSERT_HOST(!word->chopped_word->blobs.empty());
|
||||
recog_word_recursive(word);
|
||||
word->SetupBoxWord();
|
||||
if ((word->best_choice->length() != word->box_word->length()) ||
|
||||
(word->best_choice->length() != blob_choices->length())) {
|
||||
if (word->best_choice->length() != word->box_word->length()) {
|
||||
tprintf("recog_word ASSERT FAIL String:\"%s\"; "
|
||||
"Strlen=%d; #Blobs=%d; #Choices=%d\n",
|
||||
"Strlen=%d; #Blobs=%d\n",
|
||||
word->best_choice->debug_string().string(),
|
||||
word->best_choice->length(), word->box_word->length(),
|
||||
blob_choices->length());
|
||||
word->best_choice->length(), word->box_word->length());
|
||||
}
|
||||
ASSERT_HOST(word->best_choice->length() == word->box_word->length());
|
||||
ASSERT_HOST(word->best_choice->length() == blob_choices->length());
|
||||
// Check that the ratings matrix size matches the sum of all the
|
||||
// segmentation states.
|
||||
if (!word->StatesAllValid()) {
|
||||
tprintf("Not all words have valid states relative to ratings matrix!!");
|
||||
word->DebugWordChoices(true, NULL);
|
||||
ASSERT_HOST(word->StatesAllValid());
|
||||
}
|
||||
if (tessedit_override_permuter) {
|
||||
/* Override the permuter type if a straight dictionary check disagrees. */
|
||||
uinT8 perm_type = word->best_choice->permuter();
|
||||
@ -105,31 +107,13 @@ void Tesseract::recog_word(WERD_RES *word,
|
||||
* Convert the word to tess form and pass it to the tess segmenter.
|
||||
* Convert the output back to editor form.
|
||||
**********************************************************************/
|
||||
void Tesseract::recog_word_recursive(WERD_RES *word,
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
||||
void Tesseract::recog_word_recursive(WERD_RES *word) {
|
||||
int word_length = word->chopped_word->NumBlobs(); // no of blobs
|
||||
if (word_length > MAX_UNDIVIDED_LENGTH) {
|
||||
return split_and_recog_word(word, blob_choices);
|
||||
return split_and_recog_word(word);
|
||||
}
|
||||
int initial_blob_choice_len = blob_choices->length();
|
||||
BLOB_CHOICE_LIST_VECTOR* tess_ratings = cc_recog(word);
|
||||
|
||||
// Put BLOB_CHOICE_LISTs from tess_ratings into blob_choices.
|
||||
BLOB_CHOICE_LIST_C_IT blob_choices_it(blob_choices);
|
||||
for (int i = 0; i < tess_ratings->length(); ++i) {
|
||||
blob_choices_it.add_to_end(tess_ratings->get(i));
|
||||
}
|
||||
delete tess_ratings;
|
||||
|
||||
cc_recog(word);
|
||||
word_length = word->rebuild_word->NumBlobs(); // No of blobs in output.
|
||||
// Pad raw_choice with spaces if needed.
|
||||
if (word->raw_choice->length() < word_length) {
|
||||
UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
|
||||
while (word->raw_choice->length() < word_length) {
|
||||
word->raw_choice->append_unichar_id(space_id, 1, 0.0,
|
||||
word->raw_choice->certainty());
|
||||
}
|
||||
}
|
||||
|
||||
// Do sanity checks and minor fixes on best_choice.
|
||||
if (word->best_choice->length() > word_length) {
|
||||
@ -141,21 +125,6 @@ void Tesseract::recog_word_recursive(WERD_RES *word,
|
||||
tprintf("Word is at:");
|
||||
word->word->bounding_box().print();
|
||||
}
|
||||
if (blob_choices->length() - initial_blob_choice_len != word_length) {
|
||||
word->best_choice->make_bad(); // force rejection
|
||||
tprintf("recog_word: Choices list len:%d; blob lists len:%d\n",
|
||||
blob_choices->length(), word_length);
|
||||
blob_choices_it.set_to_list(blob_choices); // list of lists
|
||||
while (blob_choices->length() - initial_blob_choice_len < word_length) {
|
||||
blob_choices_it.add_to_end(new BLOB_CHOICE_LIST()); // add a fake one
|
||||
tprintf("recog_word: Added dummy choice list\n");
|
||||
}
|
||||
while (blob_choices->length() - initial_blob_choice_len > word_length) {
|
||||
blob_choices_it.move_to_last(); // should never happen
|
||||
delete blob_choices_it.extract();
|
||||
tprintf("recog_word: Deleted choice list\n");
|
||||
}
|
||||
}
|
||||
if (word->best_choice->length() < word_length) {
|
||||
UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
|
||||
while (word->best_choice->length() < word_length) {
|
||||
@ -172,133 +141,134 @@ void Tesseract::recog_word_recursive(WERD_RES *word,
|
||||
* Split the word into 2 smaller pieces at the largest gap.
|
||||
* Recognize the pieces and stick the results back together.
|
||||
**********************************************************************/
|
||||
|
||||
void Tesseract::split_and_recog_word(WERD_RES *word,
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
||||
void Tesseract::split_and_recog_word(WERD_RES *word) {
|
||||
// Find the biggest blob gap in the chopped_word.
|
||||
int bestgap = -MAX_INT32;
|
||||
TPOINT best_split_pt;
|
||||
int split_index = 0;
|
||||
TBLOB* best_end = NULL;
|
||||
TBLOB* prev_blob = NULL;
|
||||
for (TBLOB* blob = word->chopped_word->blobs; blob != NULL;
|
||||
blob = blob->next) {
|
||||
if (prev_blob != NULL) {
|
||||
TBOX prev_box = prev_blob->bounding_box();
|
||||
TBOX blob_box = blob->bounding_box();
|
||||
int gap = blob_box.left() - prev_box.right();
|
||||
if (gap > bestgap) {
|
||||
bestgap = gap;
|
||||
best_end = prev_blob;
|
||||
best_split_pt.x = (prev_box.right() + blob_box.left()) / 2;
|
||||
best_split_pt.y = (prev_box.top() + prev_box.bottom() +
|
||||
blob_box.top() + blob_box.bottom()) / 4;
|
||||
}
|
||||
for (int b = 1; b < word->chopped_word->NumBlobs(); ++b) {
|
||||
TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box();
|
||||
TBOX blob_box = word->chopped_word->blobs[b]->bounding_box();
|
||||
int gap = blob_box.left() - prev_box.right();
|
||||
if (gap > bestgap) {
|
||||
bestgap = gap;
|
||||
split_index = b;
|
||||
}
|
||||
prev_blob = blob;
|
||||
}
|
||||
ASSERT_HOST(best_end != NULL);
|
||||
ASSERT_HOST(best_end->next != NULL);
|
||||
ASSERT_HOST(split_index > 0);
|
||||
|
||||
// Make a copy of the word to put the 2nd half in.
|
||||
WERD_RES* word2 = new WERD_RES(*word);
|
||||
// Blow away the copied chopped_word, as we want to work with the blobs
|
||||
// from the input chopped_word so the seam_arrays can be merged.
|
||||
delete word2->chopped_word;
|
||||
word2->chopped_word = new TWERD;
|
||||
word2->chopped_word->blobs = best_end->next;
|
||||
best_end->next = NULL;
|
||||
// Make a new seamarray on both words.
|
||||
free_seam_list(word->seam_array);
|
||||
word->seam_array = start_seam_list(word->chopped_word->blobs);
|
||||
word2->seam_array = start_seam_list(word2->chopped_word->blobs);
|
||||
BlamerBundle *orig_bb = word->blamer_bundle;
|
||||
STRING blamer_debug;
|
||||
// Try to adjust truth information.
|
||||
if (orig_bb != NULL) {
|
||||
// Find truth boxes that correspond to the split in the blobs.
|
||||
int b;
|
||||
int begin2_truth_index = -1;
|
||||
if (orig_bb->incorrect_result_reason != IRR_NO_TRUTH &&
|
||||
orig_bb->truth_has_char_boxes) {
|
||||
int end1_x = best_end->bounding_box().right();
|
||||
int begin2_x = word2->chopped_word->blobs->bounding_box().left();
|
||||
blamer_debug = "Looking for truth split at";
|
||||
blamer_debug.add_str_int(" end1_x ", end1_x);
|
||||
blamer_debug.add_str_int(" begin2_x ", begin2_x);
|
||||
blamer_debug += "\nnorm_truth_word boxes:\n";
|
||||
if (orig_bb->norm_truth_word.length() > 1) {
|
||||
orig_bb->norm_truth_word.BlobBox(0).append_debug(&blamer_debug);
|
||||
for (b = 1; b < orig_bb->norm_truth_word.length(); ++b) {
|
||||
orig_bb->norm_truth_word.BlobBox(b).append_debug(&blamer_debug);
|
||||
if ((abs(end1_x - orig_bb->norm_truth_word.BlobBox(b-1).right()) <
|
||||
orig_bb->norm_box_tolerance) &&
|
||||
(abs(begin2_x - orig_bb->norm_truth_word.BlobBox(b).left()) <
|
||||
orig_bb->norm_box_tolerance)) {
|
||||
begin2_truth_index = b;
|
||||
blamer_debug += "Split found\n";
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Populate truth information in word and word2 with the first and second
|
||||
// part of the original truth.
|
||||
word->blamer_bundle = new BlamerBundle();
|
||||
word2->blamer_bundle = new BlamerBundle();
|
||||
if (begin2_truth_index > 0) {
|
||||
word->blamer_bundle->truth_has_char_boxes = true;
|
||||
word->blamer_bundle->norm_box_tolerance = orig_bb->norm_box_tolerance;
|
||||
word2->blamer_bundle->truth_has_char_boxes = true;
|
||||
word2->blamer_bundle->norm_box_tolerance = orig_bb->norm_box_tolerance;
|
||||
BlamerBundle *curr_bb = word->blamer_bundle;
|
||||
for (b = 0; b < orig_bb->norm_truth_word.length(); ++b) {
|
||||
if (b == begin2_truth_index) curr_bb = word2->blamer_bundle;
|
||||
curr_bb->norm_truth_word.InsertBox(
|
||||
b, orig_bb->norm_truth_word.BlobBox(b));
|
||||
curr_bb->truth_word.InsertBox(b, orig_bb->truth_word.BlobBox(b));
|
||||
curr_bb->truth_text.push_back(orig_bb->truth_text[b]);
|
||||
}
|
||||
} else if (orig_bb->incorrect_result_reason == IRR_NO_TRUTH) {
|
||||
word->blamer_bundle->incorrect_result_reason = IRR_NO_TRUTH;
|
||||
word2->blamer_bundle->incorrect_result_reason = IRR_NO_TRUTH;
|
||||
} else {
|
||||
blamer_debug += "Truth split not found";
|
||||
blamer_debug += orig_bb->truth_has_char_boxes ?
|
||||
"\n" : " (no truth char boxes)\n";
|
||||
word->blamer_bundle->SetBlame(IRR_NO_TRUTH_SPLIT, blamer_debug,
|
||||
NULL, wordrec_debug_blamer);
|
||||
word2->blamer_bundle->SetBlame(IRR_NO_TRUTH_SPLIT, blamer_debug,
|
||||
NULL, wordrec_debug_blamer);
|
||||
}
|
||||
}
|
||||
WERD_RES *word2 = NULL;
|
||||
BlamerBundle *orig_bb = NULL;
|
||||
split_word(word, split_index, &word2, &orig_bb);
|
||||
|
||||
// Recognize the first part of the word.
|
||||
recog_word_recursive(word, blob_choices);
|
||||
recog_word_recursive(word);
|
||||
// Recognize the second part of the word.
|
||||
recog_word_recursive(word2, blob_choices);
|
||||
recog_word_recursive(word2);
|
||||
|
||||
join_words(word, word2, orig_bb);
|
||||
}
|
||||
|
||||
|
||||
/**********************************************************************
|
||||
* split_word
|
||||
*
|
||||
* Split a given WERD_RES in place into two smaller words for recognition.
|
||||
* split_pt is the index of the first blob to go in the second word.
|
||||
* The underlying word is left alone, only the TWERD (and subsequent data)
|
||||
* are split up. orig_blamer_bundle is set to the original blamer bundle,
|
||||
* and will now be owned by the caller. New blamer bundles are forged for the
|
||||
* two pieces.
|
||||
**********************************************************************/
|
||||
void Tesseract::split_word(WERD_RES *word,
|
||||
int split_pt,
|
||||
WERD_RES **right_piece,
|
||||
BlamerBundle **orig_blamer_bundle) const {
|
||||
ASSERT_HOST(split_pt >0 && split_pt < word->chopped_word->NumBlobs());
|
||||
|
||||
// Save a copy of the blamer bundle so we can try to reconstruct it below.
|
||||
BlamerBundle *orig_bb =
|
||||
word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : NULL;
|
||||
|
||||
WERD_RES *word2 = new WERD_RES(*word);
|
||||
|
||||
// blow away the copied chopped_word, as we want to work with
|
||||
// the blobs from the input chopped_word so seam_arrays can be merged.
|
||||
TWERD *chopped = word->chopped_word;
|
||||
TWERD *chopped2 = new TWERD;
|
||||
chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
|
||||
for (int i = split_pt; i < chopped->NumBlobs(); ++i) {
|
||||
chopped2->blobs.push_back(chopped->blobs[i]);
|
||||
}
|
||||
chopped->blobs.truncate(split_pt);
|
||||
word->chopped_word = NULL;
|
||||
delete word2->chopped_word;
|
||||
word2->chopped_word = NULL;
|
||||
|
||||
const UNICHARSET &unicharset = *word->uch_set;
|
||||
word->ClearResults();
|
||||
word2->ClearResults();
|
||||
word->chopped_word = chopped;
|
||||
word2->chopped_word = chopped2;
|
||||
word->SetupBasicsFromChoppedWord(unicharset);
|
||||
word2->SetupBasicsFromChoppedWord(unicharset);
|
||||
|
||||
// Try to adjust the blamer bundle.
|
||||
if (orig_bb != NULL) {
|
||||
// TODO(rays) Looks like a leak to me.
|
||||
// orig_bb should take, rather than copy.
|
||||
word->blamer_bundle = new BlamerBundle();
|
||||
word2->blamer_bundle = new BlamerBundle();
|
||||
orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
|
||||
word2->chopped_word->blobs[0]->bounding_box().left(),
|
||||
wordrec_debug_blamer,
|
||||
word->blamer_bundle, word2->blamer_bundle);
|
||||
}
|
||||
|
||||
*right_piece = word2;
|
||||
*orig_blamer_bundle = orig_bb;
|
||||
}
|
||||
|
||||
|
||||
/**********************************************************************
|
||||
* join_words
|
||||
*
|
||||
* The opposite of split_word():
|
||||
* join word2 (including any recognized data / seam array / etc)
|
||||
* onto the right of word and then delete word2.
|
||||
* Also, if orig_bb is provided, stitch it back into word.
|
||||
**********************************************************************/
|
||||
void Tesseract::join_words(WERD_RES *word,
|
||||
WERD_RES *word2,
|
||||
BlamerBundle *orig_bb) const {
|
||||
TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
|
||||
TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
|
||||
// Tack the word2 outputs onto the end of the word outputs.
|
||||
// New blobs might have appeared on the end of word1.
|
||||
for (best_end = word->chopped_word->blobs; best_end->next != NULL;
|
||||
best_end = best_end->next);
|
||||
best_end->next = word2->chopped_word->blobs;
|
||||
TBLOB* blob;
|
||||
for (blob = word->rebuild_word->blobs; blob->next != NULL; blob = blob->next);
|
||||
blob->next = word2->rebuild_word->blobs;
|
||||
word2->chopped_word->blobs = NULL;
|
||||
word2->rebuild_word->blobs = NULL;
|
||||
// Copy the seams onto the end of the word1 seam_array.
|
||||
word->chopped_word->blobs += word2->chopped_word->blobs;
|
||||
word->rebuild_word->blobs += word2->rebuild_word->blobs;
|
||||
word2->chopped_word->blobs.clear();
|
||||
word2->rebuild_word->blobs.clear();
|
||||
TPOINT split_pt;
|
||||
split_pt.x = (prev_box.right() + blob_box.left()) / 2;
|
||||
split_pt.y = (prev_box.top() + prev_box.bottom() +
|
||||
blob_box.top() + blob_box.bottom()) / 4;
|
||||
// Move the word2 seams onto the end of the word1 seam_array.
|
||||
// Since the seam list is one element short, an empty seam marking the
|
||||
// end of the last blob in the first word is needed first.
|
||||
word->seam_array = add_seam(word->seam_array,
|
||||
new_seam(0.0, best_split_pt, NULL, NULL, NULL));
|
||||
for (int i = 0; i < array_count(word2->seam_array); ++i) {
|
||||
SEAM* seam = reinterpret_cast<SEAM*>(array_value(word2->seam_array, i));
|
||||
array_value(word2->seam_array, i) = NULL;
|
||||
word->seam_array = add_seam(word->seam_array, seam);
|
||||
}
|
||||
word->seam_array.push_back(new SEAM(0.0f, split_pt, NULL, NULL, NULL));
|
||||
word->seam_array += word2->seam_array;
|
||||
word2->seam_array.truncate(0);
|
||||
// Fix widths and gaps.
|
||||
word->blob_widths += word2->blob_widths;
|
||||
word->blob_gaps += word2->blob_gaps;
|
||||
// Fix the ratings matrix.
|
||||
int rat1 = word->ratings->dimension();
|
||||
int rat2 = word2->ratings->dimension();
|
||||
word->ratings->AttachOnCorner(word2->ratings);
|
||||
ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
|
||||
word->best_state += word2->best_state;
|
||||
// Append the word choices.
|
||||
*word->best_choice += *word2->best_choice;
|
||||
*word->raw_choice += *word2->raw_choice;
|
||||
|
||||
// How many alt choices from each should we try to get?
|
||||
@ -306,70 +276,56 @@ void Tesseract::split_and_recog_word(WERD_RES *word,
|
||||
// When do we start throwing away extra alt choices?
|
||||
const int kTooManyAltChoices = 100;
|
||||
|
||||
if (word->alt_choices.size() > 0 && word2->alt_choices.size() > 0) {
|
||||
// Construct the cartesian product of the alt choices of word(1) and word2.
|
||||
int num_first_alt_choices = word->alt_choices.size();
|
||||
// Nota Bene: For the main loop here, we leave in place word1-only
|
||||
// alt_choices in
|
||||
// word->alt_choices[0] .. word_alt_choices[num_first_alt_choices - 1]
|
||||
// These will get fused with the best choices for word2 below.
|
||||
for (int j = 1; j < word2->alt_choices.size() &&
|
||||
(j <= kAltsPerPiece || word->alt_choices.size() < kTooManyAltChoices);
|
||||
j++) {
|
||||
for (int i = 0; i < num_first_alt_choices &&
|
||||
(i <= kAltsPerPiece ||
|
||||
word->alt_choices.size() < kTooManyAltChoices);
|
||||
i++) {
|
||||
WERD_CHOICE *wc = new WERD_CHOICE(*word->alt_choices[i]);
|
||||
*wc += *word2->alt_choices[j];
|
||||
word->alt_choices.push_back(wc);
|
||||
|
||||
word->alt_states.push_back(GenericVector<int>());
|
||||
GenericVector<int> &alt_state = word->alt_states.back();
|
||||
alt_state += word->alt_states[i];
|
||||
alt_state += word2->alt_states[j];
|
||||
}
|
||||
}
|
||||
// Now that we've filled in as many alternates as we want, paste the best
|
||||
// choice for word2 onto the original word alt_choices.
|
||||
for (int i = 0; i < num_first_alt_choices; i++) {
|
||||
*word->alt_choices[i] += *word2->alt_choices[0];
|
||||
word->alt_states[i] += word2->alt_states[0];
|
||||
// Construct the cartesian product of the best_choices of word(1) and word2.
|
||||
WERD_CHOICE_LIST joined_choices;
|
||||
WERD_CHOICE_IT jc_it(&joined_choices);
|
||||
WERD_CHOICE_IT bc1_it(&word->best_choices);
|
||||
WERD_CHOICE_IT bc2_it(&word2->best_choices);
|
||||
int num_word1_choices = word->best_choices.length();
|
||||
int total_joined_choices = num_word1_choices;
|
||||
// Nota Bene: For the main loop here, we operate only on the 2nd and greater
|
||||
// word2 choices, and put them in the joined_choices list. The 1st word2
|
||||
// choice gets added to the original word1 choices in-place after we have
|
||||
// finished with them.
|
||||
int bc2_index = 1;
|
||||
for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
|
||||
if (total_joined_choices >= kTooManyAltChoices &&
|
||||
bc2_index > kAltsPerPiece)
|
||||
break;
|
||||
int bc1_index = 0;
|
||||
for (bc1_it.move_to_first(); bc1_index < num_word1_choices;
|
||||
++bc1_index, bc1_it.forward()) {
|
||||
if (total_joined_choices >= kTooManyAltChoices &&
|
||||
bc1_index > kAltsPerPiece)
|
||||
break;
|
||||
WERD_CHOICE *wc = new WERD_CHOICE(*bc1_it.data());
|
||||
*wc += *bc2_it.data();
|
||||
jc_it.add_after_then_move(wc);
|
||||
++total_joined_choices;
|
||||
}
|
||||
}
|
||||
// Now that we've filled in as many alternates as we want, paste the best
|
||||
// choice for word2 onto the original word alt_choices.
|
||||
bc1_it.move_to_first();
|
||||
bc2_it.move_to_first();
|
||||
for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
|
||||
*bc1_it.data() += *bc2_it.data();
|
||||
}
|
||||
bc1_it.move_to_last();
|
||||
bc1_it.add_list_after(&joined_choices);
|
||||
|
||||
// Restore the pointer to original blamer bundle and combine blamer
|
||||
// information recorded in the splits.
|
||||
if (orig_bb != NULL) {
|
||||
IncorrectResultReason irr = orig_bb->incorrect_result_reason;
|
||||
if (irr != IRR_NO_TRUTH_SPLIT) blamer_debug = "";
|
||||
if (word->blamer_bundle->incorrect_result_reason != IRR_CORRECT &&
|
||||
word->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH &&
|
||||
word->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH_SPLIT) {
|
||||
blamer_debug += "Blame from part 1: ";
|
||||
blamer_debug += word->blamer_bundle->debug;
|
||||
irr = word->blamer_bundle->incorrect_result_reason;
|
||||
}
|
||||
if (word2->blamer_bundle->incorrect_result_reason != IRR_CORRECT &&
|
||||
word2->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH &&
|
||||
word2->blamer_bundle->incorrect_result_reason != IRR_NO_TRUTH_SPLIT) {
|
||||
blamer_debug += "Blame from part 2: ";
|
||||
blamer_debug += word2->blamer_bundle->debug;
|
||||
if (irr == IRR_CORRECT) {
|
||||
irr = word2->blamer_bundle->incorrect_result_reason;
|
||||
} else if (irr != word2->blamer_bundle->incorrect_result_reason) {
|
||||
irr = IRR_UNKNOWN;
|
||||
}
|
||||
}
|
||||
orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle,
|
||||
wordrec_debug_blamer);
|
||||
delete word->blamer_bundle;
|
||||
word->blamer_bundle = orig_bb;
|
||||
word->blamer_bundle->incorrect_result_reason = irr;
|
||||
if (irr != IRR_CORRECT && irr != IRR_NO_TRUTH) {
|
||||
word->blamer_bundle->SetBlame(irr, blamer_debug, NULL,
|
||||
wordrec_debug_blamer);
|
||||
}
|
||||
}
|
||||
word->SetupBoxWord();
|
||||
word->reject_map.initialise(word->box_word->length());
|
||||
delete word2;
|
||||
}
|
||||
|
||||
|
||||
} // namespace tesseract
|
||||
|
@ -1,41 +0,0 @@
|
||||
/**********************************************************************
|
||||
* File: tfacepp.h (Formerly tface++.h)
|
||||
* Description: C++ side of the C/C++ Tess/Editor interface.
|
||||
* Author: Ray Smith
|
||||
* Created: Thu Apr 23 15:39:23 BST 1992
|
||||
*
|
||||
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef TFACEPP_H
|
||||
#define TFACEPP_H
|
||||
|
||||
#include "ratngs.h"
|
||||
#include "blobs.h"
|
||||
#include "tesseractclass.h"
|
||||
|
||||
void call_tester( //call a tester
|
||||
TBLOB *tessblob, //blob to test
|
||||
BOOL8 correct_blob, //true if good
|
||||
char *text, //source text
|
||||
inT32 count, //chars in text
|
||||
LIST result //output of matcher
|
||||
);
|
||||
void call_train_tester( //call a tester
|
||||
TBLOB *tessblob, //blob to test
|
||||
BOOL8 correct_blob, //true if good
|
||||
char *text, //source text
|
||||
inT32 count, //chars in text
|
||||
LIST result //output of matcher
|
||||
);
|
||||
#endif
|
@ -27,7 +27,7 @@
|
||||
**********************************************************************/
|
||||
|
||||
WERD *make_pseudo_word(PAGE_RES* page_res, // Blocks to check.
|
||||
TBOX &selection_box,
|
||||
const TBOX &selection_box,
|
||||
BLOCK *&pseudo_block,
|
||||
ROW *&pseudo_row) { // Row of selection.
|
||||
PAGE_RES_IT pr_it(page_res);
|
||||
|
@ -23,7 +23,7 @@
|
||||
#include "pageres.h"
|
||||
|
||||
WERD *make_pseudo_word(PAGE_RES* page_res, // blocks to check
|
||||
TBOX &selection_box,
|
||||
const TBOX &selection_box,
|
||||
BLOCK *&pseudo_block,
|
||||
ROW *&pseudo_row);
|
||||
|
||||
|
@ -9,7 +9,7 @@ endif
|
||||
|
||||
include_HEADERS = publictypes.h
|
||||
noinst_HEADERS = \
|
||||
blckerr.h blobbox.h blobs.h blread.h boxread.h boxword.h ccstruct.h coutln.h crakedge.h \
|
||||
blamer.h blckerr.h blobbox.h blobs.h blread.h boxread.h boxword.h ccstruct.h coutln.h crakedge.h \
|
||||
detlinefit.h dppoint.h fontinfo.h genblob.h hpdsizes.h ipoints.h \
|
||||
linlsq.h matrix.h mod128.h normalis.h \
|
||||
ocrblock.h ocrpara.h ocrrow.h otsuthr.h \
|
||||
@ -31,12 +31,12 @@ libtesseract_ccstruct_la_LIBADD = \
|
||||
endif
|
||||
|
||||
libtesseract_ccstruct_la_SOURCES = \
|
||||
blobbox.cpp blobs.cpp blread.cpp boxread.cpp boxword.cpp ccstruct.cpp coutln.cpp \
|
||||
blamer.cpp blobbox.cpp blobs.cpp blread.cpp boxread.cpp boxword.cpp ccstruct.cpp coutln.cpp \
|
||||
detlinefit.cpp dppoint.cpp fontinfo.cpp genblob.cpp \
|
||||
linlsq.cpp matrix.cpp mod128.cpp normalis.cpp \
|
||||
ocrblock.cpp ocrpara.cpp ocrrow.cpp otsuthr.cpp \
|
||||
pageres.cpp pdblock.cpp points.cpp polyaprx.cpp polyblk.cpp \
|
||||
publictypes.cpp \
|
||||
params_training_featdef.cpp publictypes.cpp \
|
||||
quadlsq.cpp quadratc.cpp quspline.cpp ratngs.cpp rect.cpp rejctmap.cpp \
|
||||
seam.cpp split.cpp statistc.cpp stepblob.cpp \
|
||||
vecfuncs.cpp werd.cpp
|
||||
|
587
ccstruct/blamer.cpp
Normal file
587
ccstruct/blamer.cpp
Normal file
@ -0,0 +1,587 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: blamer.cpp
|
||||
// Description: Module allowing precise error causes to be allocated.
|
||||
// Author: Rike Antonova
|
||||
// Refactored: Ray Smith
|
||||
// Created: Mon Feb 04 14:37:01 PST 2013
|
||||
//
|
||||
// (C) Copyright 2013, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "blamer.h"
|
||||
#include "blobs.h"
|
||||
#include "matrix.h"
|
||||
#include "normalis.h"
|
||||
#include "pageres.h"
|
||||
|
||||
// Names for each value of IncorrectResultReason enum. Keep in sync.
|
||||
const char kBlameCorrect[] = "corr";
|
||||
const char kBlameClassifier[] = "cl";
|
||||
const char kBlameChopper[] = "chop";
|
||||
const char kBlameClassLMTradeoff[] = "cl/LM";
|
||||
const char kBlamePageLayout[] = "pglt";
|
||||
const char kBlameSegsearchHeur[] = "ss_heur";
|
||||
const char kBlameSegsearchPP[] = "ss_pp";
|
||||
const char kBlameClassOldLMTradeoff[] = "cl/old_LM";
|
||||
const char kBlameAdaption[] = "adapt";
|
||||
const char kBlameNoTruthSplit[] = "no_tr_spl";
|
||||
const char kBlameNoTruth[] = "no_tr";
|
||||
const char kBlameUnknown[] = "unkn";
|
||||
|
||||
const char * const kIncorrectResultReasonNames[] = {
|
||||
kBlameCorrect,
|
||||
kBlameClassifier,
|
||||
kBlameChopper,
|
||||
kBlameClassLMTradeoff,
|
||||
kBlamePageLayout,
|
||||
kBlameSegsearchHeur,
|
||||
kBlameSegsearchPP,
|
||||
kBlameClassOldLMTradeoff,
|
||||
kBlameAdaption,
|
||||
kBlameNoTruthSplit,
|
||||
kBlameNoTruth,
|
||||
kBlameUnknown
|
||||
};
|
||||
|
||||
const char *BlamerBundle::IncorrectReasonName(IncorrectResultReason irr) {
|
||||
return kIncorrectResultReasonNames[irr];
|
||||
}
|
||||
|
||||
const char *BlamerBundle::IncorrectReason() const {
|
||||
return kIncorrectResultReasonNames[incorrect_result_reason_];
|
||||
}
|
||||
|
||||
// Functions to setup the blamer.
|
||||
// Whole word string, whole word bounding box.
|
||||
void BlamerBundle::SetWordTruth(const UNICHARSET& unicharset,
|
||||
const char* truth_str, const TBOX& word_box) {
|
||||
truth_word_.InsertBox(0, word_box);
|
||||
truth_has_char_boxes_ = false;
|
||||
// Encode the string as UNICHAR_IDs.
|
||||
GenericVector<UNICHAR_ID> encoding;
|
||||
GenericVector<char> lengths;
|
||||
unicharset.encode_string(truth_str, false, &encoding, &lengths, NULL);
|
||||
int total_length = 0;
|
||||
for (int i = 0; i < encoding.size(); total_length += lengths[i++]) {
|
||||
STRING uch(truth_str + total_length);
|
||||
uch.truncate_at(lengths[i] - total_length);
|
||||
UNICHAR_ID id = encoding[i];
|
||||
if (id != INVALID_UNICHAR_ID) uch = unicharset.get_normed_unichar(id);
|
||||
truth_text_.push_back(uch);
|
||||
}
|
||||
}
|
||||
|
||||
// Single "character" string, "character" bounding box.
|
||||
// May be called multiple times to indicate the characters in a word.
|
||||
void BlamerBundle::SetSymbolTruth(const UNICHARSET& unicharset,
|
||||
const char* char_str, const TBOX& char_box) {
|
||||
STRING symbol_str(char_str);
|
||||
UNICHAR_ID id = unicharset.unichar_to_id(char_str);
|
||||
if (id != INVALID_UNICHAR_ID) {
|
||||
STRING normed_uch(unicharset.get_normed_unichar(id));
|
||||
if (normed_uch.length() > 0) symbol_str = normed_uch;
|
||||
}
|
||||
int length = truth_word_.length();
|
||||
truth_text_.push_back(symbol_str);
|
||||
truth_word_.InsertBox(length, char_box);
|
||||
if (length == 0)
|
||||
truth_has_char_boxes_ = true;
|
||||
else if (truth_word_.BlobBox(length - 1) == char_box)
|
||||
truth_has_char_boxes_ = false;
|
||||
}
|
||||
|
||||
// Marks that there is something wrong with the truth text, like it contains
|
||||
// reject characters.
|
||||
void BlamerBundle::SetRejectedTruth() {
|
||||
incorrect_result_reason_ = IRR_NO_TRUTH;
|
||||
truth_has_char_boxes_ = false;
|
||||
}
|
||||
|
||||
// Returns true if the provided word_choice is correct.
|
||||
bool BlamerBundle::ChoiceIsCorrect(const WERD_CHOICE* word_choice) const {
|
||||
if (word_choice == NULL) return false;
|
||||
const UNICHARSET* uni_set = word_choice->unicharset();
|
||||
STRING normed_choice_str;
|
||||
for (int i = 0; i < word_choice->length(); ++i) {
|
||||
normed_choice_str +=
|
||||
uni_set->get_normed_unichar(word_choice->unichar_id(i));
|
||||
}
|
||||
STRING truth_str = TruthString();
|
||||
return truth_str == normed_choice_str;
|
||||
}
|
||||
|
||||
void BlamerBundle::FillDebugString(const STRING &msg,
|
||||
const WERD_CHOICE *choice,
|
||||
STRING *debug) {
|
||||
(*debug) += "Truth ";
|
||||
for (int i = 0; i < this->truth_text_.length(); ++i) {
|
||||
(*debug) += this->truth_text_[i];
|
||||
}
|
||||
if (!this->truth_has_char_boxes_) (*debug) += " (no char boxes)";
|
||||
if (choice != NULL) {
|
||||
(*debug) += " Choice ";
|
||||
STRING choice_str;
|
||||
choice->string_and_lengths(&choice_str, NULL);
|
||||
(*debug) += choice_str;
|
||||
}
|
||||
if (msg.length() > 0) {
|
||||
(*debug) += "\n";
|
||||
(*debug) += msg;
|
||||
}
|
||||
(*debug) += "\n";
|
||||
}
|
||||
|
||||
// Sets up the norm_truth_word from truth_word using the given DENORM.
|
||||
void BlamerBundle::SetupNormTruthWord(const DENORM& denorm) {
|
||||
// TODO(rays) Is this the last use of denorm in WERD_RES and can it go?
|
||||
norm_box_tolerance_ = kBlamerBoxTolerance * denorm.x_scale();
|
||||
TPOINT topleft;
|
||||
TPOINT botright;
|
||||
TPOINT norm_topleft;
|
||||
TPOINT norm_botright;
|
||||
for (int b = 0; b < truth_word_.length(); ++b) {
|
||||
const TBOX &box = truth_word_.BlobBox(b);
|
||||
topleft.x = box.left();
|
||||
topleft.y = box.top();
|
||||
botright.x = box.right();
|
||||
botright.y = box.bottom();
|
||||
denorm.NormTransform(NULL, topleft, &norm_topleft);
|
||||
denorm.NormTransform(NULL, botright, &norm_botright);
|
||||
TBOX norm_box(norm_topleft.x, norm_botright.y,
|
||||
norm_botright.x, norm_topleft.y);
|
||||
norm_truth_word_.InsertBox(b, norm_box);
|
||||
}
|
||||
}
|
||||
|
||||
// Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
|
||||
// bundles) where the right edge/ of the left-hand word is word1_right,
|
||||
// and the left edge of the right-hand word is word2_left.
|
||||
void BlamerBundle::SplitBundle(int word1_right, int word2_left, bool debug,
|
||||
BlamerBundle* bundle1,
|
||||
BlamerBundle* bundle2) const {
|
||||
STRING debug_str;
|
||||
// Find truth boxes that correspond to the split in the blobs.
|
||||
int b;
|
||||
int begin2_truth_index = -1;
|
||||
if (incorrect_result_reason_ != IRR_NO_TRUTH &&
|
||||
truth_has_char_boxes_) {
|
||||
debug_str = "Looking for truth split at";
|
||||
debug_str.add_str_int(" end1_x ", word1_right);
|
||||
debug_str.add_str_int(" begin2_x ", word2_left);
|
||||
debug_str += "\nnorm_truth_word boxes:\n";
|
||||
if (norm_truth_word_.length() > 1) {
|
||||
norm_truth_word_.BlobBox(0).print_to_str(&debug_str);
|
||||
for (b = 1; b < norm_truth_word_.length(); ++b) {
|
||||
norm_truth_word_.BlobBox(b).print_to_str(&debug_str);
|
||||
if ((abs(word1_right - norm_truth_word_.BlobBox(b - 1).right()) <
|
||||
norm_box_tolerance_) &&
|
||||
(abs(word2_left - norm_truth_word_.BlobBox(b).left()) <
|
||||
norm_box_tolerance_)) {
|
||||
begin2_truth_index = b;
|
||||
debug_str += "Split found";
|
||||
break;
|
||||
}
|
||||
}
|
||||
debug_str += '\n';
|
||||
}
|
||||
}
|
||||
// Populate truth information in word and word2 with the first and second
|
||||
// part of the original truth.
|
||||
if (begin2_truth_index > 0) {
|
||||
bundle1->truth_has_char_boxes_ = true;
|
||||
bundle1->norm_box_tolerance_ = norm_box_tolerance_;
|
||||
bundle2->truth_has_char_boxes_ = true;
|
||||
bundle2->norm_box_tolerance_ = norm_box_tolerance_;
|
||||
BlamerBundle *curr_bb = bundle1;
|
||||
for (b = 0; b < norm_truth_word_.length(); ++b) {
|
||||
if (b == begin2_truth_index) curr_bb = bundle2;
|
||||
curr_bb->norm_truth_word_.InsertBox(b, norm_truth_word_.BlobBox(b));
|
||||
curr_bb->truth_word_.InsertBox(b, truth_word_.BlobBox(b));
|
||||
curr_bb->truth_text_.push_back(truth_text_[b]);
|
||||
}
|
||||
} else if (incorrect_result_reason_ == IRR_NO_TRUTH) {
|
||||
bundle1->incorrect_result_reason_ = IRR_NO_TRUTH;
|
||||
bundle2->incorrect_result_reason_ = IRR_NO_TRUTH;
|
||||
} else {
|
||||
debug_str += "Truth split not found";
|
||||
debug_str += truth_has_char_boxes_ ?
|
||||
"\n" : " (no truth char boxes)\n";
|
||||
bundle1->SetBlame(IRR_NO_TRUTH_SPLIT, debug_str, NULL, debug);
|
||||
bundle2->SetBlame(IRR_NO_TRUTH_SPLIT, debug_str, NULL, debug);
|
||||
}
|
||||
}
|
||||
|
||||
// "Joins" the blames from bundle1 and bundle2 into *this.
|
||||
void BlamerBundle::JoinBlames(const BlamerBundle& bundle1,
|
||||
const BlamerBundle& bundle2, bool debug) {
|
||||
STRING debug_str;
|
||||
IncorrectResultReason irr = incorrect_result_reason_;
|
||||
if (irr != IRR_NO_TRUTH_SPLIT) debug_str = "";
|
||||
if (bundle1.incorrect_result_reason_ != IRR_CORRECT &&
|
||||
bundle1.incorrect_result_reason_ != IRR_NO_TRUTH &&
|
||||
bundle1.incorrect_result_reason_ != IRR_NO_TRUTH_SPLIT) {
|
||||
debug_str += "Blame from part 1: ";
|
||||
debug_str += bundle1.debug_;
|
||||
irr = bundle1.incorrect_result_reason_;
|
||||
}
|
||||
if (bundle2.incorrect_result_reason_ != IRR_CORRECT &&
|
||||
bundle2.incorrect_result_reason_ != IRR_NO_TRUTH &&
|
||||
bundle2.incorrect_result_reason_ != IRR_NO_TRUTH_SPLIT) {
|
||||
debug_str += "Blame from part 2: ";
|
||||
debug_str += bundle2.debug_;
|
||||
if (irr == IRR_CORRECT) {
|
||||
irr = bundle2.incorrect_result_reason_;
|
||||
} else if (irr != bundle2.incorrect_result_reason_) {
|
||||
irr = IRR_UNKNOWN;
|
||||
}
|
||||
}
|
||||
incorrect_result_reason_ = irr;
|
||||
if (irr != IRR_CORRECT && irr != IRR_NO_TRUTH) {
|
||||
SetBlame(irr, debug_str, NULL, debug);
|
||||
}
|
||||
}
|
||||
|
||||
// If a blob with the same bounding box as one of the truth character
|
||||
// bounding boxes is not classified as the corresponding truth character
|
||||
// blames character classifier for incorrect answer.
|
||||
void BlamerBundle::BlameClassifier(const UNICHARSET& unicharset,
|
||||
const TBOX& blob_box,
|
||||
const BLOB_CHOICE_LIST& choices,
|
||||
bool debug) {
|
||||
if (!truth_has_char_boxes_ ||
|
||||
incorrect_result_reason_ != IRR_CORRECT)
|
||||
return; // Nothing to do here.
|
||||
|
||||
for (int b = 0; b < norm_truth_word_.length(); ++b) {
|
||||
const TBOX &truth_box = norm_truth_word_.BlobBox(b);
|
||||
// Note that we are more strict on the bounding box boundaries here
|
||||
// than in other places (chopper, segmentation search), since we do
|
||||
// not have the ability to check the previous and next bounding box.
|
||||
if (blob_box.x_almost_equal(truth_box, norm_box_tolerance_/2)) {
|
||||
bool found = false;
|
||||
bool incorrect_adapted = false;
|
||||
UNICHAR_ID incorrect_adapted_id = INVALID_UNICHAR_ID;
|
||||
const char *truth_str = truth_text_[b].string();
|
||||
// We promise not to modify the list or its contents, using a
|
||||
// const BLOB_CHOICE* below.
|
||||
BLOB_CHOICE_IT choices_it(const_cast<BLOB_CHOICE_LIST*>(&choices));
|
||||
for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
|
||||
choices_it.forward()) {
|
||||
const BLOB_CHOICE* choice = choices_it.data();
|
||||
if (strcmp(truth_str, unicharset.get_normed_unichar(
|
||||
choice->unichar_id())) == 0) {
|
||||
found = true;
|
||||
break;
|
||||
} else if (choice->IsAdapted()) {
|
||||
incorrect_adapted = true;
|
||||
incorrect_adapted_id = choice->unichar_id();
|
||||
}
|
||||
} // end choices_it for loop
|
||||
if (!found) {
|
||||
STRING debug_str = "unichar ";
|
||||
debug_str += truth_str;
|
||||
debug_str += " not found in classification list";
|
||||
SetBlame(IRR_CLASSIFIER, debug_str, NULL, debug);
|
||||
} else if (incorrect_adapted) {
|
||||
STRING debug_str = "better rating for adapted ";
|
||||
debug_str += unicharset.id_to_unichar(incorrect_adapted_id);
|
||||
debug_str += " than for correct ";
|
||||
debug_str += truth_str;
|
||||
SetBlame(IRR_ADAPTION, debug_str, NULL, debug);
|
||||
}
|
||||
break;
|
||||
}
|
||||
} // end iterating over blamer_bundle->norm_truth_word
|
||||
}
|
||||
|
||||
// Checks whether chops were made at all the character bounding box
|
||||
// boundaries in word->truth_word. If not - blames the chopper for an
|
||||
// incorrect answer.
|
||||
void BlamerBundle::SetChopperBlame(const WERD_RES* word, bool debug) {
|
||||
if (NoTruth() || !truth_has_char_boxes_ ||
|
||||
word->chopped_word->blobs.empty()) {
|
||||
return;
|
||||
}
|
||||
STRING debug_str;
|
||||
bool missing_chop = false;
|
||||
int num_blobs = word->chopped_word->blobs.size();
|
||||
int box_index = 0;
|
||||
int blob_index = 0;
|
||||
inT16 truth_x;
|
||||
while (box_index < truth_word_.length() && blob_index < num_blobs) {
|
||||
truth_x = norm_truth_word_.BlobBox(box_index).right();
|
||||
TBLOB * curr_blob = word->chopped_word->blobs[blob_index];
|
||||
if (curr_blob->bounding_box().right() < truth_x - norm_box_tolerance_) {
|
||||
++blob_index;
|
||||
continue; // encountered an extra chop, keep looking
|
||||
} else if (curr_blob->bounding_box().right() >
|
||||
truth_x + norm_box_tolerance_) {
|
||||
missing_chop = true;
|
||||
break;
|
||||
} else {
|
||||
++blob_index;
|
||||
}
|
||||
}
|
||||
if (missing_chop || box_index < norm_truth_word_.length()) {
|
||||
STRING debug_str;
|
||||
if (missing_chop) {
|
||||
debug_str.add_str_int("Detected missing chop (tolerance=",
|
||||
norm_box_tolerance_);
|
||||
debug_str += ") at Bounding Box=";
|
||||
TBLOB * curr_blob = word->chopped_word->blobs[blob_index];
|
||||
curr_blob->bounding_box().print_to_str(&debug_str);
|
||||
debug_str.add_str_int("\nNo chop for truth at x=", truth_x);
|
||||
} else {
|
||||
debug_str.add_str_int("Missing chops for last ",
|
||||
norm_truth_word_.length() - box_index);
|
||||
debug_str += " truth box(es)";
|
||||
}
|
||||
debug_str += "\nMaximally chopped word boxes:\n";
|
||||
for (blob_index = 0; blob_index < num_blobs; ++blob_index) {
|
||||
TBLOB * curr_blob = word->chopped_word->blobs[blob_index];
|
||||
curr_blob->bounding_box().print_to_str(&debug_str);
|
||||
debug_str += '\n';
|
||||
}
|
||||
debug_str += "Truth bounding boxes:\n";
|
||||
for (box_index = 0; box_index < norm_truth_word_.length(); ++box_index) {
|
||||
norm_truth_word_.BlobBox(box_index).print_to_str(&debug_str);
|
||||
debug_str += '\n';
|
||||
}
|
||||
SetBlame(IRR_CHOPPER, debug_str, word->best_choice, debug);
|
||||
}
|
||||
}
|
||||
|
||||
// Blames the classifier or the language model if, after running only the
|
||||
// chopper, best_choice is incorrect and no blame has been yet set.
|
||||
// Blames the classifier if best_choice is classifier's top choice and is a
|
||||
// dictionary word (i.e. language model could not have helped).
|
||||
// Otherwise, blames the language model (formerly permuter word adjustment).
|
||||
void BlamerBundle::BlameClassifierOrLangModel(
|
||||
const WERD_RES* word,
|
||||
const UNICHARSET& unicharset, bool valid_permuter, bool debug) {
|
||||
if (valid_permuter) {
|
||||
// Find out whether best choice is a top choice.
|
||||
best_choice_is_dict_and_top_choice_ = true;
|
||||
for (int i = 0; i < word->best_choice->length(); ++i) {
|
||||
BLOB_CHOICE_IT blob_choice_it(word->GetBlobChoices(i));
|
||||
ASSERT_HOST(!blob_choice_it.empty());
|
||||
BLOB_CHOICE *first_choice = NULL;
|
||||
for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
|
||||
blob_choice_it.forward()) { // find first non-fragment choice
|
||||
if (!(unicharset.get_fragment(blob_choice_it.data()->unichar_id()))) {
|
||||
first_choice = blob_choice_it.data();
|
||||
break;
|
||||
}
|
||||
}
|
||||
ASSERT_HOST(first_choice != NULL);
|
||||
if (first_choice->unichar_id() != word->best_choice->unichar_id(i)) {
|
||||
best_choice_is_dict_and_top_choice_ = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
STRING debug_str;
|
||||
if (best_choice_is_dict_and_top_choice_) {
|
||||
debug_str = "Best choice is: incorrect, top choice, dictionary word";
|
||||
debug_str += " with permuter ";
|
||||
debug_str += word->best_choice->permuter_name();
|
||||
} else {
|
||||
debug_str = "Classifier/Old LM tradeoff is to blame";
|
||||
}
|
||||
SetBlame(best_choice_is_dict_and_top_choice_ ? IRR_CLASSIFIER
|
||||
: IRR_CLASS_OLD_LM_TRADEOFF,
|
||||
debug_str, word->best_choice, debug);
|
||||
}
|
||||
|
||||
// Sets up the correct_segmentation_* to mark the correct bounding boxes.
|
||||
void BlamerBundle::SetupCorrectSegmentation(const TWERD* word, bool debug) {
|
||||
params_training_bundle_.StartHypothesisList();
|
||||
if (incorrect_result_reason_ != IRR_CORRECT || !truth_has_char_boxes_)
|
||||
return; // Nothing to do here.
|
||||
|
||||
STRING debug_str;
|
||||
debug_str += "Blamer computing correct_segmentation_cols\n";
|
||||
int curr_box_col = 0;
|
||||
int next_box_col = 0;
|
||||
int num_blobs = word->NumBlobs();
|
||||
if (num_blobs == 0) return; // No blobs to play with.
|
||||
int blob_index = 0;
|
||||
inT16 next_box_x = word->blobs[blob_index]->bounding_box().right();
|
||||
for (int truth_idx = 0; blob_index < num_blobs &&
|
||||
truth_idx < norm_truth_word_.length();
|
||||
++blob_index) {
|
||||
++next_box_col;
|
||||
inT16 curr_box_x = next_box_x;
|
||||
if (blob_index + 1 < num_blobs)
|
||||
next_box_x = word->blobs[blob_index + 1]->bounding_box().right();
|
||||
inT16 truth_x = norm_truth_word_.BlobBox(truth_idx).right();
|
||||
debug_str.add_str_int("Box x coord vs. truth: ", curr_box_x);
|
||||
debug_str.add_str_int(" ", truth_x);
|
||||
debug_str += "\n";
|
||||
if (curr_box_x > (truth_x + norm_box_tolerance_)) {
|
||||
break; // failed to find a matching box
|
||||
} else if (curr_box_x >= truth_x - norm_box_tolerance_ && // matched
|
||||
(blob_index + 1 >= num_blobs || // next box can't be included
|
||||
next_box_x > truth_x + norm_box_tolerance_)) {
|
||||
correct_segmentation_cols_.push_back(curr_box_col);
|
||||
correct_segmentation_rows_.push_back(next_box_col-1);
|
||||
++truth_idx;
|
||||
debug_str.add_str_int("col=", curr_box_col);
|
||||
debug_str.add_str_int(" row=", next_box_col-1);
|
||||
debug_str += "\n";
|
||||
curr_box_col = next_box_col;
|
||||
}
|
||||
}
|
||||
if (blob_index < num_blobs || // trailing blobs
|
||||
correct_segmentation_cols_.length() != norm_truth_word_.length()) {
|
||||
debug_str.add_str_int("Blamer failed to find correct segmentation"
|
||||
" (tolerance=", norm_box_tolerance_);
|
||||
if (blob_index >= num_blobs) debug_str += " blob == NULL";
|
||||
debug_str += ")\n";
|
||||
debug_str.add_str_int(" path length ", correct_segmentation_cols_.length());
|
||||
debug_str.add_str_int(" vs. truth ", norm_truth_word_.length());
|
||||
debug_str += "\n";
|
||||
SetBlame(IRR_UNKNOWN, debug_str, NULL, debug);
|
||||
correct_segmentation_cols_.clear();
|
||||
correct_segmentation_rows_.clear();
|
||||
}
|
||||
}
|
||||
|
||||
// Returns true if a guided segmentation search is needed.
|
||||
bool BlamerBundle::GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const {
|
||||
return incorrect_result_reason_ == IRR_CORRECT &&
|
||||
!segsearch_is_looking_for_blame_ &&
|
||||
truth_has_char_boxes_ &&
|
||||
!ChoiceIsCorrect(best_choice);
|
||||
}
|
||||
|
||||
// Setup ready to guide the segmentation search to the correct segmentation.
|
||||
// The callback pp_cb is used to avoid a cyclic dependency.
|
||||
// It calls into LMPainPoints::GenerateForBlamer by pre-binding the
|
||||
// WERD_RES, and the LMPainPoints itself.
|
||||
// pp_cb must be a permanent callback, and should be deleted by the caller.
|
||||
void BlamerBundle::InitForSegSearch(const WERD_CHOICE *best_choice,
|
||||
MATRIX* ratings, UNICHAR_ID wildcard_id,
|
||||
bool debug, STRING *debug_str,
|
||||
TessResultCallback2<bool, int, int>* cb) {
|
||||
segsearch_is_looking_for_blame_ = true;
|
||||
if (debug) {
|
||||
tprintf("segsearch starting to look for blame\n");
|
||||
}
|
||||
// Fill pain points for any unclassifed blob corresponding to the
|
||||
// correct segmentation state.
|
||||
*debug_str += "Correct segmentation:\n";
|
||||
for (int idx = 0; idx < correct_segmentation_cols_.length(); ++idx) {
|
||||
debug_str->add_str_int("col=", correct_segmentation_cols_[idx]);
|
||||
debug_str->add_str_int(" row=", correct_segmentation_rows_[idx]);
|
||||
*debug_str += "\n";
|
||||
if (!ratings->Classified(correct_segmentation_cols_[idx],
|
||||
correct_segmentation_rows_[idx],
|
||||
wildcard_id) &&
|
||||
!cb->Run(correct_segmentation_cols_[idx],
|
||||
correct_segmentation_rows_[idx])) {
|
||||
segsearch_is_looking_for_blame_ = false;
|
||||
*debug_str += "\nFailed to insert pain point\n";
|
||||
SetBlame(IRR_SEGSEARCH_HEUR, *debug_str, best_choice, debug);
|
||||
break;
|
||||
}
|
||||
} // end for blamer_bundle->correct_segmentation_cols/rows
|
||||
}
|
||||
// Returns true if the guided segsearch is in progress.
|
||||
bool BlamerBundle::GuidedSegsearchStillGoing() const {
|
||||
return segsearch_is_looking_for_blame_;
|
||||
}
|
||||
|
||||
// The segmentation search has ended. Sets the blame appropriately.
|
||||
void BlamerBundle::FinishSegSearch(const WERD_CHOICE *best_choice,
|
||||
bool debug, STRING *debug_str) {
|
||||
// If we are still looking for blame (i.e. best_choice is incorrect, but a
|
||||
// path representing the correct segmentation could be constructed), we can
|
||||
// blame segmentation search pain point prioritization if the rating of the
|
||||
// path corresponding to the correct segmentation is better than that of
|
||||
// best_choice (i.e. language model would have done the correct thing, but
|
||||
// because of poor pain point prioritization the correct segmentation was
|
||||
// never explored). Otherwise we blame the tradeoff between the language model
|
||||
// and the classifier, since even after exploring the path corresponding to
|
||||
// the correct segmentation incorrect best_choice would have been chosen.
|
||||
// One special case when we blame the classifier instead is when best choice
|
||||
// is incorrect, but it is a dictionary word and it classifier's top choice.
|
||||
if (segsearch_is_looking_for_blame_) {
|
||||
segsearch_is_looking_for_blame_ = false;
|
||||
if (best_choice_is_dict_and_top_choice_) {
|
||||
*debug_str = "Best choice is: incorrect, top choice, dictionary word";
|
||||
*debug_str += " with permuter ";
|
||||
*debug_str += best_choice->permuter_name();
|
||||
SetBlame(IRR_CLASSIFIER, *debug_str, best_choice, debug);
|
||||
} else if (best_correctly_segmented_rating_ <
|
||||
best_choice->rating()) {
|
||||
*debug_str += "Correct segmentation state was not explored";
|
||||
SetBlame(IRR_SEGSEARCH_PP, *debug_str, best_choice, debug);
|
||||
} else {
|
||||
if (best_correctly_segmented_rating_ >=
|
||||
WERD_CHOICE::kBadRating) {
|
||||
*debug_str += "Correct segmentation paths were pruned by LM\n";
|
||||
} else {
|
||||
debug_str->add_str_double("Best correct segmentation rating ",
|
||||
best_correctly_segmented_rating_);
|
||||
debug_str->add_str_double(" vs. best choice rating ",
|
||||
best_choice->rating());
|
||||
}
|
||||
SetBlame(IRR_CLASS_LM_TRADEOFF, *debug_str, best_choice, debug);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If the bundle is null or still does not indicate the correct result,
|
||||
// fix it and use some backup reason for the blame.
|
||||
void BlamerBundle::LastChanceBlame(bool debug, WERD_RES* word) {
|
||||
if (word->blamer_bundle == NULL) {
|
||||
word->blamer_bundle = new BlamerBundle();
|
||||
word->blamer_bundle->SetBlame(IRR_PAGE_LAYOUT, "LastChanceBlame",
|
||||
word->best_choice, debug);
|
||||
} else if (word->blamer_bundle->incorrect_result_reason_ == IRR_NO_TRUTH) {
|
||||
word->blamer_bundle->SetBlame(IRR_NO_TRUTH, "Rejected truth",
|
||||
word->best_choice, debug);
|
||||
} else {
|
||||
bool correct = word->blamer_bundle->ChoiceIsCorrect(word->best_choice);
|
||||
IncorrectResultReason irr = word->blamer_bundle->incorrect_result_reason_;
|
||||
if (irr == IRR_CORRECT && !correct) {
|
||||
STRING debug_str = "Choice is incorrect after recognition";
|
||||
word->blamer_bundle->SetBlame(IRR_UNKNOWN, debug_str, word->best_choice,
|
||||
debug);
|
||||
} else if (irr != IRR_CORRECT && correct) {
|
||||
if (debug) {
|
||||
tprintf("Corrected %s\n", word->blamer_bundle->debug_.string());
|
||||
}
|
||||
word->blamer_bundle->incorrect_result_reason_ = IRR_CORRECT;
|
||||
word->blamer_bundle->debug_ = "";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Sets the misadaption debug if this word is incorrect, as this word is
|
||||
// being adapted to.
|
||||
void BlamerBundle::SetMisAdaptionDebug(const WERD_CHOICE *best_choice,
|
||||
bool debug) {
|
||||
if (incorrect_result_reason_ != IRR_NO_TRUTH &&
|
||||
!ChoiceIsCorrect(best_choice)) {
|
||||
misadaption_debug_ ="misadapt to word (";
|
||||
misadaption_debug_ += best_choice->permuter_name();
|
||||
misadaption_debug_ += "): ";
|
||||
FillDebugString("", best_choice, &misadaption_debug_);
|
||||
if (debug) {
|
||||
tprintf("%s\n", misadaption_debug_.string());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
330
ccstruct/blamer.h
Normal file
330
ccstruct/blamer.h
Normal file
@ -0,0 +1,330 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: blamer.h
|
||||
// Description: Module allowing precise error causes to be allocated.
|
||||
// Author: Rike Antonova
|
||||
// Refactored: Ray Smith
|
||||
// Created: Mon Feb 04 14:37:01 PST 2013
|
||||
//
|
||||
// (C) Copyright 2013, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCSTRUCT_BLAMER_H_
|
||||
#define TESSERACT_CCSTRUCT_BLAMER_H_
|
||||
|
||||
#include <stdio.h>
|
||||
#include "boxword.h"
|
||||
#include "genericvector.h"
|
||||
#include "matrix.h"
|
||||
#include "params_training_featdef.h"
|
||||
#include "ratngs.h"
|
||||
#include "strngs.h"
|
||||
#include "tesscallback.h"
|
||||
|
||||
// Tolerance (in normalized coords) for matching blob boxes to truth boxes.
static const inT16 kBlamerBoxTolerance = 5;

// Enum for expressing the source of error.
// Note: Please update kIncorrectResultReasonNames when modifying this enum.
enum IncorrectResultReason {
  // The text recorded in best choice == truth text.
  IRR_CORRECT,
  // Either the top choice is incorrect but is a dictionary word (the
  // language model is unlikely to correct such errors, so the classifier
  // is to blame), or the correct unichar never made it into the
  // classifier's shortlist at all.
  IRR_CLASSIFIER,
  // The chopper failed to produce one or more of the splits that correspond
  // to the correct character bounding boxes recorded in
  // BlamerBundle::truth_word.
  IRR_CHOPPER,
  // The classifier produced the correct unichar for every blob of the
  // correct segmentation, but either its ratings were too poor for the
  // language model to pull out the correct choice, or the language model
  // was too weak to favor it: a classifier/language-model tradeoff error.
  IRR_CLASS_LM_TRADEOFF,
  // Page layout failed to produce the correct bounding box. Blamed when no
  // truth word had a bounding box similar to this word's, implying the
  // word's box itself was wrong.
  IRR_PAGE_LAYOUT,
  // A SegSearch heuristic prevented one or more blobs of the correct
  // segmentation state from being classified (e.g. the blob was too wide).
  IRR_SEGSEARCH_HEUR,
  // The correct segmentation state was never explored because of poor
  // SegSearch pain point prioritization: a correctly-segmented path rated
  // better than the best choice, so the language model would have picked
  // the correct choice had the state been explored.
  IRR_SEGSEARCH_PP,
  // Same as IRR_CLASS_LM_TRADEOFF, but for words that only ran through the
  // chopper and thus used the old language model (permuters).
  // TODO(antonova): integrate the new language mode with chopper
  IRR_CLASS_OLD_LM_TRADEOFF,
  // An incorrect adaptive template match scored better than a correct one
  // (pre-trained or adapted): an adaption error.
  IRR_ADAPTION,
  // split_and_recog_word() failed to find a suitable split in truth.
  IRR_NO_TRUTH_SPLIT,
  // Truth is not available for this word (e.g. words in the corrected
  // content file were turned into ~~~~ because no appropriate alignment
  // was found).
  IRR_NO_TRUTH,
  // The text recorded in best choice != truth text, but none of the above
  // reasons apply.
  IRR_UNKNOWN,

  IRR_NUM_REASONS
};
|
||||
|
||||
// Blamer-related information to determine the source of errors.
|
||||
struct BlamerBundle {
|
||||
static const char *IncorrectReasonName(IncorrectResultReason irr);
|
||||
BlamerBundle() : truth_has_char_boxes_(false),
|
||||
incorrect_result_reason_(IRR_CORRECT),
|
||||
lattice_data_(NULL) { ClearResults(); }
|
||||
BlamerBundle(const BlamerBundle &other) {
|
||||
this->CopyTruth(other);
|
||||
this->CopyResults(other);
|
||||
}
|
||||
~BlamerBundle() { delete[] lattice_data_; }
|
||||
|
||||
// Accessors.
|
||||
STRING TruthString() const {
|
||||
STRING truth_str;
|
||||
for (int i = 0; i < truth_text_.length(); ++i)
|
||||
truth_str += truth_text_[i];
|
||||
return truth_str;
|
||||
}
|
||||
IncorrectResultReason incorrect_result_reason() const {
|
||||
return incorrect_result_reason_;
|
||||
}
|
||||
bool NoTruth() const {
|
||||
return incorrect_result_reason_ == IRR_NO_TRUTH ||
|
||||
incorrect_result_reason_ == IRR_PAGE_LAYOUT;
|
||||
}
|
||||
bool HasDebugInfo() const {
|
||||
return debug_.length() > 0 || misadaption_debug_.length() > 0;
|
||||
}
|
||||
const STRING& debug() const {
|
||||
return debug_;
|
||||
}
|
||||
const STRING& misadaption_debug() const {
|
||||
return misadaption_debug_;
|
||||
}
|
||||
void UpdateBestRating(float rating) {
|
||||
if (rating < best_correctly_segmented_rating_)
|
||||
best_correctly_segmented_rating_ = rating;
|
||||
}
|
||||
int correct_segmentation_length() const {
|
||||
return correct_segmentation_cols_.length();
|
||||
}
|
||||
// Returns true if the given ratings matrix col,row position is included
|
||||
// in the correct segmentation path at the given index.
|
||||
bool MatrixPositionCorrect(int index, const MATRIX_COORD& coord) {
|
||||
return correct_segmentation_cols_[index] == coord.col &&
|
||||
correct_segmentation_rows_[index] == coord.row;
|
||||
}
|
||||
void set_best_choice_is_dict_and_top_choice(bool value) {
|
||||
best_choice_is_dict_and_top_choice_ = value;
|
||||
}
|
||||
const char* lattice_data() const {
|
||||
return lattice_data_;
|
||||
}
|
||||
int lattice_size() const {
|
||||
return lattice_size_; // size of lattice_data in bytes
|
||||
}
|
||||
void set_lattice_data(const char* data, int size) {
|
||||
lattice_size_ = size;
|
||||
delete [] lattice_data_;
|
||||
lattice_data_ = new char[lattice_size_];
|
||||
memcpy(lattice_data_, data, lattice_size_);
|
||||
}
|
||||
const tesseract::ParamsTrainingBundle& params_training_bundle() const {
|
||||
return params_training_bundle_;
|
||||
}
|
||||
// Adds a new ParamsTrainingHypothesis to the current hypothesis list.
|
||||
void AddHypothesis(const tesseract::ParamsTrainingHypothesis& hypo) {
|
||||
params_training_bundle_.AddHypothesis(hypo);
|
||||
}
|
||||
|
||||
// Functions to setup the blamer.
|
||||
// Whole word string, whole word bounding box.
|
||||
void SetWordTruth(const UNICHARSET& unicharset,
|
||||
const char* truth_str, const TBOX& word_box);
|
||||
// Single "character" string, "character" bounding box.
|
||||
// May be called multiple times to indicate the characters in a word.
|
||||
void SetSymbolTruth(const UNICHARSET& unicharset,
|
||||
const char* char_str, const TBOX& char_box);
|
||||
// Marks that there is something wrong with the truth text, like it contains
|
||||
// reject characters.
|
||||
void SetRejectedTruth();
|
||||
|
||||
// Returns true if the provided word_choice is correct.
|
||||
bool ChoiceIsCorrect(const WERD_CHOICE* word_choice) const;
|
||||
|
||||
void ClearResults() {
|
||||
norm_truth_word_.DeleteAllBoxes();
|
||||
norm_box_tolerance_ = 0;
|
||||
if (!NoTruth()) incorrect_result_reason_ = IRR_CORRECT;
|
||||
debug_ = "";
|
||||
segsearch_is_looking_for_blame_ = false;
|
||||
best_correctly_segmented_rating_ = WERD_CHOICE::kBadRating;
|
||||
correct_segmentation_cols_.clear();
|
||||
correct_segmentation_rows_.clear();
|
||||
best_choice_is_dict_and_top_choice_ = false;
|
||||
delete[] lattice_data_;
|
||||
lattice_data_ = NULL;
|
||||
lattice_size_ = 0;
|
||||
}
|
||||
void CopyTruth(const BlamerBundle &other) {
|
||||
truth_has_char_boxes_ = other.truth_has_char_boxes_;
|
||||
truth_word_ = other.truth_word_;
|
||||
truth_text_ = other.truth_text_;
|
||||
incorrect_result_reason_ =
|
||||
(other.NoTruth() ? other.incorrect_result_reason_ : IRR_CORRECT);
|
||||
}
|
||||
void CopyResults(const BlamerBundle &other) {
|
||||
norm_truth_word_ = other.norm_truth_word_;
|
||||
norm_box_tolerance_ = other.norm_box_tolerance_;
|
||||
incorrect_result_reason_ = other.incorrect_result_reason_;
|
||||
segsearch_is_looking_for_blame_ = other.segsearch_is_looking_for_blame_;
|
||||
best_correctly_segmented_rating_ = other.best_correctly_segmented_rating_;
|
||||
correct_segmentation_cols_ = other.correct_segmentation_cols_;
|
||||
correct_segmentation_rows_ = other.correct_segmentation_rows_;
|
||||
best_choice_is_dict_and_top_choice_ =
|
||||
other.best_choice_is_dict_and_top_choice_;
|
||||
if (other.lattice_data_ != NULL) {
|
||||
lattice_data_ = new char[other.lattice_size_];
|
||||
memcpy(lattice_data_, other.lattice_data_, other.lattice_size_);
|
||||
lattice_size_ = other.lattice_size_;
|
||||
} else {
|
||||
lattice_data_ = NULL;
|
||||
}
|
||||
}
|
||||
const char *IncorrectReason() const;
|
||||
|
||||
// Appends choice and truth details to the given debug string.
|
||||
void FillDebugString(const STRING &msg, const WERD_CHOICE *choice,
|
||||
STRING *debug);
|
||||
|
||||
// Sets up the norm_truth_word from truth_word using the given DENORM.
|
||||
void SetupNormTruthWord(const DENORM& denorm);
|
||||
|
||||
// Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty
|
||||
// bundles) where the right edge/ of the left-hand word is word1_right,
|
||||
// and the left edge of the right-hand word is word2_left.
|
||||
void SplitBundle(int word1_right, int word2_left, bool debug,
|
||||
BlamerBundle* bundle1, BlamerBundle* bundle2) const;
|
||||
// "Joins" the blames from bundle1 and bundle2 into *this.
|
||||
void JoinBlames(const BlamerBundle& bundle1, const BlamerBundle& bundle2,
|
||||
bool debug);
|
||||
|
||||
// If a blob with the same bounding box as one of the truth character
|
||||
// bounding boxes is not classified as the corresponding truth character
|
||||
// blames character classifier for incorrect answer.
|
||||
void BlameClassifier(const UNICHARSET& unicharset,
|
||||
const TBOX& blob_box,
|
||||
const BLOB_CHOICE_LIST& choices,
|
||||
bool debug);
|
||||
|
||||
|
||||
// Checks whether chops were made at all the character bounding box
|
||||
// boundaries in word->truth_word. If not - blames the chopper for an
|
||||
// incorrect answer.
|
||||
void SetChopperBlame(const WERD_RES* word, bool debug);
|
||||
// Blames the classifier or the language model if, after running only the
|
||||
// chopper, best_choice is incorrect and no blame has been yet set.
|
||||
// Blames the classifier if best_choice is classifier's top choice and is a
|
||||
// dictionary word (i.e. language model could not have helped).
|
||||
// Otherwise, blames the language model (formerly permuter word adjustment).
|
||||
void BlameClassifierOrLangModel(
|
||||
const WERD_RES* word,
|
||||
const UNICHARSET& unicharset, bool valid_permuter, bool debug);
|
||||
// Sets up the correct_segmentation_* to mark the correct bounding boxes.
|
||||
void SetupCorrectSegmentation(const TWERD* word, bool debug);
|
||||
|
||||
// Returns true if a guided segmentation search is needed.
|
||||
bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const;
|
||||
// Setup ready to guide the segmentation search to the correct segmentation.
|
||||
// The callback pp_cb is used to avoid a cyclic dependency.
|
||||
// It calls into LMPainPoints::GenerateForBlamer by pre-binding the
|
||||
// WERD_RES, and the LMPainPoints itself.
|
||||
// pp_cb must be a permanent callback, and should be deleted by the caller.
|
||||
void InitForSegSearch(const WERD_CHOICE *best_choice,
|
||||
MATRIX* ratings, UNICHAR_ID wildcard_id,
|
||||
bool debug, STRING *debug_str,
|
||||
TessResultCallback2<bool, int, int>* pp_cb);
|
||||
// Returns true if the guided segsearch is in progress.
|
||||
bool GuidedSegsearchStillGoing() const;
|
||||
// The segmentation search has ended. Sets the blame appropriately.
|
||||
void FinishSegSearch(const WERD_CHOICE *best_choice,
|
||||
bool debug, STRING *debug_str);
|
||||
|
||||
// If the bundle is null or still does not indicate the correct result,
|
||||
// fix it and use some backup reason for the blame.
|
||||
static void LastChanceBlame(bool debug, WERD_RES* word);
|
||||
|
||||
// Sets the misadaption debug if this word is incorrect, as this word is
|
||||
// being adapted to.
|
||||
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug);
|
||||
|
||||
private:
|
||||
void SetBlame(IncorrectResultReason irr, const STRING &msg,
|
||||
const WERD_CHOICE *choice, bool debug) {
|
||||
incorrect_result_reason_ = irr;
|
||||
debug_ = IncorrectReason();
|
||||
debug_ += " to blame: ";
|
||||
FillDebugString(msg, choice, &debug_);
|
||||
if (debug) tprintf("SetBlame(): %s", debug_.string());
|
||||
}
|
||||
|
||||
private:
|
||||
// Set to true when bounding boxes for individual unichars are recorded.
|
||||
bool truth_has_char_boxes_;
|
||||
// The true_word (in the original image coordinate space) contains ground
|
||||
// truth bounding boxes for this WERD_RES.
|
||||
tesseract::BoxWord truth_word_;
|
||||
// Same as above, but in normalized coordinates
|
||||
// (filled in by WERD_RES::SetupForRecognition()).
|
||||
tesseract::BoxWord norm_truth_word_;
|
||||
// Tolerance for bounding box comparisons in normalized space.
|
||||
int norm_box_tolerance_;
|
||||
// Contains ground truth unichar for each of the bounding boxes in truth_word.
|
||||
GenericVector<STRING> truth_text_;
|
||||
// The reason for incorrect OCR result.
|
||||
IncorrectResultReason incorrect_result_reason_;
|
||||
// Debug text associated with the blame.
|
||||
STRING debug_;
|
||||
// Misadaption debug information (filled in if this word was misadapted to).
|
||||
STRING misadaption_debug_;
|
||||
// Variables used by the segmentation search when looking for the blame.
|
||||
// Set to true while segmentation search is continued after the usual
|
||||
// termination condition in order to look for the blame.
|
||||
bool segsearch_is_looking_for_blame_;
|
||||
// Best rating for correctly segmented path
|
||||
// (set and used by SegSearch when looking for blame).
|
||||
float best_correctly_segmented_rating_;
|
||||
// Vectors populated by SegSearch to indicate column and row indices that
|
||||
// correspond to blobs with correct bounding boxes.
|
||||
GenericVector<int> correct_segmentation_cols_;
|
||||
GenericVector<int> correct_segmentation_rows_;
|
||||
// Set to true if best choice is a dictionary word and
|
||||
// classifier's top choice.
|
||||
bool best_choice_is_dict_and_top_choice_;
|
||||
// Serialized segmentation search lattice.
|
||||
char *lattice_data_;
|
||||
int lattice_size_; // size of lattice_data in bytes
|
||||
// Information about hypotheses (paths) explored by the segmentation search.
|
||||
tesseract::ParamsTrainingBundle params_training_bundle_;
|
||||
};
|
||||
|
||||
|
||||
#endif // TESSERACT_CCSTRUCT_BLAMER_H_
|
@ -29,12 +29,6 @@ namespace tesseract {
|
||||
// tolerance. Otherwise, the blob may be chopped and we have to just use
|
||||
// the word bounding box.
|
||||
const int kBoxClipTolerance = 2;
|
||||
// Min offset in baseline-normalized coords to make a character a subscript.
|
||||
const int kMinSubscriptOffset = 20;
|
||||
// Min offset in baseline-normalized coords to make a character a superscript.
|
||||
const int kMinSuperscriptOffset = 20;
|
||||
// Max y of bottom of a drop-cap blob.
|
||||
const int kMaxDropCapBottom = -128;
|
||||
|
||||
BoxWord::BoxWord() : length_(0) {
|
||||
}
|
||||
@ -60,21 +54,17 @@ void BoxWord::CopyFrom(const BoxWord& src) {
|
||||
boxes_.push_back(src.boxes_[i]);
|
||||
}
|
||||
|
||||
// Factory to build a BoxWord from a TWERD and the DENORM to switch
|
||||
// back to original image coordinates.
|
||||
// If the denorm is not NULL, then the output is denormalized and rotated
|
||||
// back to the original image coordinates.
|
||||
BoxWord* BoxWord::CopyFromNormalized(const DENORM* denorm,
|
||||
TWERD* tessword) {
|
||||
// Factory to build a BoxWord from a TWERD using the DENORMs on each blob to
|
||||
// switch back to original image coordinates.
|
||||
BoxWord* BoxWord::CopyFromNormalized(TWERD* tessword) {
|
||||
BoxWord* boxword = new BoxWord();
|
||||
// Count the blobs.
|
||||
boxword->length_ = 0;
|
||||
for (TBLOB* tblob = tessword->blobs; tblob != NULL; tblob = tblob->next)
|
||||
++boxword->length_;
|
||||
boxword->length_ = tessword->NumBlobs();
|
||||
// Allocate memory.
|
||||
boxword->boxes_.reserve(boxword->length_);
|
||||
|
||||
for (TBLOB* tblob = tessword->blobs; tblob != NULL; tblob = tblob->next) {
|
||||
for (int b = 0; b < boxword->length_; ++b) {
|
||||
TBLOB* tblob = tessword->blobs[b];
|
||||
TBOX blob_box;
|
||||
for (TESSLINE* outline = tblob->outlines; outline != NULL;
|
||||
outline = outline->next) {
|
||||
@ -83,12 +73,10 @@ BoxWord* BoxWord::CopyFromNormalized(const DENORM* denorm,
|
||||
do {
|
||||
if (!edgept->IsHidden() || !edgept->prev->IsHidden()) {
|
||||
ICOORD pos(edgept->pos.x, edgept->pos.y);
|
||||
if (denorm != NULL) {
|
||||
TPOINT denormed;
|
||||
denorm->DenormTransform(edgept->pos, &denormed);
|
||||
pos.set_x(denormed.x);
|
||||
pos.set_y(denormed.y);
|
||||
}
|
||||
TPOINT denormed;
|
||||
tblob->denorm().DenormTransform(NULL, edgept->pos, &denormed);
|
||||
pos.set_x(denormed.x);
|
||||
pos.set_y(denormed.y);
|
||||
TBOX pt_box(pos, pos);
|
||||
blob_box += pt_box;
|
||||
}
|
||||
@ -101,37 +89,6 @@ BoxWord* BoxWord::CopyFromNormalized(const DENORM* denorm,
|
||||
return boxword;
|
||||
}
|
||||
|
||||
// Sets up the script_pos_ member using the tessword to get the bln
|
||||
// bounding boxes, the best_choice to get the unichars, and the unicharset
|
||||
// to get the target positions. If small_caps is true, sub/super are not
|
||||
// considered, but dropcaps are.
|
||||
void BoxWord::SetScriptPositions(const UNICHARSET& unicharset, bool small_caps,
|
||||
TWERD* tessword, WERD_CHOICE* best_choice) {
|
||||
// Allocate memory.
|
||||
script_pos_.init_to_size(length_, SP_NORMAL);
|
||||
|
||||
int blob_index = 0;
|
||||
for (TBLOB* tblob = tessword->blobs; tblob != NULL; tblob = tblob->next,
|
||||
++blob_index) {
|
||||
int class_id = best_choice->unichar_id(blob_index);
|
||||
TBOX blob_box = tblob->bounding_box();
|
||||
int top = blob_box.top();
|
||||
int bottom = blob_box.bottom();
|
||||
int min_bottom, max_bottom, min_top, max_top;
|
||||
unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
|
||||
&min_top, &max_top);
|
||||
if (bottom <= kMaxDropCapBottom) {
|
||||
script_pos_[blob_index] = SP_DROPCAP;
|
||||
} else if (!small_caps) {
|
||||
if (top + kMinSubscriptOffset < min_top) {
|
||||
script_pos_[blob_index] = SP_SUBSCRIPT;
|
||||
} else if (bottom - kMinSuperscriptOffset > max_bottom) {
|
||||
script_pos_[blob_index] = SP_SUPERSCRIPT;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up the bounding boxes from the polygonal approximation by
|
||||
// expanding slightly, then clipping to the blobs from the original_word
|
||||
// that overlap. If not null, the block provides the inverse rotation.
|
||||
@ -228,9 +185,8 @@ void BoxWord::ComputeBoundingBox() {
|
||||
// The callback is deleted on completion.
|
||||
void BoxWord::ProcessMatchedBlobs(const TWERD& other,
|
||||
TessCallback1<int>* cb) const {
|
||||
TBLOB* blob = other.blobs;
|
||||
for (int i = 0; i < length_ && blob != NULL; ++i, blob = blob->next) {
|
||||
TBOX blob_box = blob->bounding_box();
|
||||
for (int i = 0; i < length_ && i < other.NumBlobs(); ++i) {
|
||||
TBOX blob_box = other.blobs[i]->bounding_box();
|
||||
if (blob_box == boxes_[i])
|
||||
cb->Run(i);
|
||||
}
|
||||
@ -238,5 +194,3 @@ void BoxWord::ProcessMatchedBlobs(const TWERD& other,
|
||||
}
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
|
||||
|
@ -22,6 +22,7 @@
|
||||
|
||||
#include "genericvector.h"
|
||||
#include "rect.h"
|
||||
#include "unichar.h"
|
||||
|
||||
class BLOCK;
|
||||
class DENORM;
|
||||
@ -34,14 +35,6 @@ class WERD_RES;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// ScriptPos tells whether a character is subscript, superscript or normal.
|
||||
enum ScriptPos {
|
||||
SP_NORMAL,
|
||||
SP_SUBSCRIPT,
|
||||
SP_SUPERSCRIPT,
|
||||
SP_DROPCAP
|
||||
};
|
||||
|
||||
// Class to hold an array of bounding boxes for an output word and
|
||||
// the bounding box of the whole word.
|
||||
class BoxWord {
|
||||
@ -54,19 +47,9 @@ class BoxWord {
|
||||
|
||||
void CopyFrom(const BoxWord& src);
|
||||
|
||||
// Factory to build a BoxWord from a TWERD and the DENORM to switch
|
||||
// back to original image coordinates.
|
||||
// If the denorm is not NULL, then the output is denormalized and rotated
|
||||
// back to the original image coordinates.
|
||||
static BoxWord* CopyFromNormalized(const DENORM* denorm,
|
||||
TWERD* tessword);
|
||||
|
||||
// Sets up the script_pos_ member using the tessword to get the bln
|
||||
// bounding boxes, the best_choice to get the unichars, and the unicharset
|
||||
// to get the target positions. If small_caps is true, sub/super are not
|
||||
// considered, but dropcaps are.
|
||||
void SetScriptPositions(const UNICHARSET& unicharset, bool small_caps,
|
||||
TWERD* tessword, WERD_CHOICE* best_choice);
|
||||
// Factory to build a BoxWord from a TWERD using the DENORMs on each blob to
|
||||
// switch back to original image coordinates.
|
||||
static BoxWord* CopyFromNormalized(TWERD* tessword);
|
||||
|
||||
// Clean up the bounding boxes from the polygonal approximation by
|
||||
// expanding slightly, then clipping to the blobs from the original_word
|
||||
@ -102,11 +85,6 @@ class BoxWord {
|
||||
const TBOX& BlobBox(int index) const {
|
||||
return boxes_[index];
|
||||
}
|
||||
ScriptPos BlobPosition(int index) const {
|
||||
if (index < 0 || index >= script_pos_.size())
|
||||
return SP_NORMAL;
|
||||
return script_pos_[index];
|
||||
}
|
||||
|
||||
private:
|
||||
void ComputeBoundingBox();
|
||||
@ -114,7 +92,6 @@ class BoxWord {
|
||||
TBOX bbox_;
|
||||
int length_;
|
||||
GenericVector<TBOX> boxes_;
|
||||
GenericVector<ScriptPos> script_pos_;
|
||||
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
@ -32,21 +32,120 @@
|
||||
#include "tprintf.h"
|
||||
#include "unicharset.h"
|
||||
|
||||
// Returns true if there are any real classification results.
|
||||
bool MATRIX::Classified(int col, int row, int wildcard_id) const {
|
||||
if (get(col, row) == NOT_CLASSIFIED) return false;
|
||||
BLOB_CHOICE_IT b_it(get(col, row));
|
||||
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
|
||||
BLOB_CHOICE* choice = b_it.data();
|
||||
if (choice->IsClassified())
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Expands the existing matrix in-place to make the band wider, without
|
||||
// losing any existing data.
|
||||
void MATRIX::IncreaseBandSize(int bandwidth) {
|
||||
ResizeWithCopy(dimension(), bandwidth);
|
||||
}
|
||||
|
||||
// Returns a bigger MATRIX with a new column and row in the matrix in order
|
||||
// to split the blob at the given (ind,ind) diagonal location.
|
||||
// Entries are relocated to the new MATRIX using the transformation defined
|
||||
// by MATRIX_COORD::MapForSplit.
|
||||
// Transfers the pointer data to the new MATRIX and deletes *this.
|
||||
MATRIX* MATRIX::ConsumeAndMakeBigger(int ind) {
|
||||
int dim = dimension();
|
||||
int band_width = bandwidth();
|
||||
// Check to see if bandwidth needs expanding.
|
||||
for (int col = ind; col >= 0 && col > ind - band_width; --col) {
|
||||
if (array_[col * band_width + band_width - 1] != empty_) {
|
||||
++band_width;
|
||||
break;
|
||||
}
|
||||
}
|
||||
MATRIX* result = new MATRIX(dim + 1, band_width);
|
||||
|
||||
for (int col = 0; col < dim; ++col) {
|
||||
for (int row = col; row < dim && row < col + bandwidth(); ++row) {
|
||||
MATRIX_COORD coord(col, row);
|
||||
coord.MapForSplit(ind);
|
||||
BLOB_CHOICE_LIST* choices = get(col, row);
|
||||
if (choices != NULL) {
|
||||
// Correct matrix location on each choice.
|
||||
BLOB_CHOICE_IT bc_it(choices);
|
||||
for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
|
||||
BLOB_CHOICE* choice = bc_it.data();
|
||||
choice->set_matrix_cell(coord.col, coord.row);
|
||||
}
|
||||
ASSERT_HOST(coord.Valid(*result));
|
||||
result->put(coord.col, coord.row, choices);
|
||||
}
|
||||
}
|
||||
}
|
||||
delete this;
|
||||
return result;
|
||||
}
|
||||
|
||||
// Makes and returns a deep copy of *this, including all the BLOB_CHOICEs
|
||||
// on the lists, but not any LanguageModelState that may be attached to the
|
||||
// BLOB_CHOICEs.
|
||||
MATRIX* MATRIX::DeepCopy() const {
|
||||
int dim = dimension();
|
||||
int band_width = bandwidth();
|
||||
MATRIX* result = new MATRIX(dim, band_width);
|
||||
for (int col = 0; col < dim; ++col) {
|
||||
for (int row = col; row < col + band_width; ++row) {
|
||||
BLOB_CHOICE_LIST* choices = get(col, row);
|
||||
if (choices != NULL) {
|
||||
BLOB_CHOICE_LIST* copy_choices = new BLOB_CHOICE_LIST;
|
||||
choices->deep_copy(copy_choices, &BLOB_CHOICE::deep_copy);
|
||||
result->put(col, row, copy_choices);
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Print the best guesses out of the match rating matrix.
|
||||
void MATRIX::print(const UNICHARSET &unicharset) const {
|
||||
tprintf("Ratings Matrix (top choices)\n");
|
||||
tprintf("Ratings Matrix (top 3 choices)\n");
|
||||
int dim = dimension();
|
||||
int band_width = bandwidth();
|
||||
int row, col;
|
||||
for (col = 0; col < this->dimension(); ++col) tprintf("\t%d", col);
|
||||
for (col = 0; col < dim; ++col) {
|
||||
for (row = col; row < dim && row < col + band_width; ++row) {
|
||||
BLOB_CHOICE_LIST *rating = this->get(col, row);
|
||||
if (rating == NOT_CLASSIFIED) continue;
|
||||
BLOB_CHOICE_IT b_it(rating);
|
||||
tprintf("col=%d row=%d ", col, row);
|
||||
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
|
||||
tprintf("%s rat=%g cert=%g " ,
|
||||
unicharset.id_to_unichar(b_it.data()->unichar_id()),
|
||||
b_it.data()->rating(), b_it.data()->certainty());
|
||||
}
|
||||
tprintf("\n");
|
||||
}
|
||||
tprintf("\n");
|
||||
}
|
||||
tprintf("\n");
|
||||
for (row = 0; row < this->dimension(); ++row) {
|
||||
for (col = 0; col < dim; ++col) tprintf("\t%d", col);
|
||||
tprintf("\n");
|
||||
for (row = 0; row < dim; ++row) {
|
||||
for (col = 0; col <= row; ++col) {
|
||||
if (col == 0) tprintf("%d\t", row);
|
||||
if (row >= col + band_width) {
|
||||
tprintf(" \t");
|
||||
continue;
|
||||
}
|
||||
BLOB_CHOICE_LIST *rating = this->get(col, row);
|
||||
if (rating != NOT_CLASSIFIED) {
|
||||
BLOB_CHOICE_IT b_it(rating);
|
||||
int counter = 0;
|
||||
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
|
||||
tprintf("%s ", unicharset.id_to_unichar(b_it.data()->unichar_id()));
|
||||
tprintf("%s ",
|
||||
unicharset.id_to_unichar(b_it.data()->unichar_id()));
|
||||
++counter;
|
||||
if (counter == 3) break;
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
/* -*-C-*-
|
||||
********************************************************************************
|
||||
******************************************************************************
|
||||
*
|
||||
* File: matrix.h (Formerly matrix.h)
|
||||
* Description: Ratings matrix code. (Used by associator)
|
||||
@ -25,18 +25,28 @@
|
||||
#ifndef TESSERACT_CCSTRUCT_MATRIX_H__
|
||||
#define TESSERACT_CCSTRUCT_MATRIX_H__
|
||||
|
||||
#include "ratngs.h"
|
||||
#include "kdpair.h"
|
||||
#include "unicharset.h"
|
||||
|
||||
class BLOB_CHOICE_LIST;
|
||||
|
||||
#define NOT_CLASSIFIED reinterpret_cast<BLOB_CHOICE_LIST*>(NULL)
|
||||
|
||||
// A generic class to store a matrix with entries of type T.
|
||||
// A generic class to hold a 2-D matrix with entries of type T, but can also
|
||||
// act as a base class for other implementations, such as a triangular or
|
||||
// banded matrix.
|
||||
template <class T>
|
||||
class GENERIC_2D_ARRAY {
|
||||
public:
|
||||
// Allocate a piece of memory to hold a 2d-array of the given dimension.
|
||||
// Initialize all the elements of the array to empty instead of assuming
|
||||
// that a default constructor can be used.
|
||||
// Initializes the array size, and empty element, but cannot allocate memory
|
||||
// for the subclasses or initialize because calls to the num_elements
|
||||
// member will be routed to the base class implementation. Subclasses can
|
||||
// either pass the memory in, or allocate after by calling Resize().
|
||||
GENERIC_2D_ARRAY(int dim1, int dim2, const T& empty, T* array)
|
||||
: empty_(empty), dim1_(dim1), dim2_(dim2), array_(array) {
|
||||
}
|
||||
// Original constructor for a full rectangular matrix DOES allocate memory
|
||||
// and initialize it to empty.
|
||||
GENERIC_2D_ARRAY(int dim1, int dim2, const T& empty)
|
||||
: empty_(empty), dim1_(dim1), dim2_(dim2) {
|
||||
array_ = new T[dim1_ * dim2_];
|
||||
@ -44,26 +54,67 @@ class GENERIC_2D_ARRAY {
|
||||
for (int y = 0; y < dim2_; y++)
|
||||
this->put(x, y, empty_);
|
||||
}
|
||||
~GENERIC_2D_ARRAY() { delete[] array_; }
|
||||
virtual ~GENERIC_2D_ARRAY() { delete[] array_; }
|
||||
|
||||
// Reallocate the array to the given size. Does not keep old data.
|
||||
void Resize(int size1, int size2, const T& empty) {
|
||||
empty_ = empty;
|
||||
if (size1 != dim1_ || size2 != dim2_) {
|
||||
dim1_ = size1;
|
||||
dim2_ = size2;
|
||||
delete [] array_;
|
||||
array_ = new T[dim1_ * dim2_];
|
||||
}
|
||||
Clear();
|
||||
}
|
||||
|
||||
// Reallocate the array to the given size, keeping old data.
|
||||
void ResizeWithCopy(int size1, int size2) {
|
||||
if (size1 != dim1_ || size2 != dim2_) {
|
||||
T* new_array = new T[size1 * size2];
|
||||
for (int col = 0; col < size1; ++col) {
|
||||
for (int row = 0; row < size2; ++row) {
|
||||
int old_index = col * dim2() + row;
|
||||
int new_index = col * size2 + row;
|
||||
if (col < dim1_ && row < dim2_) {
|
||||
new_array[new_index] = array_[old_index];
|
||||
} else {
|
||||
new_array[new_index] = empty_;
|
||||
}
|
||||
}
|
||||
}
|
||||
delete[] array_;
|
||||
array_ = new_array;
|
||||
dim1_ = size1;
|
||||
dim2_ = size2;
|
||||
}
|
||||
}
|
||||
|
||||
// Sets all the elements of the array to the empty value.
|
||||
void Clear() {
|
||||
int total_size = num_elements();
|
||||
for (int i = 0; i < total_size; ++i)
|
||||
array_[i] = empty_;
|
||||
}
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
// Only works with bitwise-serializeable types!
|
||||
bool Serialize(FILE* fp) const {
|
||||
if (!SerializeSize(fp)) return false;
|
||||
if (fwrite(&empty_, sizeof(empty_), 1, fp) != 1) return false;
|
||||
int size = dim1_ * dim2_;
|
||||
int size = num_elements();
|
||||
if (fwrite(array_, sizeof(*array_), size, fp) != size) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// Only works with bitwise-serializeable types!
|
||||
// Only works with bitwise-serializeable typ
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool DeSerialize(bool swap, FILE* fp) {
|
||||
if (!DeSerializeSize(swap, fp)) return false;
|
||||
if (fread(&empty_, sizeof(empty_), 1, fp) != 1) return false;
|
||||
if (swap) ReverseN(&empty_, sizeof(empty_));
|
||||
int size = dim1_ * dim2_;
|
||||
int size = num_elements();
|
||||
if (fread(array_, sizeof(*array_), size, fp) != size) return false;
|
||||
if (swap) {
|
||||
for (int i = 0; i < size; ++i)
|
||||
@ -77,7 +128,7 @@ class GENERIC_2D_ARRAY {
|
||||
bool SerializeClasses(FILE* fp) const {
|
||||
if (!SerializeSize(fp)) return false;
|
||||
if (!empty_.Serialize(fp)) return false;
|
||||
int size = dim1_ * dim2_;
|
||||
int size = num_elements();
|
||||
for (int i = 0; i < size; ++i) {
|
||||
if (!array_[i].Serialize(fp)) return false;
|
||||
}
|
||||
@ -90,7 +141,7 @@ class GENERIC_2D_ARRAY {
|
||||
bool DeSerializeClasses(bool swap, FILE* fp) {
|
||||
if (!DeSerializeSize(swap, fp)) return false;
|
||||
if (!empty_.DeSerialize(swap, fp)) return false;
|
||||
int size = dim1_ * dim2_;
|
||||
int size = num_elements();
|
||||
for (int i = 0; i < size; ++i) {
|
||||
if (!array_[i].DeSerialize(swap, fp)) return false;
|
||||
}
|
||||
@ -100,11 +151,14 @@ class GENERIC_2D_ARRAY {
|
||||
// Provide the dimensions of this rectangular matrix.
|
||||
int dim1() const { return dim1_; }
|
||||
int dim2() const { return dim2_; }
|
||||
// Returns the number of elements in the array.
|
||||
// Banded/triangular matrices may override.
|
||||
virtual int num_elements() const { return dim1_ * dim2_; }
|
||||
|
||||
// Expression to select a specific location in the matrix. The matrix is
|
||||
// stored COLUMN-major, so the left-most index is the most significant.
|
||||
// This allows [][] access to use indices in the same order as (,).
|
||||
int index(int column, int row) const {
|
||||
virtual int index(int column, int row) const {
|
||||
return (column * dim2_ + row);
|
||||
}
|
||||
|
||||
@ -129,19 +183,21 @@ class GENERIC_2D_ARRAY {
|
||||
T* operator[](int column) {
|
||||
return &array_[this->index(column, 0)];
|
||||
}
|
||||
const T* operator[](int column) const {
|
||||
return &array_[this->index(column, 0)];
|
||||
}
|
||||
|
||||
// Delete objects pointed to by array_[i].
|
||||
void delete_matrix_pointers() {
|
||||
for (int x = 0; x < dim1_; x++) {
|
||||
for (int y = 0; y < dim2_; y++) {
|
||||
T matrix_cell = this->get(x, y);
|
||||
if (matrix_cell != empty_)
|
||||
delete matrix_cell;
|
||||
}
|
||||
int size = num_elements();
|
||||
for (int i = 0; i < size; ++i) {
|
||||
T matrix_cell = array_[i];
|
||||
if (matrix_cell != empty_)
|
||||
delete matrix_cell;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
protected:
|
||||
// Factored helper to serialize the size.
|
||||
bool SerializeSize(FILE* fp) const {
|
||||
inT32 size = dim1_;
|
||||
@ -160,12 +216,7 @@ class GENERIC_2D_ARRAY {
|
||||
ReverseN(&size1, sizeof(size1));
|
||||
ReverseN(&size2, sizeof(size2));
|
||||
}
|
||||
if (size1 != dim1_ || size2 != dim2_) {
|
||||
dim1_ = size1;
|
||||
dim2_ = size2;
|
||||
delete [] array_;
|
||||
array_ = new T[dim1_ * dim2_];
|
||||
}
|
||||
Resize(size1, size2, empty_);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -175,25 +226,90 @@ class GENERIC_2D_ARRAY {
|
||||
int dim2_; // Size of the 2nd dimension in indexing functions.
|
||||
};
|
||||
|
||||
// A generic class to store a square matrix with entries of type T.
|
||||
// A generic class to store a banded triangular matrix with entries of type T.
|
||||
// In this array, the nominally square matrix is dim1_ x dim1_, and dim2_ is
|
||||
// the number of bands, INCLUDING the diagonal. The storage is thus of size
|
||||
// dim1_ * dim2_ and index(col, row) = col * dim2_ + row - col, and an
|
||||
// assert will fail if row < col or row - col >= dim2.
|
||||
template <class T>
|
||||
class GENERIC_MATRIX : public GENERIC_2D_ARRAY<T> {
|
||||
class BandTriMatrix : public GENERIC_2D_ARRAY<T> {
|
||||
public:
|
||||
// Allocate a piece of memory to hold a matrix of the given dimension.
|
||||
// Initialize all the elements of the matrix to empty instead of assuming
|
||||
// Allocate a piece of memory to hold a 2d-array of the given dimension.
|
||||
// Initialize all the elements of the array to empty instead of assuming
|
||||
// that a default constructor can be used.
|
||||
GENERIC_MATRIX(int dimension, const T& empty)
|
||||
: GENERIC_2D_ARRAY<T>(dimension, dimension, empty) {
|
||||
BandTriMatrix(int dim1, int dim2, const T& empty)
|
||||
: GENERIC_2D_ARRAY<T>(dim1, dim2, empty) {
|
||||
}
|
||||
// The default destructor will do.
|
||||
|
||||
// Provide the dimensions of this matrix.
|
||||
// dimension is the size of the nominally square matrix.
|
||||
int dimension() const { return this->dim1_; }
|
||||
// bandwidth is the number of bands in the matrix, INCLUDING the diagonal.
|
||||
int bandwidth() const { return this->dim2_; }
|
||||
|
||||
// Expression to select a specific location in the matrix. The matrix is
|
||||
// stored COLUMN-major, so the left-most index is the most significant.
|
||||
// This allows [][] access to use indices in the same order as (,).
|
||||
virtual int index(int column, int row) const {
|
||||
ASSERT_HOST(row >= column);
|
||||
ASSERT_HOST(row - column < this->dim2_);
|
||||
return column * this->dim2_ + row - column;
|
||||
}
|
||||
|
||||
// Provide the dimension of this square matrix.
|
||||
int dimension() const { return this->dim1(); }
|
||||
// Appends array2 corner-to-corner to *this, making an array of dimension
|
||||
// equal to the sum of the individual dimensions.
|
||||
// array2 is not destroyed, but is left empty, as all elements are moved
|
||||
// to *this.
|
||||
void AttachOnCorner(BandTriMatrix<T>* array2) {
|
||||
int new_dim1 = this->dim1_ + array2->dim1_;
|
||||
int new_dim2 = MAX(this->dim2_, array2->dim2_);
|
||||
T* new_array = new T[new_dim1 * new_dim2];
|
||||
for (int col = 0; col < new_dim1; ++col) {
|
||||
for (int j = 0; j < new_dim2; ++j) {
|
||||
int new_index = col * new_dim2 + j;
|
||||
if (col < this->dim1_ && j < this->dim2_) {
|
||||
new_array[new_index] = this->get(col, col + j);
|
||||
} else if (col >= this->dim1_ && j < array2->dim2_) {
|
||||
new_array[new_index] = array2->get(col - this->dim1_,
|
||||
col - this->dim1_ + j);
|
||||
array2->put(col - this->dim1_, col - this->dim1_ + j, NULL);
|
||||
} else {
|
||||
new_array[new_index] = this->empty_;
|
||||
}
|
||||
}
|
||||
}
|
||||
delete[] this->array_;
|
||||
this->array_ = new_array;
|
||||
this->dim1_ = new_dim1;
|
||||
this->dim2_ = new_dim2;
|
||||
}
|
||||
};
|
||||
|
||||
class MATRIX : public GENERIC_MATRIX<BLOB_CHOICE_LIST *> {
|
||||
class MATRIX : public BandTriMatrix<BLOB_CHOICE_LIST *> {
|
||||
public:
|
||||
MATRIX(int dimension) : GENERIC_MATRIX<BLOB_CHOICE_LIST *>(dimension,
|
||||
NOT_CLASSIFIED) {}
|
||||
MATRIX(int dimension, int bandwidth)
|
||||
: BandTriMatrix<BLOB_CHOICE_LIST *>(dimension, bandwidth, NOT_CLASSIFIED) {}
|
||||
|
||||
// Returns true if there are any real classification results.
|
||||
bool Classified(int col, int row, int wildcard_id) const;
|
||||
|
||||
// Expands the existing matrix in-place to make the band wider, without
|
||||
// losing any existing data.
|
||||
void IncreaseBandSize(int bandwidth);
|
||||
|
||||
// Returns a bigger MATRIX with a new column and row in the matrix in order
|
||||
// to split the blob at the given (ind,ind) diagonal location.
|
||||
// Entries are relocated to the new MATRIX using the transformation defined
|
||||
// by MATRIX_COORD::MapForSplit.
|
||||
// Transfers the pointer data to the new MATRIX and deletes *this.
|
||||
MATRIX* ConsumeAndMakeBigger(int ind);
|
||||
|
||||
// Makes and returns a deep copy of *this, including all the BLOB_CHOICEs
|
||||
// on the lists, but not any LanguageModelState that may be attached to the
|
||||
// BLOB_CHOICEs.
|
||||
MATRIX* DeepCopy() const;
|
||||
|
||||
// Print a shortened version of the contents of the matrix.
|
||||
void print(const UNICHARSET &unicharset) const;
|
||||
};
|
||||
@ -203,14 +319,34 @@ struct MATRIX_COORD {
|
||||
MATRIX_COORD *c = static_cast<MATRIX_COORD *>(arg);
|
||||
delete c;
|
||||
}
|
||||
// Default constructor required by GenericHeap.
|
||||
MATRIX_COORD() : col(0), row(0) {}
|
||||
MATRIX_COORD(int c, int r): col(c), row(r) {}
|
||||
~MATRIX_COORD() {}
|
||||
|
||||
bool Valid(const MATRIX &m) const {
|
||||
return (col >= 0 && row >= 0 &&
|
||||
col < m.dimension() && row < m.dimension());
|
||||
return 0 <= col && col < m.dimension() &&
|
||||
col <= row && row < col + m.bandwidth() && row < m.dimension();
|
||||
}
|
||||
|
||||
// Remaps the col,row pair to split the blob at the given (ind,ind) diagonal
|
||||
// location.
|
||||
// Entries at (i,j) for i in [0,ind] and j in [ind,dim) move to (i,j+1),
|
||||
// making a new row at ind.
|
||||
// Entries at (i,j) for i in [ind+1,dim) and j in [i,dim) move to (i+i,j+1),
|
||||
// making a new column at ind+1.
|
||||
void MapForSplit(int ind) {
|
||||
ASSERT_HOST(row >= col);
|
||||
if (col > ind) ++col;
|
||||
if (row >= ind) ++row;
|
||||
ASSERT_HOST(row >= col);
|
||||
}
|
||||
|
||||
int col;
|
||||
int row;
|
||||
};
|
||||
|
||||
// The MatrixCoordPair contains a MATRIX_COORD and its priority.
|
||||
typedef tesseract::KDPairInc<float, MATRIX_COORD> MatrixCoordPair;
|
||||
|
||||
#endif // TESSERACT_CCSTRUCT_MATRIX_H__
|
||||
|
@ -472,6 +472,8 @@ void RefreshWordBlobsFromNewBlobs(BLOCK_LIST* block_list,
|
||||
BLOCK_IT block_it(block_list);
|
||||
for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
|
||||
BLOCK* block = block_it.data();
|
||||
if (block->poly_block() != NULL && !block->poly_block()->IsText())
|
||||
continue; // Don't touch non-text blocks.
|
||||
// Iterate over all rows in the block.
|
||||
ROW_IT row_it(block->row_list());
|
||||
for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -19,6 +19,7 @@
|
||||
#ifndef PAGERES_H
|
||||
#define PAGERES_H
|
||||
|
||||
#include "blamer.h"
|
||||
#include "blobs.h"
|
||||
#include "boxword.h"
|
||||
#include "elst.h"
|
||||
@ -38,167 +39,6 @@ class Tesseract;
|
||||
}
|
||||
using tesseract::FontInfo;
|
||||
|
||||
static const inT16 kBlamerBoxTolerance = 5;
|
||||
|
||||
// Enum for expressing the source of error.
|
||||
// Note: Please update kIncorrectResultReasonNames when modifying this enum.
|
||||
enum IncorrectResultReason {
|
||||
// The text recorded in best choice == truth text
|
||||
IRR_CORRECT,
|
||||
// Either: Top choice is incorrect and is a dictionary word (language model
|
||||
// is unlikely to help correct such errors, so blame the classifier).
|
||||
// Or: the correct unichar was not included in shortlist produced by the
|
||||
// classifier at all.
|
||||
IRR_CLASSIFIER,
|
||||
// Chopper have not found one or more splits that correspond to the correct
|
||||
// character bounding boxes recorded in BlamerBundle::truth_word.
|
||||
IRR_CHOPPER,
|
||||
// Classifier did include correct unichars for each blob in the correct
|
||||
// segmentation, however its rating could have been too bad to allow the
|
||||
// language model to pull out the correct choice. On the other hand the
|
||||
// strength of the language model might have been too weak to favor the
|
||||
// correct answer, this we call this case a classifier-language model
|
||||
// tradeoff error.
|
||||
IRR_CLASS_LM_TRADEOFF,
|
||||
// Page layout failed to produce the correct bounding box. Blame page layout
|
||||
// if the truth was not found for the word, which implies that the bounding
|
||||
// box of the word was incorrect (no truth word had a similar bounding box).
|
||||
IRR_PAGE_LAYOUT,
|
||||
// SegSearch heuristic prevented one or more blobs from the correct
|
||||
// segmentation state to be classified (e.g. the blob was too wide).
|
||||
IRR_SEGSEARCH_HEUR,
|
||||
// The correct segmentaiton state was not explored because of poor SegSearch
|
||||
// pain point prioritization. We blame SegSearch pain point prioritization
|
||||
// if the best rating of a choice constructed from correct segmentation is
|
||||
// better than that of the best choice (i.e. if we got to explore the correct
|
||||
// segmentation state, language model would have picked the correct choice).
|
||||
IRR_SEGSEARCH_PP,
|
||||
// Same as IRR_CLASS_LM_TRADEOFF, but used when we only run chopper on a word,
|
||||
|
||||
// and thus use the old language model (permuters).
|
||||
// TODO(antonova): integrate the new language mode with chopper
|
||||
IRR_CLASS_OLD_LM_TRADEOFF,
|
||||
// If there is an incorrect adaptive template match with a better score than
|
||||
// a correct one (either pre-trained or adapted), mark this as adaption error.
|
||||
IRR_ADAPTION,
|
||||
// split_and_recog_word() failed to find a suitable split in truth.
|
||||
IRR_NO_TRUTH_SPLIT,
|
||||
// Truth is not available for this word (e.g. when words in corrected content
|
||||
// file are turned into ~~~~ because an appropriate alignment was not found.
|
||||
IRR_NO_TRUTH,
|
||||
// The text recorded in best choice != truth text, but none of the above
|
||||
// reasons are set.
|
||||
IRR_UNKNOWN,
|
||||
|
||||
IRR_NUM_REASONS
|
||||
};
|
||||
|
||||
// Blamer-related information to determine the source of errors.
|
||||
struct BlamerBundle {
|
||||
static const char *IncorrectReasonName(IncorrectResultReason irr);
|
||||
BlamerBundle() : truth_has_char_boxes(false),
|
||||
incorrect_result_reason(IRR_CORRECT),
|
||||
lattice_data(NULL) { ClearResults(); }
|
||||
~BlamerBundle() { delete[] lattice_data; }
|
||||
void ClearResults() {
|
||||
norm_truth_word.DeleteAllBoxes();
|
||||
norm_box_tolerance = 0;
|
||||
if (!NoTruth()) incorrect_result_reason = IRR_CORRECT;
|
||||
debug = "";
|
||||
segsearch_is_looking_for_blame = false;
|
||||
best_correctly_segmented_rating = WERD_CHOICE::kBadRating;
|
||||
correct_segmentation_cols.clear();
|
||||
correct_segmentation_rows.clear();
|
||||
best_choice_is_dict_and_top_choice = false;
|
||||
delete[] lattice_data;
|
||||
lattice_data = NULL;
|
||||
lattice_size = 0;
|
||||
}
|
||||
void CopyTruth(const BlamerBundle &other) {
|
||||
truth_has_char_boxes = other.truth_has_char_boxes;
|
||||
truth_word = other.truth_word;
|
||||
truth_text = other.truth_text;
|
||||
incorrect_result_reason =
|
||||
(other.NoTruth() ? other.incorrect_result_reason : IRR_CORRECT);
|
||||
}
|
||||
void CopyResults(const BlamerBundle &other) {
|
||||
norm_truth_word = other.norm_truth_word;
|
||||
norm_box_tolerance = other.norm_box_tolerance;
|
||||
incorrect_result_reason = other.incorrect_result_reason;
|
||||
segsearch_is_looking_for_blame = other.segsearch_is_looking_for_blame;
|
||||
best_correctly_segmented_rating =other.best_correctly_segmented_rating;
|
||||
correct_segmentation_cols = other.correct_segmentation_cols;
|
||||
correct_segmentation_rows = other.correct_segmentation_rows;
|
||||
best_choice_is_dict_and_top_choice =
|
||||
other.best_choice_is_dict_and_top_choice;
|
||||
if (other.lattice_data != NULL) {
|
||||
lattice_data = new char[other.lattice_size];
|
||||
memcpy(lattice_data, other.lattice_data, other.lattice_size);
|
||||
lattice_size = other.lattice_size;
|
||||
} else {
|
||||
lattice_data = NULL;
|
||||
}
|
||||
}
|
||||
BlamerBundle(const BlamerBundle &other) {
|
||||
this->CopyTruth(other);
|
||||
this->CopyResults(other);
|
||||
}
|
||||
const char *IncorrectReason() const;
|
||||
bool NoTruth() const {
|
||||
return (incorrect_result_reason == IRR_NO_TRUTH ||
|
||||
incorrect_result_reason == IRR_PAGE_LAYOUT);
|
||||
}
|
||||
void SetBlame(IncorrectResultReason irr,
|
||||
const STRING &msg, const WERD_CHOICE *choice, bool debug) {
|
||||
this->incorrect_result_reason = irr;
|
||||
this->debug = this->IncorrectReason();
|
||||
this->debug += " to blame: ";
|
||||
this->FillDebugString(msg, choice, &(this->debug));
|
||||
if (debug) tprintf("SetBlame(): %s", this->debug.string());
|
||||
}
|
||||
// Appends choice and truth details to the given debug string.
|
||||
void FillDebugString(const STRING &msg, const WERD_CHOICE *choice,
|
||||
STRING *debug);
|
||||
|
||||
// Set to true when bounding boxes for individual unichars are recorded.
|
||||
bool truth_has_char_boxes;
|
||||
// The true_word (in the original image coordinate space) contains ground
|
||||
// truth bounding boxes for this WERD_RES.
|
||||
tesseract::BoxWord truth_word;
|
||||
// Same as above, but in normalized coordinates
|
||||
// (filled in by WERD_RES::SetupForRecognition()).
|
||||
tesseract::BoxWord norm_truth_word;
|
||||
// Tolerance for bounding box comparisons in normalized space.
|
||||
int norm_box_tolerance;
|
||||
// Contains ground truth unichar for each of the bounding boxes in truth_word.
|
||||
GenericVector<STRING> truth_text;
|
||||
// The reason for incorrect OCR result.
|
||||
IncorrectResultReason incorrect_result_reason;
|
||||
// Debug text associated with the blame.
|
||||
STRING debug;
|
||||
// Misadaption debug information (filled in if this word was misadapted to).
|
||||
STRING misadaption_debug;
|
||||
// Variables used by the segmentation search when looking for the blame.
|
||||
// Set to true while segmentation search is continued after the usual
|
||||
// termination condition in order to look for the blame.
|
||||
bool segsearch_is_looking_for_blame;
|
||||
// Best rating for correctly segmented path
|
||||
// (set and used by SegSearch when looking for blame).
|
||||
float best_correctly_segmented_rating;
|
||||
// Vectors populated by SegSearch to indicate column and row indices that
|
||||
// correspond to blobs with correct bounding boxes.
|
||||
GenericVector<int> correct_segmentation_cols;
|
||||
GenericVector<int> correct_segmentation_rows;
|
||||
// Set to true if best choice is a dictionary word and
|
||||
// classifier's top choice.
|
||||
bool best_choice_is_dict_and_top_choice;
|
||||
// Serialized segmentation search lattice.
|
||||
char *lattice_data;
|
||||
int lattice_size; // size of lattice_data in bytes
|
||||
// Information about hypotheses (paths) explored by the segmentation search.
|
||||
tesseract::ParamsTrainingBundle params_training_bundle;
|
||||
};
|
||||
|
||||
/* Forward declarations */
|
||||
|
||||
class BLOCK_RES;
|
||||
@ -341,8 +181,11 @@ class WERD_RES : public ELIST_LINK {
|
||||
// TODO(rays) determine if docqual does anything useful and delete bln_boxes
|
||||
// if it doesn't.
|
||||
tesseract::BoxWord* bln_boxes; // BLN input bounding boxes.
|
||||
// The ROW that this word sits in. NOT owned by the WERD_RES.
|
||||
ROW* blob_row;
|
||||
// The denorm provides the transformation to get back to the rotated image
|
||||
// coords from the chopped_word/rebuild_word BLN coords.
|
||||
// coords from the chopped_word/rebuild_word BLN coords, but each blob also
|
||||
// has its own denorm.
|
||||
DENORM denorm; // For use on chopped_word.
|
||||
// Unicharset used by the classifier output in best_choice and raw_choice.
|
||||
const UNICHARSET* uch_set; // For converting back to utf8.
|
||||
@ -355,13 +198,32 @@ class WERD_RES : public ELIST_LINK {
|
||||
// character fragments that make up the word.
|
||||
// The length of chopped_word matches length of seam_array + 1 (if set).
|
||||
TWERD* chopped_word; // BLN chopped fragments output.
|
||||
SEAMS seam_array; // Seams matching chopped_word.
|
||||
WERD_CHOICE *best_choice; // tess output
|
||||
WERD_CHOICE *raw_choice; // top choice permuter
|
||||
// Alternative paths found during chopping/segmentation search stages
|
||||
// (the first entry being a slim copy of best_choice).
|
||||
GenericVector<WERD_CHOICE *> alt_choices;
|
||||
GenericVector<GenericVector<int> > alt_states;
|
||||
// Vector of SEAM* holding chopping points matching chopped_word.
|
||||
GenericVector<SEAM*> seam_array;
|
||||
// Widths of blobs in chopped_word.
|
||||
GenericVector<int> blob_widths;
|
||||
// Gaps between blobs in chopped_word. blob_gaps[i] is the gap between
|
||||
// blob i and blob i+1.
|
||||
GenericVector<int> blob_gaps;
|
||||
// Ratings matrix contains classifier choices for each classified combination
|
||||
// of blobs. The dimension is the same as the number of blobs in chopped_word
|
||||
// and the leading diagonal corresponds to classifier results of the blobs
|
||||
// in chopped_word. The state_ members of best_choice, raw_choice and
|
||||
// best_choices all correspond to this ratings matrix and allow extraction
|
||||
// of the blob choices for any given WERD_CHOICE.
|
||||
MATRIX* ratings; // Owned pointer.
|
||||
// Pointer to the first WERD_CHOICE in best_choices. This is the result that
|
||||
// will be output from Tesseract. Note that this is now a borrowed pointer
|
||||
// and should NOT be deleted.
|
||||
WERD_CHOICE* best_choice; // Borrowed pointer.
|
||||
// The best raw_choice found during segmentation search. Differs from the
|
||||
// best_choice by being the best result according to just the character
|
||||
// classifier, not taking any language model information into account.
|
||||
// Unlike best_choice, the pointer IS owned by this WERD_RES.
|
||||
WERD_CHOICE* raw_choice; // Owned pointer.
|
||||
// Alternative results found during chopping/segmentation search stages.
|
||||
// Note that being an ELIST, best_choices owns the WERD_CHOICEs.
|
||||
WERD_CHOICE_LIST best_choices;
|
||||
|
||||
// Truth bounding boxes, text and incorrect choice reason.
|
||||
BlamerBundle *blamer_bundle;
|
||||
@ -462,6 +324,8 @@ class WERD_RES : public ELIST_LINK {
|
||||
InitPointers();
|
||||
word = the_word;
|
||||
}
|
||||
// Deep copies everything except the ratings MATRIX.
|
||||
// To get that use deep_copy below.
|
||||
WERD_RES(const WERD_RES &source) {
|
||||
InitPointers();
|
||||
*this = source; // see operator=
|
||||
@ -545,7 +409,11 @@ class WERD_RES : public ELIST_LINK {
|
||||
void InitPointers();
|
||||
void Clear();
|
||||
void ClearResults();
|
||||
void ClearWordChoices();
|
||||
void ClearRatings();
|
||||
|
||||
// Deep copies everything except the ratings MATRIX.
|
||||
// To get that use deep_copy below.
|
||||
WERD_RES& operator=(const WERD_RES& source); //from this
|
||||
|
||||
void CopySimpleFields(const WERD_RES& source);
|
||||
@ -557,18 +425,28 @@ class WERD_RES : public ELIST_LINK {
|
||||
void InitForRetryRecognition(const WERD_RES& source);
|
||||
|
||||
// Sets up the members used in recognition: bln_boxes, chopped_word,
|
||||
// seam_array, denorm, best_choice, raw_choice. Returns false if
|
||||
// seam_array, denorm. Returns false if
|
||||
// the word is empty and sets up fake results. If use_body_size is
|
||||
// true and row->body_size is set, then body_size will be used for
|
||||
// blob normalization instead of xheight + ascrise. This flag is for
|
||||
// those languages that are using CJK pitch model and thus it has to
|
||||
// be true if and only if tesseract->textord_use_cjk_fp_model is
|
||||
// true.
|
||||
// If allow_detailed_fx is true, the feature extractor will receive fine
|
||||
// precision outline information, allowing smoother features and better
|
||||
// features on low resolution images.
|
||||
// Returns false if the word is empty and sets up fake results.
|
||||
bool SetupForTessRecognition(const UNICHARSET& unicharset_in,
|
||||
tesseract::Tesseract* tesseract, Pix* pix,
|
||||
bool numeric_mode, bool use_body_size,
|
||||
bool allow_detailed_fx,
|
||||
ROW *row, BLOCK* block);
|
||||
|
||||
// Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
|
||||
// accumulators from a made chopped word. We presume the fields are already
|
||||
// empty.
|
||||
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in);
|
||||
|
||||
// Sets up the members used in recognition:
|
||||
// bln_boxes, chopped_word, seam_array, denorm.
|
||||
// Returns false if the word is empty and sets up fake results.
|
||||
@ -586,6 +464,87 @@ class WERD_RES : public ELIST_LINK {
|
||||
// Sets up the blamer_bundle if it is not null, using the initialized denorm.
|
||||
void SetupBlamerBundle();
|
||||
|
||||
// Computes the blob_widths and blob_gaps from the chopped_word.
|
||||
void SetupBlobWidthsAndGaps();
|
||||
|
||||
// Updates internal data to account for a new SEAM (chop) at the given
|
||||
// blob_number. Fixes the ratings matrix and states in the choices, as well
|
||||
// as the blob widths and gaps.
|
||||
void InsertSeam(int blob_number, SEAM* seam);
|
||||
|
||||
// Returns true if all the word choices except the first have adjust_factors
|
||||
// worse than the given threshold.
|
||||
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const;
|
||||
|
||||
// Returns true if the current word is ambiguous (by number of answers or
|
||||
// by dangerous ambigs.)
|
||||
bool IsAmbiguous();
|
||||
|
||||
// Returns true if the ratings matrix size matches the sum of each of the
|
||||
// segmentation states.
|
||||
bool StatesAllValid();
|
||||
|
||||
// Prints a list of words found if debug is true or the word result matches
|
||||
// the word_to_debug.
|
||||
void DebugWordChoices(bool debug, const char* word_to_debug);
|
||||
|
||||
// Removes from best_choices all choices which are not within a reasonable
|
||||
// range of the best choice.
|
||||
void FilterWordChoices(int debug_level);
|
||||
|
||||
// Computes a set of distance thresholds used to control adaption.
|
||||
// Compares the best choice for the current word to the best raw choice
|
||||
// to determine which characters were classified incorrectly by the
|
||||
// classifier. Then places a separate threshold into thresholds for each
|
||||
// character in the word. If the classifier was correct, max_rating is placed
|
||||
// into thresholds. If the classifier was incorrect, the mean match rating
|
||||
// (error percentage) of the classifier's incorrect choice minus some margin
|
||||
// is placed into thresholds. This can then be used by the caller to try to
|
||||
// create a new template for the desired class that will classify the
|
||||
// character with a rating better than the threshold value. The match rating
|
||||
// placed into thresholds is never allowed to be below min_rating in order to
|
||||
// prevent trying to make overly tight templates.
|
||||
// min_rating limits how tight to make a template.
|
||||
// max_rating limits how loose to make a template.
|
||||
// rating_margin denotes the amount of margin to put in template.
|
||||
void ComputeAdaptionThresholds(float certainty_scale,
|
||||
float min_rating,
|
||||
float max_rating,
|
||||
float rating_margin,
|
||||
float* thresholds);
|
||||
|
||||
// Saves a copy of the word_choice if it has the best unadjusted rating.
|
||||
// Returns true if the word_choice was the new best.
|
||||
bool LogNewRawChoice(WERD_CHOICE* word_choice);
|
||||
// Consumes word_choice by adding it to best_choices, (taking ownership) if
|
||||
// the certainty for word_choice is some distance of the best choice in
|
||||
// best_choices, or by deleting the word_choice and returning false.
|
||||
// The best_choices list is kept in sorted order by rating. Duplicates are
|
||||
// removed, and the list is kept no longer than max_num_choices in length.
|
||||
// Returns true if the word_choice is still a valid pointer.
|
||||
bool LogNewCookedChoice(int max_num_choices, bool debug,
|
||||
WERD_CHOICE* word_choice);
|
||||
|
||||
// Prints a brief list of all the best choices.
|
||||
void PrintBestChoices() const;
|
||||
|
||||
// Returns the sum of the widths of the blob between start_blob and last_blob
|
||||
// inclusive.
|
||||
int GetBlobsWidth(int start_blob, int last_blob);
|
||||
// Returns the width of a gap between the specified blob and the next one.
|
||||
int GetBlobsGap(int blob_index);
|
||||
|
||||
// Returns the BLOB_CHOICE corresponding to the given index in the
|
||||
// best choice word taken from the appropriate cell in the ratings MATRIX.
|
||||
// Borrowed pointer, so do not delete. May return NULL if there is no
|
||||
// BLOB_CHOICE matching the unichar_id at the given index.
|
||||
BLOB_CHOICE* GetBlobChoice(int index) const;
|
||||
|
||||
// Returns the BLOB_CHOICE_LIST corresponding to the given index in the
|
||||
// best choice word taken from the appropriate cell in the ratings MATRIX.
|
||||
// Borrowed pointer, so do not delete.
|
||||
BLOB_CHOICE_LIST* GetBlobChoices(int index) const;
|
||||
|
||||
// Moves the results fields from word to this. This takes ownership of all
|
||||
// the data, so src can be destructed.
|
||||
// word1.ConsumeWordResult(word);
|
||||
@ -597,10 +556,11 @@ class WERD_RES : public ELIST_LINK {
|
||||
void ConsumeWordResults(WERD_RES* word);
|
||||
|
||||
// Replace the best choice and rebuild box word.
|
||||
void ReplaceBestChoice(const WERD_CHOICE& choice,
|
||||
const GenericVector<int> &segmentation_state);
|
||||
// choice must be from the current best_choices list.
|
||||
void ReplaceBestChoice(WERD_CHOICE* choice);
|
||||
|
||||
// Builds the rebuild_word from the chopped_word and the best_state.
|
||||
// Builds the rebuild_word and sets the best_state from the chopped_word and
|
||||
// the best_choice->state.
|
||||
void RebuildBestState();
|
||||
|
||||
// Copies the chopped_word to the rebuild_word, faking a best_state as well.
|
||||
@ -610,30 +570,26 @@ class WERD_RES : public ELIST_LINK {
|
||||
// Sets/replaces the box_word with one made from the rebuild_word.
|
||||
void SetupBoxWord();
|
||||
|
||||
// Sets up the script positions in the output boxword using the best_choice
|
||||
// Sets up the script positions in the best_choice using the best_choice
|
||||
// to get the unichars, and the unicharset to get the target positions.
|
||||
void SetScriptPositions();
|
||||
|
||||
// Returns the indices [start, end) containing the core of the word, stripped
|
||||
// of any superscript digits on either side.
|
||||
// (i.e., the non-footnote part of the word).
|
||||
// Assumes that BoxWord is all set up for best_choice.
|
||||
void WithoutFootnoteSpan(int *start, int *end) const;
|
||||
|
||||
// Given an alternate word choice and segmentation state, yield the indices
|
||||
// [start, end) containig the core of the word, stripped of any superscript
|
||||
// digits on either side. (i.e. stripping off the footnote parts).
|
||||
void WithoutFootnoteSpan(
|
||||
const WERD_CHOICE &choice, const GenericVector<int> &state,
|
||||
int *start, int *end) const;
|
||||
// Sets all the blobs in all the words (best choice and alternates) to be
|
||||
// the given position. (When a sub/superscript is recognized as a separate
|
||||
// word, it falls victim to the rule that a whole word cannot be sub or
|
||||
// superscript, so this function overrides that problem.)
|
||||
void SetAllScriptPositions(tesseract::ScriptPos position);
|
||||
|
||||
// Classifies the word with some already-calculated BLOB_CHOICEs.
|
||||
// The choices are an array of blob_count pointers to BLOB_CHOICE,
|
||||
// providing a single classifier result for each blob.
|
||||
// The BLOB_CHOICEs are consumed and the word takes ownership.
|
||||
// The number of blobs in the outword must match blob_count.
|
||||
// The number of blobs in the box_word must match blob_count.
|
||||
void FakeClassifyWord(int blob_count, BLOB_CHOICE** choices);
|
||||
|
||||
// Creates a WERD_CHOICE for the word using the top choices from the leading
|
||||
// diagonal of the ratings matrix.
|
||||
void FakeWordFromRatings();
|
||||
|
||||
// Copies the best_choice strings to the correct_text for adaption/training.
|
||||
void BestChoiceToCorrectText();
|
||||
|
||||
@ -644,13 +600,16 @@ class WERD_RES : public ELIST_LINK {
|
||||
// Returns true if anything was merged.
|
||||
bool ConditionalBlobMerge(
|
||||
TessResultCallback2<UNICHAR_ID, UNICHAR_ID, UNICHAR_ID>* class_cb,
|
||||
TessResultCallback2<bool, const TBOX&, const TBOX&>* box_cb,
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices);
|
||||
TessResultCallback2<bool, const TBOX&, const TBOX&>* box_cb);
|
||||
|
||||
// Merges 2 adjacent blobs in the result (index and index+1) and corrects
|
||||
// all the data to account for the change.
|
||||
void MergeAdjacentBlobs(int index);
|
||||
|
||||
// Callback helper for fix_quotes returns a double quote if both
|
||||
// arguments are quote, otherwise INVALID_UNICHAR_ID.
|
||||
UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2);
|
||||
void fix_quotes(BLOB_CHOICE_LIST_CLIST *blob_choices);
|
||||
void fix_quotes();
|
||||
|
||||
// Callback helper for fix_hyphens returns UNICHAR_ID of - if both
|
||||
// arguments are hyphen, otherwise INVALID_UNICHAR_ID.
|
||||
@ -658,15 +617,21 @@ class WERD_RES : public ELIST_LINK {
|
||||
// Callback helper for fix_hyphens returns true if box1 and box2 overlap
|
||||
// (assuming both on the same textline, are in order and a chopped em dash.)
|
||||
bool HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2);
|
||||
void fix_hyphens(BLOB_CHOICE_LIST_CLIST *blob_choices);
|
||||
void fix_hyphens();
|
||||
|
||||
// Callback helper for merge_tess_fails returns a space if both
|
||||
// arguments are space, otherwise INVALID_UNICHAR_ID.
|
||||
UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2);
|
||||
void merge_tess_fails();
|
||||
|
||||
// Returns a really deep copy of *src, including the ratings MATRIX.
|
||||
static WERD_RES* deep_copy(const WERD_RES* src) {
|
||||
return new WERD_RES(*src);
|
||||
WERD_RES* result = new WERD_RES(*src);
|
||||
// That didn't copy the ratings, but we want a copy if there is one to
|
||||
// begin width.
|
||||
if (src->ratings != NULL)
|
||||
result->ratings = src->ratings->DeepCopy();
|
||||
return result;
|
||||
}
|
||||
|
||||
// Copy blobs from word_res onto this word (eliminating spaces between).
|
||||
|
40
ccstruct/params_training_featdef.cpp
Normal file
40
ccstruct/params_training_featdef.cpp
Normal file
@ -0,0 +1,40 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: params_training_featdef.cpp
|
||||
// Description: Utility functions for params training features.
|
||||
// Author: David Eger
|
||||
// Created: Mon Jun 11 11:26:42 PDT 2012
|
||||
//
|
||||
// (C) Copyright 2012, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include "params_training_featdef.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
int ParamsTrainingFeatureByName(const char *name) {
|
||||
if (name == NULL)
|
||||
return -1;
|
||||
int array_size = sizeof(kParamsTrainingFeatureTypeName) /
|
||||
sizeof(kParamsTrainingFeatureTypeName[0]);
|
||||
for (int i = 0; i < array_size; i++) {
|
||||
if (kParamsTrainingFeatureTypeName[i] == NULL)
|
||||
continue;
|
||||
if (strcmp(name, kParamsTrainingFeatureTypeName[i]) == 0)
|
||||
return i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
@ -25,67 +25,97 @@
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Maximum number of unichars in the small and medium sized words
|
||||
static const int kMaxSmallWordUnichars = 3;
|
||||
static const int kMaxMediumWordUnichars = 6;
|
||||
|
||||
// Raw features extracted from a single OCR hypothesis.
|
||||
// The features are non-normalized real-valued quantities with
|
||||
// unbounded range and unknown distribution.
|
||||
// The features are normalized (by outline length or number of unichars as
|
||||
// appropriate) real-valued quantities with unbounded range and
|
||||
// unknown distribution.
|
||||
// Normalization / binarization of these features is done at a later stage.
|
||||
// Note: when adding new fields to this enum make sure to modify
|
||||
// kParamsTrainingRawFeatureTypeName enum accordingly.
|
||||
enum ParamsTrainingRawFeatureType {
|
||||
// What dictionary (if any) was this hypothesis found in.
|
||||
// See PermuterType enum in ccstruct/ratngs.h for interpretation.
|
||||
PTRAIN_RAW_FEATURE_DICT_MATCH_TYPE, // 0
|
||||
// Boolean indicator of whether this hypothesis is ambiguous to a known
|
||||
// dictionary word (or a valid number pattern).
|
||||
PTRAIN_RAW_FEATURE_UNAMBIG_DICT_MATCH, // 1
|
||||
// Shape cost of the segmentation path for this hypothesis.
|
||||
PTRAIN_RAW_FEATURE_SHAPE_COST, // 2
|
||||
// Character ngram probability of the string of unichars of this hypothesis.
|
||||
PTRAIN_RAW_FEATURE_NGRAM_PROB, // 3
|
||||
// Number of bad/inconsistent spots in this hypothesis.
|
||||
PTRAIN_RAW_FEATURE_NUM_BAD_PUNC, // 4
|
||||
PTRAIN_RAW_FEATURE_NUM_BAD_CASE, // 5
|
||||
PTRAIN_RAW_FEATURE_NUM_BAD_CHAR_TYPE, // 6
|
||||
PTRAIN_RAW_FEATURE_NUM_BAD_SPACING, // 7
|
||||
PTRAIN_RAW_FEATURE_NUM_BAD_SCRIPT, // 8
|
||||
PTRAIN_RAW_FEATURE_NUM_BAD_FONT, // 9
|
||||
// Classifier-related features.
|
||||
PTRAIN_RAW_FEATURE_WORST_CERT, // 10
|
||||
PTRAIN_RAW_FEATURE_RATING, // 11
|
||||
// Number of classifier results that came from adapted templates.
|
||||
PTRAIN_RAW_FEATURE_ADAPTED, // 12
|
||||
// Features potentially useful for normalization.
|
||||
PTRAIN_RAW_FEATURE_NUM_UNICHARS, // 13
|
||||
PTRAIN_RAW_FEATURE_OUTLINE_LEN, // 14
|
||||
// kParamsTrainingFeatureTypeName
|
||||
enum kParamsTrainingFeatureType {
|
||||
// Digits
|
||||
PTRAIN_DIGITS_SHORT, // 0
|
||||
PTRAIN_DIGITS_MED, // 1
|
||||
PTRAIN_DIGITS_LONG, // 2
|
||||
// Number or pattern (NUMBER_PERM, USER_PATTERN_PERM)
|
||||
PTRAIN_NUM_SHORT, // 3
|
||||
PTRAIN_NUM_MED, // 4
|
||||
PTRAIN_NUM_LONG, // 5
|
||||
// Document word (DOC_DAWG_PERM)
|
||||
PTRAIN_DOC_SHORT, // 6
|
||||
PTRAIN_DOC_MED, // 7
|
||||
PTRAIN_DOC_LONG, // 8
|
||||
// Word (SYSTEM_DAWG_PERM, USER_DAWG_PERM, COMPOUND_PERM)
|
||||
PTRAIN_DICT_SHORT, // 9
|
||||
PTRAIN_DICT_MED, // 10
|
||||
PTRAIN_DICT_LONG, // 11
|
||||
// Frequent word (FREQ_DAWG_PERM)
|
||||
PTRAIN_FREQ_SHORT, // 12
|
||||
PTRAIN_FREQ_MED, // 13
|
||||
PTRAIN_FREQ_LONG, // 14
|
||||
PTRAIN_SHAPE_COST_PER_CHAR, // 15
|
||||
PTRAIN_NGRAM_COST_PER_CHAR, // 16
|
||||
PTRAIN_NUM_BAD_PUNC, // 17
|
||||
PTRAIN_NUM_BAD_CASE, // 18
|
||||
PTRAIN_XHEIGHT_CONSISTENCY, // 19
|
||||
PTRAIN_NUM_BAD_CHAR_TYPE, // 20
|
||||
PTRAIN_NUM_BAD_SPACING, // 21
|
||||
PTRAIN_NUM_BAD_FONT, // 22
|
||||
PTRAIN_RATING_PER_CHAR, // 23
|
||||
|
||||
PTRAIN_NUM_RAW_FEATURE_TYPES
|
||||
PTRAIN_NUM_FEATURE_TYPES
|
||||
};
|
||||
|
||||
static const char * const kParamsTrainingRawFeatureTypeName[] = {
|
||||
"DICT_MATCH_TYPE", // 0
|
||||
"UNAMBIG_DICT_MATCH", // 1
|
||||
"SHAPE_COST", // 2
|
||||
"NGRAM_PROB", // 3
|
||||
"NUM_BAD_PUNC", // 4
|
||||
"NUM_BAD_CASE", // 5
|
||||
"NUM_BAD_CHAR_TYPE", // 6
|
||||
"NUM_BAD_SPACING", // 7
|
||||
"NUM_BAD_SCRIPT", // 8
|
||||
"NUM_BAD_FONT", // 9
|
||||
"WORST_CERT", // 10
|
||||
"RATING", // 11
|
||||
"ADAPTED", // 12
|
||||
"NUM_UNICHARS", // 13
|
||||
"OUTLINE_LEN", // 14
|
||||
static const char * const kParamsTrainingFeatureTypeName[] = {
|
||||
"PTRAIN_DIGITS_SHORT", // 0
|
||||
"PTRAIN_DIGITS_MED", // 1
|
||||
"PTRAIN_DIGITS_LONG", // 2
|
||||
"PTRAIN_NUM_SHORT", // 3
|
||||
"PTRAIN_NUM_MED", // 4
|
||||
"PTRAIN_NUM_LONG", // 5
|
||||
"PTRAIN_DOC_SHORT", // 6
|
||||
"PTRAIN_DOC_MED", // 7
|
||||
"PTRAIN_DOC_LONG", // 8
|
||||
"PTRAIN_DICT_SHORT", // 9
|
||||
"PTRAIN_DICT_MED", // 10
|
||||
"PTRAIN_DICT_LONG", // 11
|
||||
"PTRAIN_FREQ_SHORT", // 12
|
||||
"PTRAIN_FREQ_MED", // 13
|
||||
"PTRAIN_FREQ_LONG", // 14
|
||||
"PTRAIN_SHAPE_COST_PER_CHAR", // 15
|
||||
"PTRAIN_NGRAM_COST_PER_CHAR", // 16
|
||||
"PTRAIN_NUM_BAD_PUNC", // 17
|
||||
"PTRAIN_NUM_BAD_CASE", // 18
|
||||
"PTRAIN_XHEIGHT_CONSISTENCY", // 19
|
||||
"PTRAIN_NUM_BAD_CHAR_TYPE", // 20
|
||||
"PTRAIN_NUM_BAD_SPACING", // 21
|
||||
"PTRAIN_NUM_BAD_FONT", // 22
|
||||
"PTRAIN_RATING_PER_CHAR", // 23
|
||||
};
|
||||
|
||||
// Returns the index of the given feature (by name),
|
||||
// or -1 meaning the feature is unknown.
|
||||
int ParamsTrainingFeatureByName(const char *name);
|
||||
|
||||
|
||||
// Entry with features extracted from a single OCR hypothesis for a word.
|
||||
struct ParamsTrainingHypothesis {
|
||||
ParamsTrainingHypothesis() {
|
||||
for (int i = 0; i < PTRAIN_NUM_RAW_FEATURE_TYPES; ++i) features[i] = 0.0;
|
||||
ParamsTrainingHypothesis() : cost(0.0) {
|
||||
memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
|
||||
}
|
||||
float features[PTRAIN_NUM_RAW_FEATURE_TYPES];
|
||||
ParamsTrainingHypothesis(const ParamsTrainingHypothesis &other) {
|
||||
memcpy(features, other.features,
|
||||
sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
|
||||
str = other.str;
|
||||
cost = other.cost;
|
||||
}
|
||||
float features[PTRAIN_NUM_FEATURE_TYPES];
|
||||
STRING str; // string corresponding to word hypothesis (for debugging)
|
||||
float cost; // path cost computed by segsearch
|
||||
};
|
||||
|
||||
// A list of hypotheses explored during one run of segmentation search.
|
||||
@ -104,9 +134,10 @@ class ParamsTrainingBundle {
|
||||
}
|
||||
// Adds a new ParamsTrainingHypothesis to the current hypothesis list
|
||||
// and returns the reference to the newly added entry.
|
||||
ParamsTrainingHypothesis &AddHypothesis() {
|
||||
ParamsTrainingHypothesis &AddHypothesis(
|
||||
const ParamsTrainingHypothesis &other) {
|
||||
if (hyp_list_vec.empty()) StartHypothesisList();
|
||||
hyp_list_vec.back().push_back(ParamsTrainingHypothesis());
|
||||
hyp_list_vec.back().push_back(ParamsTrainingHypothesis(other));
|
||||
return hyp_list_vec.back().back();
|
||||
}
|
||||
|
||||
|
@ -19,13 +19,33 @@
|
||||
|
||||
#include "ratngs.h"
|
||||
|
||||
#include "blobs.h"
|
||||
#include "callcpp.h"
|
||||
#include "genericvector.h"
|
||||
#include "matrix.h"
|
||||
#include "normalis.h" // kBlnBaselineOffset.
|
||||
#include "unicharset.h"
|
||||
|
||||
ELISTIZE (BLOB_CHOICE) CLISTIZE (BLOB_CHOICE_LIST) CLISTIZE (WERD_CHOICE);
|
||||
using tesseract::ScriptPos;
|
||||
|
||||
ELISTIZE(BLOB_CHOICE);
|
||||
ELISTIZE(WERD_CHOICE);
|
||||
|
||||
const float WERD_CHOICE::kBadRating = 100000.0;
|
||||
// Min offset in baseline-normalized coords to make a character a subscript.
|
||||
const int kMinSubscriptOffset = 20;
|
||||
// Min offset in baseline-normalized coords to make a character a superscript.
|
||||
const int kMinSuperscriptOffset = 20;
|
||||
// Max y of bottom of a drop-cap blob.
|
||||
const int kMaxDropCapBottom = -128;
|
||||
// Max fraction of x-height to use as denominator in measuring x-height overlap.
|
||||
const double kMaxOverlapDenominator = 0.125;
|
||||
// Min fraction of x-height range that should be in agreement for matching
|
||||
// x-heights.
|
||||
const double kMinXHeightMatch = 0.5;
|
||||
// Max tolerance on baseline position as a fraction of x-height for matching
|
||||
// baselines.
|
||||
const double kMaxBaselineDrift = 0.0625;
|
||||
|
||||
static const char kPermuterTypeNoPerm[] = "None";
|
||||
static const char kPermuterTypePuncPerm[] = "Punctuation";
|
||||
@ -68,20 +88,20 @@ BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
|
||||
inT16 src_fontinfo_id, // font
|
||||
inT16 src_fontinfo_id2, // 2nd choice font
|
||||
int src_script_id, // script
|
||||
inT16 min_xheight, // min xheight allowed
|
||||
inT16 max_xheight, // max xheight by this char
|
||||
bool adapted // adapted match or not
|
||||
) {
|
||||
float min_xheight, // min xheight allowed
|
||||
float max_xheight, // max xheight by this char
|
||||
float yshift, // yshift out of position
|
||||
BlobChoiceClassifier c) { // adapted match or other
|
||||
unichar_id_ = src_unichar_id;
|
||||
rating_ = src_rating;
|
||||
certainty_ = src_cert;
|
||||
fontinfo_id_ = src_fontinfo_id;
|
||||
fontinfo_id2_ = src_fontinfo_id2;
|
||||
script_id_ = src_script_id;
|
||||
language_model_state_ = NULL;
|
||||
min_xheight_ = min_xheight;
|
||||
max_xheight_ = max_xheight;
|
||||
adapted_ = adapted;
|
||||
yshift_ = yshift;
|
||||
classifier_ = c;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -96,12 +116,75 @@ BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) {
|
||||
fontinfo_id_ = other.fontinfo_id();
|
||||
fontinfo_id2_ = other.fontinfo_id2();
|
||||
script_id_ = other.script_id();
|
||||
language_model_state_ = NULL;
|
||||
matrix_cell_ = other.matrix_cell_;
|
||||
min_xheight_ = other.min_xheight_;
|
||||
max_xheight_ = other.max_xheight_;
|
||||
adapted_ = other.adapted_;
|
||||
yshift_ = other.yshift();
|
||||
classifier_ = other.classifier_;
|
||||
}
|
||||
|
||||
// Returns true if *this and other agree on the baseline and x-height
|
||||
// to within some tolerance based on a given estimate of the x-height.
|
||||
bool BLOB_CHOICE::PosAndSizeAgree(const BLOB_CHOICE& other, float x_height,
|
||||
bool debug) const {
|
||||
double baseline_diff = fabs(yshift() - other.yshift());
|
||||
if (baseline_diff > kMaxBaselineDrift * x_height) {
|
||||
if (debug) {
|
||||
tprintf("Baseline diff %g for %d v %d\n",
|
||||
baseline_diff, unichar_id_, other.unichar_id_);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
double this_range = max_xheight() - min_xheight();
|
||||
double other_range = other.max_xheight() - other.min_xheight();
|
||||
double denominator = ClipToRange(MIN(this_range, other_range),
|
||||
1.0, kMaxOverlapDenominator * x_height);
|
||||
double overlap = MIN(max_xheight(), other.max_xheight()) -
|
||||
MAX(min_xheight(), other.min_xheight());
|
||||
overlap /= denominator;
|
||||
if (debug) {
|
||||
tprintf("PosAndSize for %d v %d: bl diff = %g, ranges %g, %g / %g ->%g\n",
|
||||
unichar_id_, other.unichar_id_, baseline_diff,
|
||||
this_range, other_range, denominator, overlap);
|
||||
}
|
||||
|
||||
return overlap >= kMinXHeightMatch;
|
||||
}
|
||||
|
||||
// Helper to find the BLOB_CHOICE in the bc_list that matches the given
|
||||
// unichar_id, or NULL if there is no match.
|
||||
BLOB_CHOICE* FindMatchingChoice(UNICHAR_ID char_id,
|
||||
BLOB_CHOICE_LIST* bc_list) {
|
||||
// Find the corresponding best BLOB_CHOICE.
|
||||
BLOB_CHOICE_IT choice_it(bc_list);
|
||||
for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
|
||||
choice_it.forward()) {
|
||||
BLOB_CHOICE* choice = choice_it.data();
|
||||
if (choice->unichar_id() == char_id) {
|
||||
return choice;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
const char *WERD_CHOICE::permuter_name(uinT8 permuter) {
|
||||
return kPermuterTypeNames[permuter];
|
||||
}
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
const char *ScriptPosToString(enum ScriptPos script_pos) {
|
||||
switch (script_pos) {
|
||||
case SP_NORMAL: return "NORM";
|
||||
case SP_SUBSCRIPT: return "SUB";
|
||||
case SP_SUPERSCRIPT: return "SUPER";
|
||||
case SP_DROPCAP: return "DROPC";
|
||||
}
|
||||
return "SP_UNKNOWN";
|
||||
}
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
/**
|
||||
* WERD_CHOICE::WERD_CHOICE
|
||||
*
|
||||
@ -111,16 +194,13 @@ BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) {
|
||||
WERD_CHOICE::WERD_CHOICE(const char *src_string,
|
||||
const UNICHARSET &unicharset)
|
||||
: unicharset_(&unicharset){
|
||||
STRING src_lengths;
|
||||
const char *ptr = src_string;
|
||||
const char *end = src_string + strlen(src_string);
|
||||
int step = unicharset.step(ptr);
|
||||
for (; ptr < end && step > 0;
|
||||
step = unicharset.step(ptr), src_lengths += step, ptr += step);
|
||||
if (step != 0 && ptr == end) {
|
||||
this->init(src_string, src_lengths.string(),
|
||||
0.0, 0.0, NO_PERM);
|
||||
} else { // there must have been an invalid unichar in the string
|
||||
GenericVector<UNICHAR_ID> encoding;
|
||||
GenericVector<char> lengths;
|
||||
if (unicharset.encode_string(src_string, true, &encoding, &lengths, NULL)) {
|
||||
lengths.push_back('\0');
|
||||
STRING src_lengths = &lengths[0];
|
||||
this->init(src_string, src_lengths.string(), 0.0, 0.0, NO_PERM);
|
||||
} else { // There must have been an invalid unichar in the string.
|
||||
this->init(8);
|
||||
this->make_bad();
|
||||
}
|
||||
@ -152,13 +232,16 @@ void WERD_CHOICE::init(const char *src_string,
|
||||
int unichar_length = src_lengths ? src_lengths[i] : 1;
|
||||
unichar_ids_[i] =
|
||||
unicharset_->unichar_to_id(src_string+offset, unichar_length);
|
||||
fragment_lengths_[i] = 1;
|
||||
state_[i] = 1;
|
||||
certainties_[i] = src_certainty;
|
||||
offset += unichar_length;
|
||||
}
|
||||
}
|
||||
adjust_factor_ = 1.0f;
|
||||
rating_ = src_rating;
|
||||
certainty_ = src_certainty;
|
||||
permuter_ = src_permuter;
|
||||
dangerous_ambig_found_ = false;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -166,25 +249,46 @@ void WERD_CHOICE::init(const char *src_string,
|
||||
*/
|
||||
WERD_CHOICE::~WERD_CHOICE() {
|
||||
delete[] unichar_ids_;
|
||||
delete[] fragment_lengths_;
|
||||
delete_blob_choices();
|
||||
delete[] script_pos_;
|
||||
delete[] state_;
|
||||
delete[] certainties_;
|
||||
}
|
||||
|
||||
const char *WERD_CHOICE::permuter_name() const {
|
||||
return kPermuterTypeNames[permuter_];
|
||||
}
|
||||
|
||||
/**
|
||||
* WERD_CHOICE::set_blob_choices
|
||||
*
|
||||
* Delete current blob_choices. Set the blob_choices to the given new
|
||||
* list.
|
||||
*/
|
||||
void WERD_CHOICE::set_blob_choices(BLOB_CHOICE_LIST_CLIST *blob_choices) {
|
||||
if (blob_choices_ != blob_choices) {
|
||||
delete_blob_choices();
|
||||
blob_choices_ = blob_choices;
|
||||
// Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
|
||||
// taken from the appropriate cell in the ratings MATRIX.
|
||||
// Borrowed pointer, so do not delete.
|
||||
BLOB_CHOICE_LIST* WERD_CHOICE::blob_choices(int index, MATRIX* ratings) const {
|
||||
MATRIX_COORD coord = MatrixCoord(index);
|
||||
BLOB_CHOICE_LIST* result = ratings->get(coord.col, coord.row);
|
||||
if (result == NULL) {
|
||||
result = new BLOB_CHOICE_LIST;
|
||||
ratings->put(coord.col, coord.row, result);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Returns the MATRIX_COORD corresponding to the location in the ratings
|
||||
// MATRIX for the given index into the word.
|
||||
MATRIX_COORD WERD_CHOICE::MatrixCoord(int index) const {
|
||||
int col = 0;
|
||||
for (int i = 0; i < index; ++i)
|
||||
col += state_[i];
|
||||
int row = col + state_[index] - 1;
|
||||
return MATRIX_COORD(col, row);
|
||||
}
|
||||
|
||||
// Sets the entries for the given index from the BLOB_CHOICE, assuming
|
||||
// unit fragment lengths, but setting the state for this index to blob_count.
|
||||
void WERD_CHOICE::set_blob_choice(int index, int blob_count,
|
||||
const BLOB_CHOICE* blob_choice) {
|
||||
unichar_ids_[index] = blob_choice->unichar_id();
|
||||
script_pos_[index] = tesseract::SP_NORMAL;
|
||||
state_[index] = blob_count;
|
||||
certainties_[index] = blob_choice->certainty();
|
||||
}
|
||||
|
||||
|
||||
@ -211,9 +315,18 @@ bool WERD_CHOICE::contains_unichar_id(UNICHAR_ID unichar_id) const {
|
||||
*/
|
||||
void WERD_CHOICE::remove_unichar_ids(int start, int num) {
|
||||
ASSERT_HOST(start >= 0 && start + num <= length_);
|
||||
for (int i = start; i+num < length_; ++i) {
|
||||
unichar_ids_[i] = unichar_ids_[i+num];
|
||||
fragment_lengths_[i] = fragment_lengths_[i+num];
|
||||
// Accumulate the states to account for the merged blobs.
|
||||
for (int i = 0; i < num; ++i) {
|
||||
if (start > 0)
|
||||
state_[start - 1] += state_[start + i];
|
||||
else if (start + num < length_)
|
||||
state_[start + num] += state_[start + i];
|
||||
}
|
||||
for (int i = start; i + num < length_; ++i) {
|
||||
unichar_ids_[i] = unichar_ids_[i + num];
|
||||
script_pos_[i] = script_pos_[i + num];
|
||||
state_[i] = state_[i + num];
|
||||
certainties_[i] = certainties_[i + num];
|
||||
}
|
||||
length_ -= num;
|
||||
}
|
||||
@ -224,7 +337,7 @@ void WERD_CHOICE::remove_unichar_ids(int start, int num) {
|
||||
* Reverses and mirrors unichars in unichar_ids.
|
||||
*/
|
||||
void WERD_CHOICE::reverse_and_mirror_unichar_ids() {
|
||||
for (int i = 0; i < length_/2; ++i) {
|
||||
for (int i = 0; i < length_ / 2; ++i) {
|
||||
UNICHAR_ID tmp_id = unichar_ids_[i];
|
||||
unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_-1-i]);
|
||||
unichar_ids_[length_-1-i] = unicharset_->get_mirror(tmp_id);
|
||||
@ -255,6 +368,23 @@ void WERD_CHOICE::punct_stripped(int *start, int *end) const {
|
||||
(*end)++;
|
||||
}
|
||||
|
||||
void WERD_CHOICE::GetNonSuperscriptSpan(int *pstart, int *pend) const {
|
||||
int end = length();
|
||||
while (end > 0 &&
|
||||
unicharset_->get_isdigit(unichar_ids_[end - 1]) &&
|
||||
BlobPosition(end - 1) == tesseract::SP_SUPERSCRIPT) {
|
||||
end--;
|
||||
}
|
||||
int start = 0;
|
||||
while (start < end &&
|
||||
unicharset_->get_isdigit(unichar_ids_[start]) &&
|
||||
BlobPosition(start) == tesseract::SP_SUPERSCRIPT) {
|
||||
start++;
|
||||
}
|
||||
*pstart = start;
|
||||
*pend = end;
|
||||
}
|
||||
|
||||
WERD_CHOICE WERD_CHOICE::shallow_copy(int start, int end) const {
|
||||
ASSERT_HOST(start >= 0 && start <= length_);
|
||||
ASSERT_HOST(end >= 0 && end <= length_);
|
||||
@ -262,7 +392,7 @@ WERD_CHOICE WERD_CHOICE::shallow_copy(int start, int end) const {
|
||||
WERD_CHOICE retval(unicharset_, end - start);
|
||||
for (int i = start; i < end; i++) {
|
||||
retval.append_unichar_id_space_allocated(
|
||||
unichar_ids_[i], fragment_lengths_[i], 0.0f, 0.0f);
|
||||
unichar_ids_[i], state_[i], 0.0f, certainties_[i]);
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
@ -310,12 +440,12 @@ void WERD_CHOICE::string_and_lengths(STRING *word_str,
|
||||
* and call append_unichar_id_space_allocated().
|
||||
*/
|
||||
void WERD_CHOICE::append_unichar_id(
|
||||
UNICHAR_ID unichar_id, char fragment_length,
|
||||
UNICHAR_ID unichar_id, int blob_count,
|
||||
float rating, float certainty) {
|
||||
if (length_ == reserved_) {
|
||||
this->double_the_size();
|
||||
}
|
||||
this->append_unichar_id_space_allocated(unichar_id, fragment_length,
|
||||
this->append_unichar_id_space_allocated(unichar_id, blob_count,
|
||||
rating, certainty);
|
||||
}
|
||||
|
||||
@ -327,59 +457,31 @@ void WERD_CHOICE::append_unichar_id(
|
||||
* If the permuters are NOT the same the permuter is set to COMPOUND_PERM
|
||||
*/
|
||||
WERD_CHOICE & WERD_CHOICE::operator+= (const WERD_CHOICE & second) {
|
||||
// TODO(daria): find out why the choice was cleared this way if any
|
||||
// of the pieces are empty. Add the description of this behavior
|
||||
// to the comments.
|
||||
// if (word_string.length () == 0 || second.word_string.length () == 0) {
|
||||
// word_string = NULL; //make it empty
|
||||
// word_lengths = NULL;
|
||||
// delete_blob_choices();
|
||||
// } else {
|
||||
ASSERT_HOST(unicharset_ == second.unicharset_);
|
||||
while (reserved_ < length_ + second.length()) {
|
||||
this->double_the_size();
|
||||
}
|
||||
const UNICHAR_ID *other_unichar_ids = second.unichar_ids();
|
||||
const char *other_fragment_lengths = second.fragment_lengths();
|
||||
for (int i = 0; i < second.length(); ++i) {
|
||||
unichar_ids_[length_ + i] = other_unichar_ids[i];
|
||||
fragment_lengths_[length_ + i] = other_fragment_lengths[i];
|
||||
state_[length_ + i] = second.state_[i];
|
||||
certainties_[length_ + i] = second.certainties_[i];
|
||||
script_pos_[length_ + i] = second.BlobPosition(i);
|
||||
}
|
||||
length_ += second.length();
|
||||
if (second.adjust_factor_ > adjust_factor_)
|
||||
adjust_factor_ = second.adjust_factor_;
|
||||
rating_ += second.rating(); // add ratings
|
||||
if (second.certainty() < certainty_) // take min
|
||||
certainty_ = second.certainty();
|
||||
if (second.dangerous_ambig_found_)
|
||||
dangerous_ambig_found_ = true;
|
||||
if (permuter_ == NO_PERM) {
|
||||
permuter_ = second.permuter();
|
||||
} else if (second.permuter() != NO_PERM &&
|
||||
second.permuter() != permuter_) {
|
||||
permuter_ = COMPOUND_PERM;
|
||||
}
|
||||
|
||||
// Append a deep copy of second blob_choices if it exists.
|
||||
if (second.blob_choices_ != NULL) {
|
||||
if (this->blob_choices_ == NULL)
|
||||
this->blob_choices_ = new BLOB_CHOICE_LIST_CLIST;
|
||||
|
||||
BLOB_CHOICE_LIST_C_IT this_blob_choices_it;
|
||||
BLOB_CHOICE_LIST_C_IT second_blob_choices_it;
|
||||
|
||||
this_blob_choices_it.set_to_list(this->blob_choices_);
|
||||
this_blob_choices_it.move_to_last();
|
||||
|
||||
second_blob_choices_it.set_to_list(second.blob_choices_);
|
||||
|
||||
for (second_blob_choices_it.mark_cycle_pt();
|
||||
!second_blob_choices_it.cycled_list();
|
||||
second_blob_choices_it.forward()) {
|
||||
|
||||
BLOB_CHOICE_LIST* blob_choices_copy = new BLOB_CHOICE_LIST();
|
||||
blob_choices_copy->deep_copy(second_blob_choices_it.data(),
|
||||
&BLOB_CHOICE::deep_copy);
|
||||
|
||||
this_blob_choices_it.add_after_then_move(blob_choices_copy);
|
||||
}
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -397,55 +499,202 @@ WERD_CHOICE& WERD_CHOICE::operator=(const WERD_CHOICE& source) {
|
||||
|
||||
unicharset_ = source.unicharset_;
|
||||
const UNICHAR_ID *other_unichar_ids = source.unichar_ids();
|
||||
const char *other_fragment_lengths = source.fragment_lengths();
|
||||
for (int i = 0; i < source.length(); ++i) {
|
||||
unichar_ids_[i] = other_unichar_ids[i];
|
||||
fragment_lengths_[i] = other_fragment_lengths[i];
|
||||
state_[i] = source.state_[i];
|
||||
certainties_[i] = source.certainties_[i];
|
||||
script_pos_[i] = source.BlobPosition(i);
|
||||
}
|
||||
length_ = source.length();
|
||||
adjust_factor_ = source.adjust_factor_;
|
||||
rating_ = source.rating();
|
||||
certainty_ = source.certainty();
|
||||
min_x_height_ = source.min_x_height();
|
||||
max_x_height_ = source.max_x_height();
|
||||
permuter_ = source.permuter();
|
||||
fragment_mark_ = source.fragment_mark();
|
||||
|
||||
// Delete existing blob_choices
|
||||
this->delete_blob_choices();
|
||||
|
||||
// Deep copy blob_choices of source
|
||||
if (source.blob_choices_ != NULL) {
|
||||
BLOB_CHOICE_LIST_C_IT this_blob_choices_it;
|
||||
BLOB_CHOICE_LIST_C_IT source_blob_choices_it;
|
||||
|
||||
this->blob_choices_ = new BLOB_CHOICE_LIST_CLIST();
|
||||
|
||||
this_blob_choices_it.set_to_list(this->blob_choices_);
|
||||
source_blob_choices_it.set_to_list(source.blob_choices_);
|
||||
|
||||
for (source_blob_choices_it.mark_cycle_pt();
|
||||
!source_blob_choices_it.cycled_list();
|
||||
source_blob_choices_it.forward()) {
|
||||
|
||||
BLOB_CHOICE_LIST* blob_choices_copy = new BLOB_CHOICE_LIST();
|
||||
blob_choices_copy->deep_copy(source_blob_choices_it.data(),
|
||||
&BLOB_CHOICE::deep_copy);
|
||||
|
||||
this_blob_choices_it.add_after_then_move(blob_choices_copy);
|
||||
}
|
||||
}
|
||||
dangerous_ambig_found_ = source.dangerous_ambig_found_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
* WERD_CHOICE::delete_blob_choices
|
||||
*
|
||||
* Clear the blob_choices list, delete it and set it to NULL.
|
||||
**********************************************************************/
|
||||
void WERD_CHOICE::delete_blob_choices() {
|
||||
if (blob_choices_ != NULL) {
|
||||
blob_choices_->deep_clear();
|
||||
delete blob_choices_;
|
||||
blob_choices_ = NULL;
|
||||
// Sets up the script_pos_ member using the blobs_list to get the bln
|
||||
// bounding boxes, *this to get the unichars, and this->unicharset
|
||||
// to get the target positions. If small_caps is true, sub/super are not
|
||||
// considered, but dropcaps are.
|
||||
// NOTE: blobs_list should be the chopped_word blobs. (Fully segemented.)
|
||||
void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD* word) {
|
||||
// Since WERD_CHOICE isn't supposed to depend on a Tesseract,
|
||||
// we don't have easy access to the flags Tesseract stores. Therefore, debug
|
||||
// for this module is hard compiled in.
|
||||
int debug = 0;
|
||||
|
||||
// Initialize to normal.
|
||||
for (int i = 0; i < length_; ++i)
|
||||
script_pos_[i] = tesseract::SP_NORMAL;
|
||||
if (word->blobs.empty())
|
||||
return;
|
||||
|
||||
int position_counts[4];
|
||||
for (int i = 0; i < 4; i++) {
|
||||
position_counts[i] = 0;
|
||||
}
|
||||
|
||||
int chunk_index = 0;
|
||||
for (int blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
|
||||
TBLOB* tblob = word->blobs[chunk_index];
|
||||
int uni_id = unichar_id(blob_index);
|
||||
TBOX blob_box = tblob->bounding_box();
|
||||
if (state_ != NULL) {
|
||||
for (int i = 1; i < state_[blob_index]; ++i) {
|
||||
++chunk_index;
|
||||
tblob = word->blobs[chunk_index];
|
||||
blob_box += tblob->bounding_box();
|
||||
}
|
||||
}
|
||||
script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box,
|
||||
uni_id);
|
||||
if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) {
|
||||
script_pos_[blob_index] = tesseract::SP_NORMAL;
|
||||
}
|
||||
position_counts[script_pos_[blob_index]]++;
|
||||
}
|
||||
// If almost everything looks like a superscript or subscript,
|
||||
// we most likely just got the baseline wrong.
|
||||
if (position_counts[tesseract::SP_SUBSCRIPT] > 0.75 * length_ ||
|
||||
position_counts[tesseract::SP_SUPERSCRIPT] > 0.75 * length_) {
|
||||
if (debug >= 2) {
|
||||
tprintf("Most characters of %s are subscript or superscript.\n"
|
||||
"That seems wrong, so I'll assume we got the baseline wrong\n",
|
||||
unichar_string().string());
|
||||
}
|
||||
for (int i = 0; i < length_; i++) {
|
||||
ScriptPos sp = script_pos_[i];
|
||||
if (sp == tesseract::SP_SUBSCRIPT || sp == tesseract::SP_SUPERSCRIPT) {
|
||||
position_counts[sp]--;
|
||||
position_counts[tesseract::SP_NORMAL]++;
|
||||
script_pos_[i] = tesseract::SP_NORMAL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) ||
|
||||
debug >= 2) {
|
||||
tprintf("SetScriptPosition on %s\n", unichar_string().string());
|
||||
int chunk_index = 0;
|
||||
for (int blob_index = 0; blob_index < length_; ++blob_index) {
|
||||
if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) {
|
||||
TBLOB* tblob = word->blobs[chunk_index];
|
||||
ScriptPositionOf(true, *unicharset_, tblob->bounding_box(),
|
||||
unichar_id(blob_index));
|
||||
}
|
||||
chunk_index += state_ != NULL ? state_[blob_index] : 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Sets the script_pos_ member from some source positions with a given length.
|
||||
void WERD_CHOICE::SetScriptPositions(const tesseract::ScriptPos* positions,
|
||||
int length) {
|
||||
ASSERT_HOST(length == length_);
|
||||
if (positions != script_pos_) {
|
||||
delete [] script_pos_;
|
||||
script_pos_ = new ScriptPos[length];
|
||||
memcpy(script_pos_, positions, sizeof(positions[0]) * length);
|
||||
}
|
||||
}
|
||||
// Sets all the script_pos_ positions to the given position.
|
||||
void WERD_CHOICE::SetAllScriptPositions(tesseract::ScriptPos position) {
|
||||
for (int i = 0; i < length_; ++i)
|
||||
script_pos_[i] = position;
|
||||
}
|
||||
|
||||
/* static */
|
||||
ScriptPos WERD_CHOICE::ScriptPositionOf(bool print_debug,
|
||||
const UNICHARSET& unicharset,
|
||||
const TBOX& blob_box,
|
||||
UNICHAR_ID unichar_id) {
|
||||
ScriptPos retval = tesseract::SP_NORMAL;
|
||||
int top = blob_box.top();
|
||||
int bottom = blob_box.bottom();
|
||||
int min_bottom, max_bottom, min_top, max_top;
|
||||
unicharset.get_top_bottom(unichar_id,
|
||||
&min_bottom, &max_bottom,
|
||||
&min_top, &max_top);
|
||||
|
||||
int sub_thresh_top = min_top - kMinSubscriptOffset;
|
||||
int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset;
|
||||
int sup_thresh_bot = max_bottom + kMinSuperscriptOffset;
|
||||
if (bottom <= kMaxDropCapBottom) {
|
||||
retval = tesseract::SP_DROPCAP;
|
||||
} else if (top < sub_thresh_top && bottom < sub_thresh_bot) {
|
||||
retval = tesseract::SP_SUBSCRIPT;
|
||||
} else if (bottom > sup_thresh_bot) {
|
||||
retval = tesseract::SP_SUPERSCRIPT;
|
||||
}
|
||||
|
||||
if (print_debug) {
|
||||
const char *pos = ScriptPosToString(retval);
|
||||
tprintf("%s Character %s[bot:%d top: %d] "
|
||||
"bot_range[%d,%d] top_range[%d, %d] "
|
||||
"sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n",
|
||||
pos, unicharset.id_to_unichar(unichar_id),
|
||||
bottom, top,
|
||||
min_bottom, max_bottom, min_top, max_top,
|
||||
sub_thresh_bot, sub_thresh_top,
|
||||
sup_thresh_bot);
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
// Returns the script-id (eg Han) of the dominant script in the word.
|
||||
int WERD_CHOICE::GetTopScriptID() const {
|
||||
int max_script = unicharset_->get_script_table_size();
|
||||
int *sid = new int[max_script];
|
||||
int x;
|
||||
for (x = 0; x < max_script; x++) sid[x] = 0;
|
||||
for (x = 0; x < length_; ++x) {
|
||||
int script_id = unicharset_->get_script(unichar_id(x));
|
||||
sid[script_id]++;
|
||||
}
|
||||
if (unicharset_->han_sid() != unicharset_->null_sid()) {
|
||||
// Add the Hiragana & Katakana counts to Han and zero them out.
|
||||
if (unicharset_->hiragana_sid() != unicharset_->null_sid()) {
|
||||
sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()];
|
||||
sid[unicharset_->hiragana_sid()] = 0;
|
||||
}
|
||||
if (unicharset_->katakana_sid() != unicharset_->null_sid()) {
|
||||
sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()];
|
||||
sid[unicharset_->katakana_sid()] = 0;
|
||||
}
|
||||
}
|
||||
// Note that high script ID overrides lower one on a tie, thus biasing
|
||||
// towards non-Common script (if sorted that way in unicharset file).
|
||||
int max_sid = 0;
|
||||
for (x = 1; x < max_script; x++)
|
||||
if (sid[x] >= sid[max_sid]) max_sid = x;
|
||||
if (sid[max_sid] < length_ / 2)
|
||||
max_sid = unicharset_->null_sid();
|
||||
delete[] sid;
|
||||
return max_sid;
|
||||
}
|
||||
|
||||
// Fixes the state_ for a chop at the given blob_posiiton.
|
||||
void WERD_CHOICE::UpdateStateForSplit(int blob_position) {
|
||||
int total_chunks = 0;
|
||||
for (int i = 0; i < length_; ++i) {
|
||||
total_chunks += state_[i];
|
||||
if (total_chunks > blob_position) {
|
||||
++state_[i];
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the sum of all the state elements, being the total number of blobs.
|
||||
int WERD_CHOICE::TotalOfStates() const {
|
||||
int total_chunks = 0;
|
||||
for (int i = 0; i < length_; ++i) {
|
||||
total_chunks += state_[i];
|
||||
}
|
||||
return total_chunks;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -453,32 +702,87 @@ void WERD_CHOICE::delete_blob_choices() {
|
||||
*
|
||||
* Print WERD_CHOICE to stdout.
|
||||
*/
|
||||
const void WERD_CHOICE::print(const char *msg) const {
|
||||
tprintf("%s WERD_CHOICE:\n", msg);
|
||||
tprintf("length_ %d reserved_ %d permuter_ %d\n",
|
||||
length_, reserved_, permuter_);
|
||||
tprintf("rating_ %.4f certainty_ %.4f", rating_, certainty_);
|
||||
if (fragment_mark_) {
|
||||
tprintf(" fragment_mark_ true");
|
||||
void WERD_CHOICE::print(const char *msg) const {
|
||||
tprintf("%s : ", msg);
|
||||
for (int i = 0; i < length_; ++i) {
|
||||
tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i]));
|
||||
}
|
||||
tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n",
|
||||
rating_, certainty_, adjust_factor_, permuter_,
|
||||
min_x_height_, max_x_height_, dangerous_ambig_found_);
|
||||
tprintf("pos");
|
||||
for (int i = 0; i < length_; ++i) {
|
||||
tprintf("\t%s", ScriptPosToString(script_pos_[i]));
|
||||
}
|
||||
tprintf("\nstr");
|
||||
for (int i = 0; i < length_; ++i) {
|
||||
tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i]));
|
||||
}
|
||||
tprintf("\nstate:");
|
||||
for (int i = 0; i < length_; ++i) {
|
||||
tprintf("\t%d ", state_[i]);
|
||||
}
|
||||
tprintf("\nC");
|
||||
for (int i = 0; i < length_; ++i) {
|
||||
tprintf("\t%.3f", certainties_[i]);
|
||||
}
|
||||
tprintf("\n");
|
||||
if (unichar_string_.length() > 0) {
|
||||
tprintf("unichar_string_ %s unichar_lengths_ %s\n",
|
||||
unichar_string_.string(), unichar_lengths_.string());
|
||||
}
|
||||
tprintf("unichar_ids: ");
|
||||
int i;
|
||||
for (i = 0; i < length_; ++i) {
|
||||
tprintf("%d ", unichar_ids_[i]);
|
||||
}
|
||||
tprintf("\nfragment_lengths_: ");
|
||||
for (i = 0; i < length_; ++i) {
|
||||
tprintf("%d ", fragment_lengths_[i]);
|
||||
}
|
||||
tprintf("\n");
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
// Prints the segmentation state with an introductory message.
|
||||
void WERD_CHOICE::print_state(const char *msg) const {
|
||||
tprintf("%s", msg);
|
||||
for (int i = 0; i < length_; ++i)
|
||||
tprintf(" %d", state_[i]);
|
||||
tprintf("\n");
|
||||
}
|
||||
|
||||
// Displays the segmentation state of *this (if not the same as the last
|
||||
// one displayed) and waits for a click in the window.
|
||||
void WERD_CHOICE::DisplaySegmentation(TWERD* word) {
|
||||
#ifndef GRAPHICS_DISABLED
|
||||
// Number of different colors to draw with.
|
||||
const int kNumColors = 6;
|
||||
static ScrollView *segm_window = NULL;
|
||||
// Check the state against the static prev_drawn_state.
|
||||
static GenericVector<int> prev_drawn_state;
|
||||
bool already_done = prev_drawn_state.size() == length_;
|
||||
if (!already_done) prev_drawn_state.init_to_size(length_, 0);
|
||||
for (int i = 0; i < length_; ++i) {
|
||||
if (prev_drawn_state[i] != state_[i]) {
|
||||
already_done = false;
|
||||
}
|
||||
prev_drawn_state[i] = state_[i];
|
||||
}
|
||||
if (already_done || word->blobs.empty()) return;
|
||||
|
||||
// Create the window if needed.
|
||||
if (segm_window == NULL) {
|
||||
segm_window = new ScrollView("Segmentation", 5, 10, 500, 256,
|
||||
2000.0, 256.0, true);
|
||||
} else {
|
||||
segm_window->Clear();
|
||||
}
|
||||
|
||||
TBOX bbox;
|
||||
int blob_index = 0;
|
||||
for (int c = 0; c < length_; ++c) {
|
||||
ScrollView::Color color =
|
||||
static_cast<ScrollView::Color>(c % kNumColors + 3);
|
||||
for (int i = 0; i < state_[c]; ++i, ++blob_index) {
|
||||
TBLOB* blob = word->blobs[blob_index];
|
||||
bbox += blob->bounding_box();
|
||||
blob->plot(segm_window, color, color);
|
||||
}
|
||||
}
|
||||
segm_window->ZoomToRectangle(bbox.left(), bbox.top(),
|
||||
bbox.right(), bbox.bottom());
|
||||
segm_window->Update();
|
||||
window_wait(segm_window);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1,
|
||||
const WERD_CHOICE &word2) {
|
||||
const UNICHARSET *uchset = word1.unicharset();
|
||||
@ -526,114 +830,3 @@ void print_ratings_list(const char *msg,
|
||||
tprintf("\n");
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
/**
|
||||
* print_ratings_list
|
||||
*
|
||||
* Print ratings list (unichar ids only).
|
||||
*/
|
||||
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings) {
|
||||
if (ratings->length() == 0) {
|
||||
tprintf("%s:<none>\n", msg);
|
||||
return;
|
||||
}
|
||||
if (*msg != '\0') {
|
||||
tprintf("%s\n", msg);
|
||||
}
|
||||
BLOB_CHOICE_IT c_it;
|
||||
c_it.set_to_list(ratings);
|
||||
for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
|
||||
c_it.data()->print(NULL);
|
||||
if (!c_it.at_last()) tprintf("\n");
|
||||
}
|
||||
tprintf("\n");
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
/**
|
||||
* print_ratings_info
|
||||
*
|
||||
* Send all the ratings out to the logfile.
|
||||
*
|
||||
* @param fp file to use
|
||||
* @param ratings list of results
|
||||
* @param current_unicharset unicharset that can be used
|
||||
* for id-to-unichar conversion
|
||||
*/
|
||||
void print_ratings_info(FILE *fp,
|
||||
BLOB_CHOICE_LIST *ratings,
|
||||
const UNICHARSET ¤t_unicharset) {
|
||||
inT32 index; // to list
|
||||
const char* first_char = NULL; // character
|
||||
FLOAT32 first_rat; // rating
|
||||
FLOAT32 first_cert; // certainty
|
||||
const char* sec_char = NULL; // character
|
||||
FLOAT32 sec_rat = 0.0f; // rating
|
||||
FLOAT32 sec_cert = 0.0f; // certainty
|
||||
BLOB_CHOICE_IT c_it = ratings; // iterator
|
||||
|
||||
index = ratings->length();
|
||||
if (index > 0) {
|
||||
first_char = current_unicharset.id_to_unichar(c_it.data()->unichar_id());
|
||||
first_rat = c_it.data()->rating();
|
||||
first_cert = -c_it.data()->certainty();
|
||||
if (index > 1) {
|
||||
sec_char = current_unicharset.id_to_unichar(
|
||||
c_it.data_relative(1)->unichar_id());
|
||||
sec_rat = c_it.data_relative(1)->rating();
|
||||
sec_cert = -c_it.data_relative(1)->certainty();
|
||||
} else {
|
||||
sec_char = NULL;
|
||||
sec_rat = -1;
|
||||
sec_cert = -1;
|
||||
}
|
||||
} else {
|
||||
first_char = NULL;
|
||||
first_rat = -1;
|
||||
first_cert = -1;
|
||||
}
|
||||
if (first_char != NULL && (*first_char == '\0' || *first_char == ' '))
|
||||
first_char = NULL;
|
||||
if (sec_char != NULL && (*sec_char == '\0' || *sec_char == ' '))
|
||||
sec_char = NULL;
|
||||
tprintf(" " INT32FORMAT " %s %g %g %s %g %g\n",
|
||||
ratings->length(),
|
||||
first_char != NULL ? first_char : "~",
|
||||
first_rat, first_cert, sec_char != NULL ? sec_char : "~",
|
||||
sec_rat, sec_cert);
|
||||
}
|
||||
|
||||
/**
|
||||
* print_char_choices_list
|
||||
*/
|
||||
void print_char_choices_list(const char *msg,
|
||||
const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||||
const UNICHARSET ¤t_unicharset,
|
||||
BOOL8 detailed) {
|
||||
if (*msg != '\0') tprintf("%s\n", msg);
|
||||
for (int x = 0; x < char_choices.length(); ++x) {
|
||||
BLOB_CHOICE_IT c_it;
|
||||
c_it.set_to_list(char_choices.get(x));
|
||||
tprintf("\nchar[%d]: %s\n", x,
|
||||
current_unicharset.debug_str( c_it.data()->unichar_id()).string());
|
||||
if (detailed)
|
||||
print_ratings_list("", char_choices.get(x), current_unicharset);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* print_word_alternates_list
|
||||
*/
|
||||
void print_word_alternates_list(
|
||||
WERD_CHOICE *word,
|
||||
GenericVector<WERD_CHOICE *> *alternates) {
|
||||
if (!word || !alternates) return;
|
||||
|
||||
STRING alternates_str;
|
||||
for (int i = 0; i < alternates->size(); i++) {
|
||||
if (i > 0) alternates_str += "\", \"";
|
||||
alternates_str += alternates->get(i)->unichar_string();
|
||||
}
|
||||
tprintf("Alternates for \"%s\": {\"%s\"}\n",
|
||||
word->unichar_string().string(), alternates_str.string());
|
||||
}
|
||||
|
@ -23,11 +23,27 @@
|
||||
#include <assert.h>
|
||||
|
||||
#include "clst.h"
|
||||
#include "elst.h"
|
||||
#include "genericvector.h"
|
||||
#include "matrix.h"
|
||||
#include "unichar.h"
|
||||
#include "unicharset.h"
|
||||
#include "werd.h"
|
||||
|
||||
class MATRIX;
|
||||
class TBLOB;
|
||||
class TWERD;
|
||||
|
||||
// Enum to describe the source of a BLOB_CHOICE to make it possible to determine
|
||||
// whether a blob has been classified by inspecting the BLOB_CHOICEs.
|
||||
enum BlobChoiceClassifier {
|
||||
BCC_STATIC_CLASSIFIER, // From the char_norm classifier.
|
||||
BCC_ADAPTED_CLASSIFIER, // From the adaptive classifier.
|
||||
BCC_SPECKLE_CLASSIFIER, // Backup for failed classification.
|
||||
BCC_AMBIG, // Generated by ambiguity detection.
|
||||
BCC_FAKE, // From some other process.
|
||||
};
|
||||
|
||||
class BLOB_CHOICE: public ELIST_LINK
|
||||
{
|
||||
public:
|
||||
@ -38,20 +54,23 @@ class BLOB_CHOICE: public ELIST_LINK
|
||||
rating_ = MAX_FLOAT32;
|
||||
certainty_ = -MAX_FLOAT32;
|
||||
script_id_ = -1;
|
||||
language_model_state_ = NULL;
|
||||
min_xheight_ = 0;
|
||||
max_xheight_ = 0;
|
||||
adapted_ = false;
|
||||
xgap_before_ = 0;
|
||||
xgap_after_ = 0;
|
||||
min_xheight_ = 0.0f;
|
||||
max_xheight_ = 0.0f;
|
||||
yshift_ = 0.0f;
|
||||
classifier_ = BCC_FAKE;
|
||||
}
|
||||
BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
|
||||
float src_rating, // rating
|
||||
float src_cert, // certainty
|
||||
inT16 src_fontinfo_id, // font
|
||||
inT16 src_fontinfo_id2, // 2nd choice font
|
||||
inT16 src_fontinfo_id, // font
|
||||
inT16 src_fontinfo_id2, // 2nd choice font
|
||||
int script_id, // script
|
||||
inT16 min_xheight, // min xheight in image pixel units
|
||||
inT16 max_xheight, // max xheight allowed by this char
|
||||
bool adapted); // adapted match or not
|
||||
float min_xheight, // min xheight in image pixel units
|
||||
float max_xheight, // max xheight allowed by this char
|
||||
float yshift, // the larger of y shift (top or bottom)
|
||||
BlobChoiceClassifier c); // adapted match or other
|
||||
BLOB_CHOICE(const BLOB_CHOICE &other);
|
||||
~BLOB_CHOICE() {}
|
||||
|
||||
@ -73,8 +92,8 @@ class BLOB_CHOICE: public ELIST_LINK
|
||||
int script_id() const {
|
||||
return script_id_;
|
||||
}
|
||||
void *language_model_state() {
|
||||
return language_model_state_;
|
||||
const MATRIX_COORD& matrix_cell() {
|
||||
return matrix_cell_;
|
||||
}
|
||||
inT16 xgap_before() const {
|
||||
return xgap_before_;
|
||||
@ -82,14 +101,25 @@ class BLOB_CHOICE: public ELIST_LINK
|
||||
inT16 xgap_after() const {
|
||||
return xgap_after_;
|
||||
}
|
||||
inT16 min_xheight() const {
|
||||
float min_xheight() const {
|
||||
return min_xheight_;
|
||||
}
|
||||
inT16 max_xheight() const {
|
||||
float max_xheight() const {
|
||||
return max_xheight_;
|
||||
}
|
||||
bool adapted() const {
|
||||
return adapted_;
|
||||
float yshift() const {
|
||||
return yshift_;
|
||||
}
|
||||
BlobChoiceClassifier classifier() const {
|
||||
return classifier_;
|
||||
}
|
||||
bool IsAdapted() const {
|
||||
return classifier_ == BCC_ADAPTED_CLASSIFIER;
|
||||
}
|
||||
bool IsClassified() const {
|
||||
return classifier_ == BCC_STATIC_CLASSIFIER ||
|
||||
classifier_ == BCC_ADAPTED_CLASSIFIER ||
|
||||
classifier_ == BCC_SPECKLE_CLASSIFIER;
|
||||
}
|
||||
|
||||
void set_unichar_id(UNICHAR_ID newunichar_id) {
|
||||
@ -110,8 +140,9 @@ class BLOB_CHOICE: public ELIST_LINK
|
||||
void set_script(int newscript_id) {
|
||||
script_id_ = newscript_id;
|
||||
}
|
||||
void set_language_model_state(void *language_model_state) {
|
||||
language_model_state_ = language_model_state;
|
||||
void set_matrix_cell(int col, int row) {
|
||||
matrix_cell_.col = col;
|
||||
matrix_cell_.row = row;
|
||||
}
|
||||
void set_xgap_before(inT16 gap) {
|
||||
xgap_before_ = gap;
|
||||
@ -119,19 +150,39 @@ class BLOB_CHOICE: public ELIST_LINK
|
||||
void set_xgap_after(inT16 gap) {
|
||||
xgap_after_ = gap;
|
||||
}
|
||||
void set_adapted(bool adapted) {
|
||||
adapted_ = adapted;
|
||||
void set_classifier(BlobChoiceClassifier classifier) {
|
||||
classifier_ = classifier;
|
||||
}
|
||||
static BLOB_CHOICE* deep_copy(const BLOB_CHOICE* src) {
|
||||
BLOB_CHOICE* choice = new BLOB_CHOICE;
|
||||
*choice = *src;
|
||||
return choice;
|
||||
}
|
||||
void print(const UNICHARSET *unicharset) {
|
||||
tprintf("r%.2f c%.2f : %d %s", rating_, certainty_, unichar_id_,
|
||||
// Returns true if *this and other agree on the baseline and x-height
|
||||
// to within some tolerance based on a given estimate of the x-height.
|
||||
bool PosAndSizeAgree(const BLOB_CHOICE& other, float x_height,
|
||||
bool debug) const;
|
||||
|
||||
void print(const UNICHARSET *unicharset) const {
|
||||
tprintf("r%.2f c%.2f x[%g,%g]: %d %s",
|
||||
rating_, certainty_,
|
||||
min_xheight_, max_xheight_, unichar_id_,
|
||||
(unicharset == NULL) ? "" :
|
||||
unicharset->debug_str(unichar_id_).string());
|
||||
}
|
||||
void print_full() const {
|
||||
print(NULL);
|
||||
tprintf(" script=%d, font1=%d, font2=%d, yshift=%g, classifier=%d\n",
|
||||
script_id_, fontinfo_id_, fontinfo_id2_, yshift_, classifier_);
|
||||
}
|
||||
// Sort function for sorting BLOB_CHOICEs in increasing order of rating.
|
||||
static int SortByRating(const void *p1, const void *p2) {
|
||||
const BLOB_CHOICE *bc1 =
|
||||
*reinterpret_cast<const BLOB_CHOICE * const *>(p1);
|
||||
const BLOB_CHOICE *bc2 =
|
||||
*reinterpret_cast<const BLOB_CHOICE * const *>(p2);
|
||||
return (bc1->rating_ < bc2->rating_) ? -1 : 1;
|
||||
}
|
||||
|
||||
private:
|
||||
UNICHAR_ID unichar_id_; // unichar id
|
||||
@ -149,21 +200,26 @@ class BLOB_CHOICE: public ELIST_LINK
|
||||
// k is defined as above to normalize -klog p to the range [0, 1].
|
||||
float certainty_; // absolute
|
||||
int script_id_;
|
||||
// Stores language model information about this BLOB_CHOICE. Used during
|
||||
// the segmentation search for BLOB_CHOICEs in BLOB_CHOICE_LISTs that are
|
||||
// recorded in the ratings matrix.
|
||||
// The pointer is owned/managed by the segmentation search.
|
||||
void *language_model_state_;
|
||||
// Holds the position of this choice in the ratings matrix.
|
||||
// Used to location position in the matrix during path backtracking.
|
||||
MATRIX_COORD matrix_cell_;
|
||||
inT16 xgap_before_;
|
||||
inT16 xgap_after_;
|
||||
// X-height range (in image pixels) that this classification supports.
|
||||
inT16 min_xheight_;
|
||||
inT16 max_xheight_;
|
||||
bool adapted_; // true if this is a match from adapted templates
|
||||
float min_xheight_;
|
||||
float max_xheight_;
|
||||
// yshift_ - The vertical distance (in image pixels) the character is
|
||||
// shifted (up or down) from an acceptable y position.
|
||||
float yshift_;
|
||||
BlobChoiceClassifier classifier_; // What generated *this.
|
||||
};
|
||||
|
||||
// Make BLOB_CHOICE listable.
|
||||
ELISTIZEH (BLOB_CHOICE) CLISTIZEH (BLOB_CHOICE_LIST)
|
||||
ELISTIZEH(BLOB_CHOICE)
|
||||
|
||||
// Return the BLOB_CHOICE in bc_list matching a given unichar_id,
|
||||
// or NULL if there is no match.
|
||||
BLOB_CHOICE *FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list);
|
||||
|
||||
// Permuter codes used in WERD_CHOICEs.
|
||||
enum PermuterType {
|
||||
@ -180,11 +236,27 @@ enum PermuterType {
|
||||
USER_DAWG_PERM, // 10
|
||||
FREQ_DAWG_PERM, // 11
|
||||
COMPOUND_PERM, // 12
|
||||
|
||||
NUM_PERMUTER_TYPES
|
||||
};
|
||||
|
||||
class WERD_CHOICE {
|
||||
namespace tesseract {
|
||||
// ScriptPos tells whether a character is subscript, superscript or normal.
|
||||
enum ScriptPos {
|
||||
SP_NORMAL,
|
||||
SP_SUBSCRIPT,
|
||||
SP_SUPERSCRIPT,
|
||||
SP_DROPCAP
|
||||
};
|
||||
|
||||
const char *ScriptPosToString(tesseract::ScriptPos script_pos);
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
class WERD_CHOICE : public ELIST_LINK {
|
||||
public:
|
||||
static const float kBadRating;
|
||||
static const char *permuter_name(uinT8 permuter);
|
||||
|
||||
WERD_CHOICE(const UNICHARSET *unicharset)
|
||||
: unicharset_(unicharset) { this->init(8); }
|
||||
@ -213,6 +285,12 @@ class WERD_CHOICE {
|
||||
inline int length() const {
|
||||
return length_;
|
||||
}
|
||||
float adjust_factor() const {
|
||||
return adjust_factor_;
|
||||
}
|
||||
void set_adjust_factor(float factor) {
|
||||
adjust_factor_ = factor;
|
||||
}
|
||||
inline const UNICHAR_ID *unichar_ids() const {
|
||||
return unichar_ids_;
|
||||
}
|
||||
@ -220,12 +298,13 @@ class WERD_CHOICE {
|
||||
assert(index < length_);
|
||||
return unichar_ids_[index];
|
||||
}
|
||||
inline const char *fragment_lengths() const {
|
||||
return fragment_lengths_;
|
||||
inline int state(int index) const {
|
||||
return state_[index];
|
||||
}
|
||||
inline const char fragment_length(int index) const {
|
||||
assert(index < length_);
|
||||
return fragment_lengths_[index];
|
||||
tesseract::ScriptPos BlobPosition(int index) const {
|
||||
if (index < 0 || index >= length_)
|
||||
return tesseract::SP_NORMAL;
|
||||
return script_pos_[index];
|
||||
}
|
||||
inline float rating() const {
|
||||
return rating_;
|
||||
@ -233,23 +312,41 @@ class WERD_CHOICE {
|
||||
inline float certainty() const {
|
||||
return certainty_;
|
||||
}
|
||||
inline float certainty(int index) const {
|
||||
return certainties_[index];
|
||||
}
|
||||
inline float min_x_height() const {
|
||||
return min_x_height_;
|
||||
}
|
||||
inline float max_x_height() const {
|
||||
return max_x_height_;
|
||||
}
|
||||
inline void set_x_heights(float min_height, float max_height) {
|
||||
min_x_height_ = min_height;
|
||||
max_x_height_ = max_height;
|
||||
}
|
||||
inline uinT8 permuter() const {
|
||||
return permuter_;
|
||||
}
|
||||
const char *permuter_name() const;
|
||||
inline bool fragment_mark() const {
|
||||
return fragment_mark_;
|
||||
}
|
||||
inline BLOB_CHOICE_LIST_CLIST* blob_choices() {
|
||||
return blob_choices_;
|
||||
}
|
||||
// Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
|
||||
// taken from the appropriate cell in the ratings MATRIX.
|
||||
// Borrowed pointer, so do not delete.
|
||||
BLOB_CHOICE_LIST* blob_choices(int index, MATRIX* ratings) const;
|
||||
|
||||
// Returns the MATRIX_COORD corresponding to the location in the ratings
|
||||
// MATRIX for the given index into the word.
|
||||
MATRIX_COORD MatrixCoord(int index) const;
|
||||
|
||||
inline void set_unichar_id(UNICHAR_ID unichar_id, int index) {
|
||||
assert(index < length_);
|
||||
unichar_ids_[index] = unichar_id;
|
||||
}
|
||||
inline void set_fragment_length(char flen, int index) {
|
||||
assert(index < length_);
|
||||
fragment_lengths_[index] = flen;
|
||||
bool dangerous_ambig_found() const {
|
||||
return dangerous_ambig_found_;
|
||||
}
|
||||
void set_dangerous_ambig_found_(bool value) {
|
||||
dangerous_ambig_found_ = value;
|
||||
}
|
||||
inline void set_rating(float new_val) {
|
||||
rating_ = new_val;
|
||||
@ -260,9 +357,6 @@ class WERD_CHOICE {
|
||||
inline void set_permuter(uinT8 perm) {
|
||||
permuter_ = perm;
|
||||
}
|
||||
inline void set_fragment_mark(bool new_fragment_mark) {
|
||||
fragment_mark_ = new_fragment_mark;
|
||||
}
|
||||
// Note: this function should only be used if all the fields
|
||||
// are populated manually with set_* functions (rather than
|
||||
// (copy)constructors and append_* functions).
|
||||
@ -270,19 +364,24 @@ class WERD_CHOICE {
|
||||
ASSERT_HOST(reserved_ >= len);
|
||||
length_ = len;
|
||||
}
|
||||
void set_blob_choices(BLOB_CHOICE_LIST_CLIST *blob_choices);
|
||||
|
||||
/// Make more space in unichar_id_ and fragment_lengths_ arrays.
|
||||
inline void double_the_size() {
|
||||
if (reserved_ > 0) {
|
||||
unichar_ids_ = GenericVector<UNICHAR_ID>::double_the_size_memcpy(
|
||||
reserved_, unichar_ids_);
|
||||
fragment_lengths_ = GenericVector<char>::double_the_size_memcpy(
|
||||
reserved_, fragment_lengths_);
|
||||
script_pos_ = GenericVector<tesseract::ScriptPos>::double_the_size_memcpy(
|
||||
reserved_, script_pos_);
|
||||
state_ = GenericVector<int>::double_the_size_memcpy(
|
||||
reserved_, state_);
|
||||
certainties_ = GenericVector<float>::double_the_size_memcpy(
|
||||
reserved_, certainties_);
|
||||
reserved_ *= 2;
|
||||
} else {
|
||||
unichar_ids_ = new UNICHAR_ID[1];
|
||||
fragment_lengths_ = new char[1];
|
||||
script_pos_ = new tesseract::ScriptPos[1];
|
||||
state_ = new int[1];
|
||||
certainties_ = new float[1];
|
||||
reserved_ = 1;
|
||||
}
|
||||
}
|
||||
@ -293,18 +392,24 @@ class WERD_CHOICE {
|
||||
reserved_ = reserved;
|
||||
if (reserved > 0) {
|
||||
unichar_ids_ = new UNICHAR_ID[reserved];
|
||||
fragment_lengths_ = new char[reserved];
|
||||
script_pos_ = new tesseract::ScriptPos[reserved];
|
||||
state_ = new int[reserved];
|
||||
certainties_ = new float[reserved];
|
||||
} else {
|
||||
unichar_ids_ = NULL;
|
||||
fragment_lengths_ = NULL;
|
||||
script_pos_ = NULL;
|
||||
state_ = NULL;
|
||||
certainties_ = NULL;
|
||||
}
|
||||
length_ = 0;
|
||||
adjust_factor_ = 1.0f;
|
||||
rating_ = 0.0;
|
||||
certainty_ = MAX_FLOAT32;
|
||||
min_x_height_ = 0.0f;
|
||||
max_x_height_ = MAX_FLOAT32;
|
||||
permuter_ = NO_PERM;
|
||||
fragment_mark_ = false;
|
||||
blob_choices_ = NULL;
|
||||
unichars_in_script_order_ = false; // Tesseract is strict left-to-right.
|
||||
dangerous_ambig_found_ = false;
|
||||
}
|
||||
|
||||
/// Helper function to build a WERD_CHOICE from the given string,
|
||||
@ -321,34 +426,39 @@ class WERD_CHOICE {
|
||||
length_ = 0;
|
||||
rating_ = kBadRating;
|
||||
certainty_ = -MAX_FLOAT32;
|
||||
fragment_mark_ = false;
|
||||
}
|
||||
|
||||
/// This function assumes that there is enough space reserved
|
||||
/// in the WERD_CHOICE for adding another unichar.
|
||||
/// This is an efficient alternative to append_unichar_id().
|
||||
inline void append_unichar_id_space_allocated(
|
||||
UNICHAR_ID unichar_id, char fragment_length,
|
||||
UNICHAR_ID unichar_id, int blob_count,
|
||||
float rating, float certainty) {
|
||||
assert(reserved_ > length_);
|
||||
length_++;
|
||||
this->set_unichar_id(unichar_id, fragment_length,
|
||||
this->set_unichar_id(unichar_id, blob_count,
|
||||
rating, certainty, length_-1);
|
||||
}
|
||||
|
||||
void append_unichar_id(UNICHAR_ID unichar_id, char fragment_length,
|
||||
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count,
|
||||
float rating, float certainty);
|
||||
|
||||
inline void set_unichar_id(UNICHAR_ID unichar_id, char fragment_length,
|
||||
inline void set_unichar_id(UNICHAR_ID unichar_id, int blob_count,
|
||||
float rating, float certainty, int index) {
|
||||
assert(index < length_);
|
||||
unichar_ids_[index] = unichar_id;
|
||||
fragment_lengths_[index] = fragment_length;
|
||||
state_[index] = blob_count;
|
||||
certainties_[index] = certainty;
|
||||
script_pos_[index] = tesseract::SP_NORMAL;
|
||||
rating_ += rating;
|
||||
if (certainty < certainty_) {
|
||||
certainty_ = certainty;
|
||||
}
|
||||
}
|
||||
// Sets the entries for the given index from the BLOB_CHOICE, assuming
|
||||
// unit fragment lengths, but setting the state for this index to blob_count.
|
||||
void set_blob_choice(int index, int blob_count,
|
||||
const BLOB_CHOICE* blob_choice);
|
||||
|
||||
bool contains_unichar_id(UNICHAR_ID unichar_id) const;
|
||||
void remove_unichar_ids(int index, int num);
|
||||
@ -364,6 +474,11 @@ class WERD_CHOICE {
|
||||
// punctuation from the left and right.
|
||||
void punct_stripped(int *start_core, int *end_core) const;
|
||||
|
||||
// Returns the indices [start, end) containing the core of the word, stripped
|
||||
// of any superscript digits on either side. (i.e., the non-footnote part
|
||||
// of the word). There is no guarantee that the output range is non-empty.
|
||||
void GetNonSuperscriptSpan(int *start, int *end) const;
|
||||
|
||||
// Return a copy of this WERD_CHOICE with the choices [start, end).
|
||||
// The result is useful only for checking against a dictionary.
|
||||
WERD_CHOICE shallow_copy(int start, int end) const;
|
||||
@ -402,8 +517,42 @@ class WERD_CHOICE {
|
||||
this->string_and_lengths(&unichar_string_, &unichar_lengths_);
|
||||
return unichar_lengths_;
|
||||
}
|
||||
const void print() const { this->print(""); }
|
||||
const void print(const char *msg) const;
|
||||
|
||||
// Sets up the script_pos_ member using the blobs_list to get the bln
|
||||
// bounding boxes, *this to get the unichars, and this->unicharset
|
||||
// to get the target positions. If small_caps is true, sub/super are not
|
||||
// considered, but dropcaps are.
|
||||
// NOTE: blobs_list should be the chopped_word blobs. (Fully segemented.)
|
||||
void SetScriptPositions(bool small_caps, TWERD* word);
|
||||
// Sets the script_pos_ member from some source positions with a given length.
|
||||
void SetScriptPositions(const tesseract::ScriptPos* positions, int length);
|
||||
// Sets all the script_pos_ positions to the given position.
|
||||
void SetAllScriptPositions(tesseract::ScriptPos position);
|
||||
|
||||
static tesseract::ScriptPos ScriptPositionOf(bool print_debug,
|
||||
const UNICHARSET& unicharset,
|
||||
const TBOX& blob_box,
|
||||
UNICHAR_ID unichar_id);
|
||||
|
||||
// Returns the "dominant" script ID for the word. By "dominant", the script
|
||||
// must account for at least half the characters. Otherwise, it returns 0.
|
||||
// Note that for Japanese, Hiragana and Katakana are simply treated as Han.
|
||||
int GetTopScriptID() const;
|
||||
|
||||
// Fixes the state_ for a chop at the given blob_posiiton.
|
||||
void UpdateStateForSplit(int blob_position);
|
||||
|
||||
// Returns the sum of all the state elements, being the total number of blobs.
|
||||
int TotalOfStates() const;
|
||||
|
||||
void print() const { this->print(""); }
|
||||
void print(const char *msg) const;
|
||||
// Prints the segmentation state with an introductory message.
|
||||
void print_state(const char *msg) const;
|
||||
|
||||
// Displays the segmentation state of *this (if not the same as the last
|
||||
// one displayed) and waits for a click in the window.
|
||||
void DisplaySegmentation(TWERD* word);
|
||||
|
||||
WERD_CHOICE& operator+= ( // concatanate
|
||||
const WERD_CHOICE & second);// second on first
|
||||
@ -412,41 +561,55 @@ class WERD_CHOICE {
|
||||
|
||||
private:
|
||||
const UNICHARSET *unicharset_;
|
||||
// TODO(rays) Perhaps replace the multiple arrays with an array of structs?
|
||||
// unichar_ids_ is an array of classifier "results" that make up a word.
|
||||
// For each unichar_ids_[i], script_pos_[i] has the sub/super/normal position
|
||||
// of each unichar_id.
|
||||
// state_[i] indicates the number of blobs in WERD_RES::chopped_word that
|
||||
// were put together to make the classification results in the ith position
|
||||
// in unichar_ids_, and certainties_[i] is the certainty of the choice that
|
||||
// was used in this word.
|
||||
// == Change from before ==
|
||||
// Previously there was fragment_lengths_ that allowed a word to be
|
||||
// artificially composed of multiple fragment results. Since the new
|
||||
// segmentation search doesn't do fragments, treatment of fragments has
|
||||
// been moved to a lower level, augmenting the ratings matrix with the
|
||||
// combined fragments, and allowing the language-model/segmentation-search
|
||||
// to deal with only the combined unichar_ids.
|
||||
UNICHAR_ID *unichar_ids_; // unichar ids that represent the text of the word
|
||||
char *fragment_lengths_; // number of fragments in each unichar
|
||||
tesseract::ScriptPos* script_pos_; // Normal/Sub/Superscript of each unichar.
|
||||
int* state_; // Number of blobs in each unichar.
|
||||
float* certainties_; // Certainty of each unichar.
|
||||
int reserved_; // size of the above arrays
|
||||
int length_; // word length
|
||||
// Factor that was used to adjust the rating.
|
||||
float adjust_factor_;
|
||||
// Rating is the sum of the ratings of the individual blobs in the word.
|
||||
float rating_; // size related
|
||||
// certainty is the min (worst) certainty of the individual blobs in the word.
|
||||
float certainty_; // absolute
|
||||
// xheight computed from the result, or 0 if inconsistent.
|
||||
float min_x_height_;
|
||||
float max_x_height_;
|
||||
uinT8 permuter_; // permuter code
|
||||
bool fragment_mark_; // if true, indicates that this choice
|
||||
// was chosen over a better one that
|
||||
// contained a fragment
|
||||
BLOB_CHOICE_LIST_CLIST *blob_choices_; // best choices for each blob
|
||||
|
||||
// Normally, the blob_choices_ represent the recognition results in order
|
||||
// Normally, the ratings_ matrix represents the recognition results in order
|
||||
// from left-to-right. However, some engines (say Cube) may return
|
||||
// recognition results in the order of the script's major reading direction
|
||||
// (for Arabic, that is right-to-left).
|
||||
bool unichars_in_script_order_;
|
||||
// True if NoDangerousAmbig found an ambiguity.
|
||||
bool dangerous_ambig_found_;
|
||||
|
||||
// The following variables are populated and passed by reference any
|
||||
// time unichar_string() or unichar_lengths() are called.
|
||||
mutable STRING unichar_string_;
|
||||
mutable STRING unichar_lengths_;
|
||||
|
||||
bool unichar_info_present;
|
||||
|
||||
private:
|
||||
void delete_blob_choices();
|
||||
};
|
||||
|
||||
// Make WERD_CHOICE listable.
|
||||
ELISTIZEH (WERD_CHOICE)
|
||||
ELISTIZEH(WERD_CHOICE)
|
||||
typedef GenericVector<BLOB_CHOICE_LIST *> BLOB_CHOICE_LIST_VECTOR;
|
||||
typedef GenericVector<WERD_CHOICE_LIST *> WERD_CHOICE_LIST_VECTOR;
|
||||
|
||||
// Utilities for comparing WERD_CHOICEs
|
||||
|
||||
@ -454,27 +617,11 @@ bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1,
|
||||
const WERD_CHOICE &word2);
|
||||
|
||||
// Utilities for debug printing.
|
||||
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings);
|
||||
void print_ratings_list(
|
||||
const char *msg, // intro message
|
||||
BLOB_CHOICE_LIST *ratings, // list of results
|
||||
const UNICHARSET ¤t_unicharset // unicharset that can be used
|
||||
// for id-to-unichar conversion
|
||||
);
|
||||
void print_ratings_info(
|
||||
FILE *fp, // file to use
|
||||
BLOB_CHOICE_LIST *ratings, // list of results
|
||||
const UNICHARSET ¤t_unicharset // unicharset that can be used
|
||||
// for id-to-unichar conversion
|
||||
);
|
||||
void print_char_choices_list(
|
||||
const char *msg,
|
||||
const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
||||
const UNICHARSET ¤t_unicharset,
|
||||
BOOL8 detailed
|
||||
);
|
||||
void print_word_alternates_list(
|
||||
WERD_CHOICE *word,
|
||||
GenericVector<WERD_CHOICE *> *alternates);
|
||||
|
||||
#endif
|
||||
|
@ -171,6 +171,16 @@ void TBOX::plot( //paint box
|
||||
}
|
||||
#endif
|
||||
|
||||
// Appends the bounding box as (%d,%d)->(%d,%d) to a STRING.
|
||||
void TBOX::print_to_str(STRING *str) const {
|
||||
// "(%d,%d)->(%d,%d)", left(), bottom(), right(), top()
|
||||
str->add_str_int("(", left());
|
||||
str->add_str_int(",", bottom());
|
||||
str->add_str_int(")->(", right());
|
||||
str->add_str_int(",", top());
|
||||
*str += ')';
|
||||
}
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
bool TBOX::Serialize(FILE* fp) const {
|
||||
if (!bot_left.Serialize(fp)) return false;
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include "points.h"
|
||||
#include "ndminx.h"
|
||||
#include "scrollview.h"
|
||||
#include "strngs.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
class DLLSYM TBOX { // bounding box
|
||||
@ -264,15 +265,8 @@ class DLLSYM TBOX { // bounding box
|
||||
tprintf("Bounding box=(%d,%d)->(%d,%d)\n",
|
||||
left(), bottom(), right(), top());
|
||||
}
|
||||
|
||||
// Same as print(), but appends debug information to the given string
|
||||
// instead of printing it to stdout.
|
||||
void append_debug(STRING *str) const {
|
||||
char buffer[256];
|
||||
sprintf(buffer, "Bounding box=(%d,%d)->(%d,%d)\n",
|
||||
left(), bottom(), right(), top());
|
||||
*str += buffer;
|
||||
}
|
||||
// Appends the bounding box as (%d,%d)->(%d,%d) to a STRING.
|
||||
void print_to_str(STRING *str) const;
|
||||
|
||||
#ifndef GRAPHICS_DISABLED
|
||||
void plot( // use current settings
|
||||
|
@ -27,8 +27,8 @@
|
||||
----------------------------------------------------------------------*/
|
||||
#include "seam.h"
|
||||
#include "blobs.h"
|
||||
#include "callcpp.h"
|
||||
#include "structures.h"
|
||||
#include "freelist.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
#ifdef __UNIX__
|
||||
#include <assert.h>
|
||||
@ -38,7 +38,6 @@
|
||||
V a r i a b l e s
|
||||
----------------------------------------------------------------------*/
|
||||
#define NUM_STARTING_SEAMS 20
|
||||
makestructure(newseam, free_seam, SEAM);
|
||||
|
||||
/*----------------------------------------------------------------------
|
||||
Public Function Code
|
||||
@ -66,7 +65,7 @@ bool point_in_split(SPLIT *split, EDGEPT *point1, EDGEPT *point2) {
|
||||
* seam.
|
||||
* @returns TRUE if one of them is.
|
||||
*/
|
||||
bool point_in_seam(SEAM *seam, SPLIT *split) {
|
||||
bool point_in_seam(const SEAM *seam, SPLIT *split) {
|
||||
return (point_in_split(seam->split1, split->point1, split->point2) ||
|
||||
point_in_split(seam->split2, split->point1, split->point2) ||
|
||||
point_in_split(seam->split3, split->point1, split->point2));
|
||||
@ -96,16 +95,6 @@ bool point_used_by_seam(SEAM *seam, EDGEPT *point) {
|
||||
point_used_by_split(seam->split3, point);
|
||||
}
|
||||
|
||||
/**
|
||||
* @name add_seam
|
||||
*
|
||||
* Add another seam to a collection of seams.
|
||||
*/
|
||||
SEAMS add_seam(SEAMS seam_list, SEAM *seam) {
|
||||
return (array_push (seam_list, seam));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @name combine_seam
|
||||
*
|
||||
@ -126,7 +115,8 @@ void combine_seams(SEAM *dest_seam, SEAM *source_seam) {
|
||||
else if (!dest_seam->split3)
|
||||
dest_seam->split3 = source_seam->split1;
|
||||
else
|
||||
cprintf("combine_seam: Seam is too crowded, can't be combined !\n");
|
||||
delete source_seam->split1; // Wouldn't have fitted.
|
||||
source_seam->split1 = NULL;
|
||||
}
|
||||
if (source_seam->split2) {
|
||||
if (!dest_seam->split2)
|
||||
@ -134,35 +124,17 @@ void combine_seams(SEAM *dest_seam, SEAM *source_seam) {
|
||||
else if (!dest_seam->split3)
|
||||
dest_seam->split3 = source_seam->split2;
|
||||
else
|
||||
cprintf("combine_seam: Seam is too crowded, can't be combined !\n");
|
||||
delete source_seam->split2; // Wouldn't have fitted.
|
||||
source_seam->split2 = NULL;
|
||||
}
|
||||
if (source_seam->split3) {
|
||||
if (!dest_seam->split3)
|
||||
dest_seam->split3 = source_seam->split3;
|
||||
else
|
||||
cprintf("combine_seam: Seam is too crowded, can't be combined !\n");
|
||||
}
|
||||
free_seam(source_seam);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @name delete_seam
|
||||
*
|
||||
* Free this seam record and the splits that are attached to it.
|
||||
*/
|
||||
void delete_seam(void *arg) { //SEAM *seam)
|
||||
SEAM *seam = (SEAM *) arg;
|
||||
|
||||
if (seam) {
|
||||
if (seam->split1)
|
||||
delete_split(seam->split1);
|
||||
if (seam->split2)
|
||||
delete_split(seam->split2);
|
||||
if (seam->split3)
|
||||
delete_split(seam->split3);
|
||||
free_seam(seam);
|
||||
delete source_seam->split3; // Wouldn't have fitted.
|
||||
source_seam->split3 = NULL;
|
||||
}
|
||||
delete source_seam;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -172,36 +144,17 @@ void delete_seam(void *arg) { //SEAM *seam)
|
||||
* present in the starting segmentation. Each of the seams created
|
||||
* by this routine have location information only.
|
||||
*/
|
||||
SEAMS start_seam_list(TBLOB *blobs) {
|
||||
TBLOB *blob;
|
||||
SEAMS seam_list;
|
||||
void start_seam_list(TWERD *word, GenericVector<SEAM*>* seam_array) {
|
||||
seam_array->truncate(0);
|
||||
TPOINT location;
|
||||
/* Seam slot per char */
|
||||
seam_list = new_seam_list ();
|
||||
|
||||
for (blob = blobs; blob->next != NULL; blob = blob->next) {
|
||||
TBOX bbox = blob->bounding_box();
|
||||
TBOX nbox = blob->next->bounding_box();
|
||||
for (int b = 1; b < word->NumBlobs(); ++b) {
|
||||
TBOX bbox = word->blobs[b - 1]->bounding_box();
|
||||
TBOX nbox = word->blobs[b]->bounding_box();
|
||||
location.x = (bbox.right() + nbox.left()) / 2;
|
||||
location.y = (bbox.bottom() + bbox.top() + nbox.bottom() + nbox.top()) / 4;
|
||||
seam_list = add_seam(seam_list,
|
||||
new_seam(0.0, location, NULL, NULL, NULL));
|
||||
seam_array->push_back(new SEAM(0.0f, location, NULL, NULL, NULL));
|
||||
}
|
||||
|
||||
return seam_list;
|
||||
}
|
||||
|
||||
/**
|
||||
* @name free_seam_list
|
||||
*
|
||||
* Free all the seams that have been allocated in this list. Reclaim
|
||||
* the memory for each of the splits as well.
|
||||
*/
|
||||
void free_seam_list(SEAMS seam_list) {
|
||||
int x;
|
||||
|
||||
array_loop(seam_list, x) delete_seam(array_value (seam_list, x));
|
||||
array_free(seam_list);
|
||||
}
|
||||
|
||||
|
||||
@ -210,32 +163,26 @@ void free_seam_list(SEAMS seam_list) {
|
||||
*
|
||||
* @returns true if insert_seam will succeed.
|
||||
*/
|
||||
bool test_insert_seam(SEAMS seam_list,
|
||||
int index,
|
||||
TBLOB *left_blob,
|
||||
TBLOB *first_blob) {
|
||||
bool test_insert_seam(const GenericVector<SEAM*>& seam_array,
|
||||
TWERD *word, int index) {
|
||||
SEAM *test_seam;
|
||||
TBLOB *blob;
|
||||
int test_index;
|
||||
int list_length;
|
||||
|
||||
list_length = array_count (seam_list);
|
||||
for (test_index=0, blob=first_blob->next;
|
||||
test_index < index;
|
||||
test_index++, blob=blob->next) {
|
||||
test_seam = (SEAM *) array_value(seam_list, test_index);
|
||||
list_length = seam_array.size();
|
||||
for (int test_index = 0; test_index < index; ++test_index) {
|
||||
test_seam = seam_array[test_index];
|
||||
if (test_index + test_seam->widthp < index &&
|
||||
test_seam->widthp + test_index == index - 1 &&
|
||||
account_splits_right(test_seam, blob) < 0)
|
||||
account_splits(test_seam, word, test_index + 1, 1) < 0)
|
||||
return false;
|
||||
}
|
||||
for (test_index=index, blob=left_blob->next;
|
||||
test_index < list_length;
|
||||
test_index++, blob=blob->next) {
|
||||
test_seam = (SEAM *) array_value(seam_list, test_index);
|
||||
for (int test_index = index; test_index < list_length; test_index++) {
|
||||
test_seam = seam_array[test_index];
|
||||
if (test_index - test_seam->widthn >= index &&
|
||||
test_index - test_seam->widthn == index &&
|
||||
account_splits_left(test_seam, first_blob, blob) < 0)
|
||||
account_splits(test_seam, word, test_index + 1, -1) < 0)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
@ -247,58 +194,51 @@ bool test_insert_seam(SEAMS seam_list,
|
||||
* Add another seam to a collection of seams at a particular location
|
||||
* in the seam array.
|
||||
*/
|
||||
SEAMS insert_seam(SEAMS seam_list,
|
||||
int index,
|
||||
SEAM *seam,
|
||||
TBLOB *left_blob,
|
||||
TBLOB *first_blob) {
|
||||
void insert_seam(const TWERD* word, int index, SEAM *seam,
|
||||
GenericVector<SEAM*>* seam_array) {
|
||||
SEAM *test_seam;
|
||||
TBLOB *blob;
|
||||
int test_index;
|
||||
int list_length;
|
||||
|
||||
list_length = array_count(seam_list);
|
||||
for (test_index=0, blob=first_blob->next;
|
||||
test_index < index;
|
||||
test_index++, blob=blob->next) {
|
||||
test_seam = (SEAM *) array_value(seam_list, test_index);
|
||||
list_length = seam_array->size();
|
||||
for (int test_index = 0; test_index < index; ++test_index) {
|
||||
test_seam = seam_array->get(test_index);
|
||||
if (test_index + test_seam->widthp >= index) {
|
||||
test_seam->widthp++; /*got in the way */
|
||||
} else if (test_seam->widthp + test_index == index - 1) {
|
||||
test_seam->widthp = account_splits_right(test_seam, blob);
|
||||
test_seam->widthp = account_splits(test_seam, word, test_index + 1, 1);
|
||||
if (test_seam->widthp < 0) {
|
||||
cprintf("Failed to find any right blob for a split!\n");
|
||||
tprintf("Failed to find any right blob for a split!\n");
|
||||
print_seam("New dud seam", seam);
|
||||
print_seam("Failed seam", test_seam);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (test_index=index, blob=left_blob->next;
|
||||
test_index < list_length;
|
||||
test_index++, blob=blob->next) {
|
||||
test_seam = (SEAM *) array_value(seam_list, test_index);
|
||||
for (int test_index = index; test_index < list_length; test_index++) {
|
||||
test_seam = seam_array->get(test_index);
|
||||
if (test_index - test_seam->widthn < index) {
|
||||
test_seam->widthn++; /*got in the way */
|
||||
} else if (test_index - test_seam->widthn == index) {
|
||||
test_seam->widthn = account_splits_left(test_seam, first_blob, blob);
|
||||
test_seam->widthn = account_splits(test_seam, word, test_index + 1, -1);
|
||||
if (test_seam->widthn < 0) {
|
||||
cprintf("Failed to find any left blob for a split!\n");
|
||||
tprintf("Failed to find any left blob for a split!\n");
|
||||
print_seam("New dud seam", seam);
|
||||
print_seam("Failed seam", test_seam);
|
||||
}
|
||||
}
|
||||
}
|
||||
return (array_insert (seam_list, index, seam));
|
||||
seam_array->insert(seam, index);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @name account_splits_right
|
||||
* @name account_splits
|
||||
*
|
||||
* Account for all the splits by looking to the right.
|
||||
* in the blob list.
|
||||
* Account for all the splits by looking to the right (blob_direction == 1),
|
||||
* or to the left (blob_direction == -1) in the word.
|
||||
*/
|
||||
int account_splits_right(SEAM *seam, TBLOB *blob) {
|
||||
int account_splits(const SEAM *seam, const TWERD *word, int blob_index,
|
||||
int blob_direction) {
|
||||
inT8 found_em[3];
|
||||
inT8 width;
|
||||
|
||||
@ -309,6 +249,7 @@ int account_splits_right(SEAM *seam, TBLOB *blob) {
|
||||
return 0;
|
||||
width = 0;
|
||||
do {
|
||||
TBLOB* blob = word->blobs[blob_index];
|
||||
if (!found_em[0])
|
||||
found_em[0] = find_split_in_blob(seam->split1, blob);
|
||||
if (!found_em[1])
|
||||
@ -319,54 +260,12 @@ int account_splits_right(SEAM *seam, TBLOB *blob) {
|
||||
return width;
|
||||
}
|
||||
width++;
|
||||
blob = blob->next;
|
||||
} while (blob != NULL);
|
||||
blob_index += blob_direction;
|
||||
} while (0 <= blob_index && blob_index < word->NumBlobs());
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @name account_splits_left
|
||||
*
|
||||
* Account for all the splits by looking to the left.
|
||||
* in the blob list.
|
||||
*/
|
||||
int account_splits_left(SEAM *seam, TBLOB *blob, TBLOB *end_blob) {
|
||||
inT32 depth = 0;
|
||||
inT8 width = 0;
|
||||
inT8 found_em[3];
|
||||
account_splits_left_helper(seam, blob, end_blob, &depth, &width, found_em);
|
||||
return width;
|
||||
}
|
||||
|
||||
void account_splits_left_helper(SEAM *seam, TBLOB *blob, TBLOB *end_blob,
|
||||
inT32 *depth, inT8 *width, inT8* found_em) {
|
||||
if (blob != end_blob) {
|
||||
(*depth)++;
|
||||
account_splits_left_helper(seam, blob->next, end_blob,
|
||||
depth, width, found_em);
|
||||
(*depth)--;
|
||||
} else {
|
||||
found_em[0] = seam->split1 == NULL;
|
||||
found_em[1] = seam->split2 == NULL;
|
||||
found_em[2] = seam->split3 == NULL;
|
||||
*width = 0;
|
||||
}
|
||||
if (!found_em[0])
|
||||
found_em[0] = find_split_in_blob(seam->split1, blob);
|
||||
if (!found_em[1])
|
||||
found_em[1] = find_split_in_blob(seam->split2, blob);
|
||||
if (!found_em[2])
|
||||
found_em[2] = find_split_in_blob(seam->split3, blob);
|
||||
if (!found_em[0] || !found_em[1] || !found_em[2]) {
|
||||
(*width)++;
|
||||
if (*depth == 0) {
|
||||
*width = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @name find_split_in_blob
|
||||
*
|
||||
@ -393,7 +292,7 @@ bool find_split_in_blob(SPLIT *split, TBLOB *blob) {
|
||||
* Merge these two seams into a new seam. Duplicate the split records
|
||||
* in both of the input seams. Return the resultant seam.
|
||||
*/
|
||||
SEAM *join_two_seams(SEAM *seam1, SEAM *seam2) {
|
||||
SEAM *join_two_seams(const SEAM *seam1, const SEAM *seam2) {
|
||||
SEAM *result = NULL;
|
||||
SEAM *temp;
|
||||
|
||||
@ -403,52 +302,13 @@ SEAM *join_two_seams(SEAM *seam1, SEAM *seam2) {
|
||||
(seam1->split2 == NULL && seam2->split3 == NULL) ||
|
||||
seam1->split1 == NULL || seam2->split1 == NULL) &&
|
||||
(!shared_split_points(seam1, seam2))) {
|
||||
clone_seam(result, seam1);
|
||||
clone_seam(temp, seam2);
|
||||
result = new SEAM(*seam1);
|
||||
temp = new SEAM(*seam2);
|
||||
combine_seams(result, temp);
|
||||
}
|
||||
return (result);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @name new_seam
|
||||
*
|
||||
* Create a structure for a "seam" between two blobs. This data
|
||||
* structure may actually hold up to three different splits.
|
||||
* Initailization of this record is done by this routine.
|
||||
*/
|
||||
SEAM *new_seam(PRIORITY priority,
|
||||
const TPOINT& location,
|
||||
SPLIT *split1,
|
||||
SPLIT *split2,
|
||||
SPLIT *split3) {
|
||||
SEAM *seam;
|
||||
|
||||
seam = newseam ();
|
||||
|
||||
seam->priority = priority;
|
||||
seam->location = location;
|
||||
seam->widthp = 0;
|
||||
seam->widthn = 0;
|
||||
seam->split1 = split1;
|
||||
seam->split2 = split2;
|
||||
seam->split3 = split3;
|
||||
|
||||
return (seam);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @name new_seam_list
|
||||
*
|
||||
* Create a collection of seam records in an array.
|
||||
*/
|
||||
SEAMS new_seam_list() {
|
||||
return (array_new (NUM_STARTING_SEAMS));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @name print_seam
|
||||
*
|
||||
@ -457,21 +317,21 @@ SEAMS new_seam_list() {
|
||||
*/
|
||||
void print_seam(const char *label, SEAM *seam) {
|
||||
if (seam) {
|
||||
cprintf(label);
|
||||
cprintf(" %6.2f @ (%d,%d), p=%d, n=%d ",
|
||||
tprintf(label);
|
||||
tprintf(" %6.2f @ (%d,%d), p=%d, n=%d ",
|
||||
seam->priority, seam->location.x, seam->location.y,
|
||||
seam->widthp, seam->widthn);
|
||||
print_split(seam->split1);
|
||||
|
||||
if (seam->split2) {
|
||||
cprintf(", ");
|
||||
tprintf(", ");
|
||||
print_split (seam->split2);
|
||||
if (seam->split3) {
|
||||
cprintf(", ");
|
||||
tprintf(", ");
|
||||
print_split (seam->split3);
|
||||
}
|
||||
}
|
||||
cprintf ("\n");
|
||||
tprintf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
@ -482,17 +342,16 @@ void print_seam(const char *label, SEAM *seam) {
|
||||
* Print a list of splits. Show the coordinates of both points in
|
||||
* each split.
|
||||
*/
|
||||
void print_seams(const char *label, SEAMS seams) {
|
||||
int x;
|
||||
void print_seams(const char *label, const GenericVector<SEAM*>& seams) {
|
||||
char number[CHARS_PER_LINE];
|
||||
|
||||
if (seams) {
|
||||
cprintf("%s\n", label);
|
||||
array_loop(seams, x) {
|
||||
if (!seams.empty()) {
|
||||
tprintf("%s\n", label);
|
||||
for (int x = 0; x < seams.size(); ++x) {
|
||||
sprintf(number, "%2d: ", x);
|
||||
print_seam(number, (SEAM *) array_value(seams, x));
|
||||
print_seam(number, seams[x]);
|
||||
}
|
||||
cprintf("\n");
|
||||
tprintf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
@ -504,7 +363,7 @@ void print_seams(const char *label, SEAMS seams) {
|
||||
* points in common. Return TRUE if any of the same points are present
|
||||
* in any of the splits of both seams.
|
||||
*/
|
||||
int shared_split_points(SEAM *seam1, SEAM *seam2) {
|
||||
int shared_split_points(const SEAM *seam1, const SEAM *seam2) {
|
||||
if (seam1 == NULL || seam2 == NULL)
|
||||
return (FALSE);
|
||||
|
||||
@ -532,23 +391,20 @@ int shared_split_points(SEAM *seam1, SEAM *seam2) {
|
||||
* Break up the blobs in this chain so that they are all independent.
|
||||
* This operation should undo the affect of join_pieces.
|
||||
**********************************************************************/
|
||||
void break_pieces(TBLOB *blobs, SEAMS seams, inT16 start, inT16 end) {
|
||||
TESSLINE *outline = blobs->outlines;
|
||||
TBLOB *next_blob;
|
||||
inT16 x;
|
||||
void break_pieces(const GenericVector<SEAM*>& seams, int first, int last,
|
||||
TWERD *word) {
|
||||
for (int x = first; x < last; ++x)
|
||||
reveal_seam(seams[x]);
|
||||
|
||||
for (x = start; x < end; x++)
|
||||
reveal_seam ((SEAM *) array_value (seams, x));
|
||||
TESSLINE *outline = word->blobs[first]->outlines;
|
||||
int next_blob = first + 1;
|
||||
|
||||
next_blob = blobs->next;
|
||||
|
||||
while (outline && next_blob) {
|
||||
if (outline->next == next_blob->outlines) {
|
||||
while (outline != NULL && next_blob <= last) {
|
||||
if (outline->next == word->blobs[next_blob]->outlines) {
|
||||
outline->next = NULL;
|
||||
outline = next_blob->outlines;
|
||||
next_blob = next_blob->next;
|
||||
}
|
||||
else {
|
||||
outline = word->blobs[next_blob]->outlines;
|
||||
++next_blob;
|
||||
} else {
|
||||
outline = outline->next;
|
||||
}
|
||||
}
|
||||
@ -561,30 +417,19 @@ void break_pieces(TBLOB *blobs, SEAMS seams, inT16 start, inT16 end) {
|
||||
* Join a group of base level pieces into a single blob that can then
|
||||
* be classified.
|
||||
**********************************************************************/
|
||||
void join_pieces(TBLOB *piece_blobs, SEAMS seams, inT16 start, inT16 end) {
|
||||
TBLOB *next_blob;
|
||||
TBLOB *blob;
|
||||
inT16 x;
|
||||
TESSLINE *outline;
|
||||
SEAM *seam;
|
||||
|
||||
for (x = 0, blob = piece_blobs; x < start; x++)
|
||||
blob = blob->next;
|
||||
next_blob = blob->next;
|
||||
outline = blob->outlines;
|
||||
void join_pieces(const GenericVector<SEAM*>& seams, int first, int last,
|
||||
TWERD *word) {
|
||||
TESSLINE *outline = word->blobs[first]->outlines;
|
||||
if (!outline)
|
||||
return;
|
||||
|
||||
while (x < end) {
|
||||
seam = (SEAM *) array_value (seams, x);
|
||||
if (x - seam->widthn >= start && x + seam->widthp < end)
|
||||
for (int x = first; x < last; ++x) {
|
||||
SEAM *seam = seams[x];
|
||||
if (x - seam->widthn >= first && x + seam->widthp < last)
|
||||
hide_seam(seam);
|
||||
while (outline->next)
|
||||
outline = outline->next;
|
||||
outline->next = next_blob->outlines;
|
||||
next_blob = next_blob->next;
|
||||
|
||||
x++;
|
||||
outline->next = word->blobs[x + 1]->outlines;
|
||||
}
|
||||
}
|
||||
|
||||
@ -626,7 +471,7 @@ void hide_edge_pair(EDGEPT *pt1, EDGEPT *pt2) {
|
||||
}
|
||||
while (!exact_point (edgept, pt2) && edgept != pt1);
|
||||
if (edgept == pt1) {
|
||||
/* cprintf("Hid entire outline at (%d,%d)!!\n",
|
||||
/* tprintf("Hid entire outline at (%d,%d)!!\n",
|
||||
edgept->pos.x,edgept->pos.y); */
|
||||
}
|
||||
edgept = pt2;
|
||||
@ -636,7 +481,7 @@ void hide_edge_pair(EDGEPT *pt1, EDGEPT *pt2) {
|
||||
}
|
||||
while (!exact_point (edgept, pt1) && edgept != pt2);
|
||||
if (edgept == pt2) {
|
||||
/* cprintf("Hid entire outline at (%d,%d)!!\n",
|
||||
/* tprintf("Hid entire outline at (%d,%d)!!\n",
|
||||
edgept->pos.x,edgept->pos.y); */
|
||||
}
|
||||
}
|
||||
@ -679,7 +524,7 @@ void reveal_edge_pair(EDGEPT *pt1, EDGEPT *pt2) {
|
||||
}
|
||||
while (!exact_point (edgept, pt2) && edgept != pt1);
|
||||
if (edgept == pt1) {
|
||||
/* cprintf("Hid entire outline at (%d,%d)!!\n",
|
||||
/* tprintf("Hid entire outline at (%d,%d)!!\n",
|
||||
edgept->pos.x,edgept->pos.y); */
|
||||
}
|
||||
edgept = pt2;
|
||||
@ -689,7 +534,7 @@ void reveal_edge_pair(EDGEPT *pt1, EDGEPT *pt2) {
|
||||
}
|
||||
while (!exact_point (edgept, pt1) && edgept != pt2);
|
||||
if (edgept == pt2) {
|
||||
/* cprintf("Hid entire outline at (%d,%d)!!\n",
|
||||
/* tprintf("Hid entire outline at (%d,%d)!!\n",
|
||||
edgept->pos.x,edgept->pos.y); */
|
||||
}
|
||||
}
|
||||
|
109
ccstruct/seam.h
109
ccstruct/seam.h
@ -30,15 +30,36 @@
|
||||
----------------------------------------------------------------------*/
|
||||
#include "blobs.h"
|
||||
#include "split.h"
|
||||
#include "tessarray.h"
|
||||
|
||||
/*----------------------------------------------------------------------
|
||||
T y p e s
|
||||
----------------------------------------------------------------------*/
|
||||
typedef float PRIORITY; /* PRIORITY */
|
||||
|
||||
typedef struct seam_record
|
||||
{ /* SEAM */
|
||||
struct SEAM {
|
||||
// Constructor that was formerly new_seam.
|
||||
SEAM(PRIORITY priority0, const TPOINT& location0,
|
||||
SPLIT *splita, SPLIT *splitb, SPLIT *splitc)
|
||||
: priority(priority0), widthp(0), widthn(0), location(location0),
|
||||
split1(splita), split2(splitb), split3(splitc) {}
|
||||
// Copy constructor that was formerly clone_seam.
|
||||
SEAM(const SEAM& src)
|
||||
: priority(src.priority), widthp(src.widthp), widthn(src.widthn),
|
||||
location(src.location) {
|
||||
clone_split(split1, src.split1);
|
||||
clone_split(split2, src.split2);
|
||||
clone_split(split3, src.split3);
|
||||
}
|
||||
// Destructor was delete_seam.
|
||||
~SEAM() {
|
||||
if (split1)
|
||||
delete_split(split1);
|
||||
if (split2)
|
||||
delete_split(split2);
|
||||
if (split3)
|
||||
delete_split(split3);
|
||||
}
|
||||
|
||||
PRIORITY priority;
|
||||
inT8 widthp;
|
||||
inT8 widthn;
|
||||
@ -46,36 +67,7 @@ typedef struct seam_record
|
||||
SPLIT *split1;
|
||||
SPLIT *split2;
|
||||
SPLIT *split3;
|
||||
} SEAM;
|
||||
|
||||
typedef ARRAY SEAMS; /* SEAMS */
|
||||
|
||||
extern SEAM *newseam();
|
||||
|
||||
/*----------------------------------------------------------------------
|
||||
M a c r o s
|
||||
----------------------------------------------------------------------*/
|
||||
/**
|
||||
* @name clone_seam
|
||||
*
|
||||
* Create a new seam record and copy the contents of this seam into it.
|
||||
*/
|
||||
|
||||
#define clone_seam(dest,source) \
|
||||
if (source) { \
|
||||
(dest) = newseam (); \
|
||||
(dest)->location = (source)->location; \
|
||||
(dest)->widthp = (source)->widthp; \
|
||||
(dest)->widthn = (source)->widthn; \
|
||||
(dest)->priority = (source)->priority; \
|
||||
clone_split ((dest)->split1, (source)->split1); \
|
||||
clone_split ((dest)->split2, (source)->split2); \
|
||||
clone_split ((dest)->split3, (source)->split3); \
|
||||
} \
|
||||
else { \
|
||||
(dest) = (SEAM*) NULL; \
|
||||
} \
|
||||
|
||||
};
|
||||
|
||||
/**
|
||||
* exact_point
|
||||
@ -92,61 +84,40 @@ else { \
|
||||
----------------------------------------------------------------------*/
|
||||
bool point_in_split(SPLIT *split, EDGEPT *point1, EDGEPT *point2);
|
||||
|
||||
bool point_in_seam(SEAM *seam, SPLIT *split);
|
||||
bool point_in_seam(const SEAM *seam, SPLIT *split);
|
||||
|
||||
bool point_used_by_split(SPLIT *split, EDGEPT *point);
|
||||
|
||||
bool point_used_by_seam(SEAM *seam, EDGEPT *point);
|
||||
|
||||
SEAMS add_seam(SEAMS seam_list, SEAM *seam);
|
||||
|
||||
void combine_seams(SEAM *dest_seam, SEAM *source_seam);
|
||||
|
||||
void delete_seam(void *arg); //SEAM *seam);
|
||||
void start_seam_list(TWERD *word, GenericVector<SEAM*>* seam_array);
|
||||
|
||||
SEAMS start_seam_list(TBLOB *blobs);
|
||||
bool test_insert_seam(const GenericVector<SEAM*>& seam_array,
|
||||
TWERD *word, int index);
|
||||
|
||||
void free_seam_list(SEAMS seam_list);
|
||||
void insert_seam(const TWERD *word, int index, SEAM *seam,
|
||||
GenericVector<SEAM*>* seam_array);
|
||||
|
||||
bool test_insert_seam(SEAMS seam_list,
|
||||
int index,
|
||||
TBLOB *left_blob,
|
||||
TBLOB *first_blob);
|
||||
|
||||
SEAMS insert_seam(SEAMS seam_list,
|
||||
int index,
|
||||
SEAM *seam,
|
||||
TBLOB *left_blob,
|
||||
TBLOB *first_blob);
|
||||
|
||||
int account_splits_right(SEAM *seam, TBLOB *blob);
|
||||
|
||||
int account_splits_left(SEAM *seam, TBLOB *blob, TBLOB *end_blob);
|
||||
|
||||
void account_splits_left_helper(SEAM *seam, TBLOB *blob, TBLOB *end_blob,
|
||||
inT32 *depth, inT8 *width, inT8 *found_em);
|
||||
int account_splits(const SEAM *seam, const TWERD *word, int blob_index,
|
||||
int blob_direction);
|
||||
|
||||
bool find_split_in_blob(SPLIT *split, TBLOB *blob);
|
||||
|
||||
SEAM *join_two_seams(SEAM *seam1, SEAM *seam2);
|
||||
|
||||
SEAM *new_seam(PRIORITY priority,
|
||||
const TPOINT& location,
|
||||
SPLIT *split1,
|
||||
SPLIT *split2,
|
||||
SPLIT *split3);
|
||||
|
||||
SEAMS new_seam_list();
|
||||
SEAM *join_two_seams(const SEAM *seam1, const SEAM *seam2);
|
||||
|
||||
void print_seam(const char *label, SEAM *seam);
|
||||
|
||||
void print_seams(const char *label, SEAMS seams);
|
||||
void print_seams(const char *label, const GenericVector<SEAM*>& seams);
|
||||
|
||||
int shared_split_points(SEAM *seam1, SEAM *seam2);
|
||||
int shared_split_points(const SEAM *seam1, const SEAM *seam2);
|
||||
|
||||
void break_pieces(TBLOB *blobs, SEAMS seams, inT16 start, inT16 end);
|
||||
void break_pieces(const GenericVector<SEAM*>& seams,
|
||||
int first, int last, TWERD *word);
|
||||
|
||||
void join_pieces(TBLOB *piece_blobs, SEAMS seams, inT16 start, inT16 end);
|
||||
void join_pieces(const GenericVector<SEAM*>& seams,
|
||||
int first, int last, TWERD *word);
|
||||
|
||||
void hide_seam(SEAM *seam);
|
||||
|
||||
|
@ -26,8 +26,8 @@
|
||||
I n c l u d e s
|
||||
----------------------------------------------------------------------*/
|
||||
#include "split.h"
|
||||
#include "structures.h"
|
||||
#include "callcpp.h"
|
||||
#include "coutln.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
#ifdef __UNIX__
|
||||
#include <assert.h>
|
||||
@ -38,8 +38,6 @@
|
||||
----------------------------------------------------------------------*/
|
||||
BOOL_VAR(wordrec_display_splits, 0, "Display splits");
|
||||
|
||||
makestructure(newsplit, free_split, SPLIT);
|
||||
|
||||
/*----------------------------------------------------------------------
|
||||
F u n c t i o n s
|
||||
----------------------------------------------------------------------*/
|
||||
@ -47,12 +45,11 @@ makestructure(newsplit, free_split, SPLIT);
|
||||
/**********************************************************************
|
||||
* delete_split
|
||||
*
|
||||
* Remove this split from existance. Take if off the display list and
|
||||
* deallocate its memory.
|
||||
* Remove this split from existence.
|
||||
**********************************************************************/
|
||||
void delete_split(SPLIT *split) {
|
||||
if (split) {
|
||||
free_split(split);
|
||||
delete split;
|
||||
}
|
||||
}
|
||||
|
||||
@ -68,6 +65,43 @@ EDGEPT *make_edgept(int x, int y, EDGEPT *next, EDGEPT *prev) {
|
||||
this_edgept = new EDGEPT;
|
||||
this_edgept->pos.x = x;
|
||||
this_edgept->pos.y = y;
|
||||
// Now deal with the src_outline steps.
|
||||
C_OUTLINE* prev_ol = prev->src_outline;
|
||||
if (prev_ol != NULL && prev->next == next) {
|
||||
// Compute the fraction of the segment that is being cut.
|
||||
FCOORD segment_vec(next->pos.x - prev->pos.x, next->pos.y - prev->pos.y);
|
||||
FCOORD target_vec(x - prev->pos.x, y - prev->pos.y);
|
||||
double cut_fraction = target_vec.length() / segment_vec.length();
|
||||
// Get the start and end at the step level.
|
||||
ICOORD step_start = prev_ol->position_at_index(prev->start_step);
|
||||
int end_step = prev->start_step + prev->step_count;
|
||||
int step_length = prev_ol->pathlength();
|
||||
ICOORD step_end = prev_ol->position_at_index(end_step % step_length);
|
||||
ICOORD step_vec = step_end - step_start;
|
||||
double target_length = step_vec.length() * cut_fraction;
|
||||
// Find the point on the segment that gives the length nearest to target.
|
||||
int best_step = prev->start_step;
|
||||
ICOORD total_step(0, 0);
|
||||
double best_dist = target_length;
|
||||
for (int s = prev->start_step; s < end_step; ++s) {
|
||||
total_step += prev_ol->step(s % step_length);
|
||||
double dist = fabs(target_length - total_step.length());
|
||||
if (dist < best_dist) {
|
||||
best_dist = dist;
|
||||
best_step = s + 1;
|
||||
}
|
||||
}
|
||||
// The new point is an intermediate point.
|
||||
this_edgept->src_outline = prev_ol;
|
||||
this_edgept->step_count = end_step - best_step;
|
||||
this_edgept->start_step = best_step % step_length;
|
||||
prev->step_count = best_step - prev->start_step;
|
||||
} else {
|
||||
// The new point is poly only.
|
||||
this_edgept->src_outline = NULL;
|
||||
this_edgept->step_count = 0;
|
||||
this_edgept->start_step = 0;
|
||||
}
|
||||
/* Hook it up */
|
||||
this_edgept->next = next;
|
||||
this_edgept->prev = prev;
|
||||
@ -78,8 +112,7 @@ EDGEPT *make_edgept(int x, int y, EDGEPT *next, EDGEPT *prev) {
|
||||
this_edgept->vec.y = this_edgept->next->pos.y - y;
|
||||
this_edgept->prev->vec.x = x - this_edgept->prev->pos.x;
|
||||
this_edgept->prev->vec.y = y - this_edgept->prev->pos.y;
|
||||
|
||||
return (this_edgept);
|
||||
return this_edgept;
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
@ -90,6 +123,10 @@ EDGEPT *make_edgept(int x, int y, EDGEPT *next, EDGEPT *prev) {
|
||||
void remove_edgept(EDGEPT *point) {
|
||||
EDGEPT *prev = point->prev;
|
||||
EDGEPT *next = point->next;
|
||||
// Add point's steps onto prev's steps if they are from the same outline.
|
||||
if (prev->src_outline == point->src_outline && prev->src_outline != NULL) {
|
||||
prev->step_count += point->step_count;
|
||||
}
|
||||
prev->next = next;
|
||||
next->prev = prev;
|
||||
prev->vec.x = next->pos.x - prev->pos.x;
|
||||
@ -104,8 +141,7 @@ void remove_edgept(EDGEPT *point) {
|
||||
* list.
|
||||
**********************************************************************/
|
||||
SPLIT *new_split(EDGEPT *point1, EDGEPT *point2) {
|
||||
SPLIT *s;
|
||||
s = (SPLIT *) newsplit ();
|
||||
SPLIT *s = new SPLIT;
|
||||
s->point1 = point1;
|
||||
s->point2 = point2;
|
||||
return (s);
|
||||
@ -120,9 +156,9 @@ SPLIT *new_split(EDGEPT *point1, EDGEPT *point2) {
|
||||
**********************************************************************/
|
||||
void print_split(SPLIT *split) {
|
||||
if (split) {
|
||||
cprintf ("(%d,%d)--(%d,%d)",
|
||||
split->point1->pos.x, split->point1->pos.y,
|
||||
split->point2->pos.x, split->point2->pos.y);
|
||||
tprintf("(%d,%d)--(%d,%d)",
|
||||
split->point1->pos.x, split->point1->pos.y,
|
||||
split->point2->pos.x, split->point2->pos.y);
|
||||
}
|
||||
}
|
||||
|
||||
@ -130,23 +166,35 @@ void print_split(SPLIT *split) {
|
||||
/**********************************************************************
|
||||
* split_outline
|
||||
*
|
||||
* Split between these two edge points. Apply a split and return a
|
||||
* pointer to the other side of the split.
|
||||
* Split between these two edge points.
|
||||
**********************************************************************/
|
||||
void split_outline(EDGEPT *join_point1, EDGEPT *join_point2) {
|
||||
EDGEPT *join_point1a;
|
||||
EDGEPT *temp2;
|
||||
EDGEPT *temp1;
|
||||
assert(join_point1 != join_point2);
|
||||
|
||||
assert (join_point1 != join_point2);
|
||||
|
||||
temp2 = join_point2->next;
|
||||
temp1 = join_point1->next;
|
||||
EDGEPT* temp2 = join_point2->next;
|
||||
EDGEPT* temp1 = join_point1->next;
|
||||
/* Create two new points */
|
||||
join_point1a = make_edgept (join_point1->pos.x,
|
||||
join_point1->pos.y, temp1, join_point2);
|
||||
|
||||
make_edgept (join_point2->pos.x, join_point2->pos.y, temp2, join_point1);
|
||||
EDGEPT* new_point1 = make_edgept(join_point1->pos.x, join_point1->pos.y,
|
||||
temp1, join_point2);
|
||||
EDGEPT* new_point2 = make_edgept(join_point2->pos.x, join_point2->pos.y,
|
||||
temp2, join_point1);
|
||||
// Join_point1 and 2 are now cross-over points, so they must have NULL
|
||||
// src_outlines and give their src_outline information their new
|
||||
// replacements.
|
||||
new_point1->src_outline = join_point1->src_outline;
|
||||
new_point1->start_step = join_point1->start_step;
|
||||
new_point1->step_count = join_point1->step_count;
|
||||
new_point2->src_outline = join_point2->src_outline;
|
||||
new_point2->start_step = join_point2->start_step;
|
||||
new_point2->step_count = join_point2->step_count;
|
||||
join_point1->src_outline = NULL;
|
||||
join_point1->start_step = 0;
|
||||
join_point1->step_count = 0;
|
||||
join_point2->src_outline = NULL;
|
||||
join_point2->start_step = 0;
|
||||
join_point2->step_count = 0;
|
||||
join_point1->MarkChop();
|
||||
join_point2->MarkChop();
|
||||
}
|
||||
|
||||
|
||||
@ -164,8 +212,18 @@ void unsplit_outlines(EDGEPT *p1, EDGEPT *p2) {
|
||||
tmp1->next->prev = p2;
|
||||
tmp2->next->prev = p1;
|
||||
|
||||
// tmp2 is coincident with p1. p1 takes tmp2's place as tmp2 is deleted.
|
||||
p1->next = tmp2->next;
|
||||
p1->src_outline = tmp2->src_outline;
|
||||
p1->start_step = tmp2->start_step;
|
||||
p1->step_count = tmp2->step_count;
|
||||
// Likewise p2 takes tmp1's place.
|
||||
p2->next = tmp1->next;
|
||||
p2->src_outline = tmp1->src_outline;
|
||||
p2->start_step = tmp1->start_step;
|
||||
p2->step_count = tmp1->step_count;
|
||||
p1->UnmarkChop();
|
||||
p2->UnmarkChop();
|
||||
|
||||
delete tmp1;
|
||||
delete tmp2;
|
||||
|
@ -42,8 +42,7 @@ class EDGEPT;
|
||||
|
||||
#define point_diff(p,p1,p2) \
|
||||
((p).x = (p1).x - (p2).x, \
|
||||
(p).y = (p1).y - (p2).y, \
|
||||
(p))
|
||||
(p).y = (p1).y - (p2).y)
|
||||
|
||||
/**********************************************************************
|
||||
* CROSS
|
||||
|
@ -465,7 +465,7 @@ WERD* WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs,
|
||||
TBOX a_blob_box = a_blob->bounding_box();
|
||||
if ((not_found_box.major_overlap(a_blob_box) ||
|
||||
a_blob_box.major_overlap(not_found_box)) &&
|
||||
not_found_box.y_overlap(a_blob_box)) {
|
||||
not_found_box.y_overlap(a_blob_box) > 0.8) {
|
||||
// Already taken care of.
|
||||
delete not_found_it.extract();
|
||||
break;
|
||||
|
@ -10,18 +10,16 @@ AM_CXXFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden
|
||||
AM_CPPFLAGS += -DTESS_EXPORTS
|
||||
endif
|
||||
|
||||
EXTRA_DIST = mfcpch.cpp
|
||||
|
||||
include_HEADERS = \
|
||||
basedir.h errcode.h fileerr.h genericvector.h helpers.h host.h memry.h \
|
||||
ndminx.h params.h ocrclass.h platform.h serialis.h strngs.h \
|
||||
tesscallback.h unichar.h unicharmap.h unicharset.h
|
||||
|
||||
noinst_HEADERS = \
|
||||
ambigs.h bits16.h bitvector.h ccutil.h clst.h elst2.h \
|
||||
elst.h globaloc.h hashfn.h indexmapbidi.h lsterr.h \
|
||||
nwmain.h qrsequence.h secname.h sorthelper.h stderr.h tessdatamanager.h \
|
||||
tprintf.h unicity_table.h unicodes.h
|
||||
ambigs.h bits16.h bitvector.h ccutil.h clst.h doubleptr.h elst2.h \
|
||||
elst.h genericheap.h globaloc.h hashfn.h indexmapbidi.h kdpair.h lsterr.h \
|
||||
nwmain.h object_cache.h qrsequence.h secname.h sorthelper.h stderr.h tessdatamanager.h \
|
||||
tprintf.h unicity_table.h unicodes.h universalambigs.h
|
||||
|
||||
if !USING_MULTIPLELIBS
|
||||
noinst_LTLIBRARIES = libtesseract_ccutil.la
|
||||
@ -39,7 +37,7 @@ libtesseract_ccutil_la_SOURCES = \
|
||||
serialis.cpp strngs.cpp \
|
||||
tessdatamanager.cpp tprintf.cpp \
|
||||
unichar.cpp unicharmap.cpp unicharset.cpp unicodes.cpp \
|
||||
params.cpp
|
||||
params.cpp universalambigs.cpp
|
||||
|
||||
if EMBEDDED
|
||||
include_HEADERS += scanutils.h
|
||||
@ -50,4 +48,4 @@ if MINGW
|
||||
AM_CPPFLAGS += -I$(top_srcdir)/vs2008/port -DWINDLLNAME=\"lib@GENERIC_LIBRARY_NAME@\"
|
||||
noinst_HEADERS += ../vs2008/port/strtok_r.h
|
||||
libtesseract_ccutil_la_SOURCES += ../vs2008/port/strtok_r.cpp
|
||||
endif
|
||||
endif
|
||||
|
@ -19,7 +19,10 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "ambigs.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include "helpers.h"
|
||||
#include "universalambigs.h"
|
||||
|
||||
#ifdef _WIN32
|
||||
#ifndef __GNUC__
|
||||
@ -31,6 +34,11 @@
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Maximum line size:
|
||||
// 10 for sizes of ambigs, tabs, abmig type and newline
|
||||
// UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig
|
||||
const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1);
|
||||
|
||||
AmbigSpec::AmbigSpec() {
|
||||
wrong_ngram[0] = INVALID_UNICHAR_ID;
|
||||
correct_fragments[0] = INVALID_UNICHAR_ID;
|
||||
@ -41,14 +49,10 @@ AmbigSpec::AmbigSpec() {
|
||||
|
||||
ELISTIZE(AmbigSpec);
|
||||
|
||||
void UnicharAmbigs::LoadUnicharAmbigs(FILE *AmbigFile,
|
||||
inT64 end_offset,
|
||||
int debug_level,
|
||||
bool use_ambigs_for_adaption,
|
||||
UNICHARSET *unicharset) {
|
||||
int i, j;
|
||||
UnicharIdVector *adaption_ambigs_entry;
|
||||
for (i = 0; i < unicharset->size(); ++i) {
|
||||
// Initializes the ambigs by adding a NULL pointer to each table.
|
||||
void UnicharAmbigs::InitUnicharAmbigs(const UNICHARSET& unicharset,
|
||||
bool use_ambigs_for_adaption) {
|
||||
for (int i = 0; i < unicharset.size(); ++i) {
|
||||
replace_ambigs_.push_back(NULL);
|
||||
dang_ambigs_.push_back(NULL);
|
||||
one_to_one_definite_ambigs_.push_back(NULL);
|
||||
@ -57,85 +61,103 @@ void UnicharAmbigs::LoadUnicharAmbigs(FILE *AmbigFile,
|
||||
reverse_ambigs_for_adaption_.push_back(NULL);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Loads the universal ambigs that are useful for any language.
|
||||
void UnicharAmbigs::LoadUniversal(const UNICHARSET& encoder_set,
|
||||
UNICHARSET* unicharset) {
|
||||
FILE* fp = fmemopen(const_cast<char*>(kUniversalAmbigsFile),
|
||||
ksizeofUniversalAmbigsFile, "rb");
|
||||
if (fp == NULL) return;
|
||||
LoadUnicharAmbigs(encoder_set, fp, -1ll, 0, false, unicharset);
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET& encoder_set,
|
||||
FILE *ambig_file,
|
||||
inT64 end_offset,
|
||||
int debug_level,
|
||||
bool use_ambigs_for_adaption,
|
||||
UNICHARSET *unicharset) {
|
||||
int i, j;
|
||||
UnicharIdVector *adaption_ambigs_entry;
|
||||
if (debug_level) tprintf("Reading ambiguities\n");
|
||||
|
||||
int TestAmbigPartSize;
|
||||
int ReplacementAmbigPartSize;
|
||||
// Maximum line size:
|
||||
// 10 for sizes of ambigs, tabs, abmig type and newline
|
||||
// UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig
|
||||
int test_ambig_part_size;
|
||||
int replacement_ambig_part_size;
|
||||
// The space for buffer is allocated on the heap to avoid
|
||||
// GCC frame size warning.
|
||||
const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1);
|
||||
const int kBufferSize = 10 + 2 * kMaxAmbigStringSize;
|
||||
char *buffer = new char[kBufferSize];
|
||||
char ReplacementString[kMaxAmbigStringSize];
|
||||
UNICHAR_ID TestUnicharIds[MAX_AMBIG_SIZE + 1];
|
||||
char replacement_string[kMaxAmbigStringSize];
|
||||
UNICHAR_ID test_unichar_ids[MAX_AMBIG_SIZE + 1];
|
||||
int line_num = 0;
|
||||
int type = NOT_AMBIG;
|
||||
|
||||
// Determine the version of the ambigs file.
|
||||
int version = 0;
|
||||
ASSERT_HOST(fgets(buffer, kBufferSize, AmbigFile) != NULL &&
|
||||
ASSERT_HOST(fgets(buffer, kBufferSize, ambig_file) != NULL &&
|
||||
strlen(buffer) > 0);
|
||||
if (*buffer == 'v') {
|
||||
version = static_cast<int>(strtol(buffer+1, NULL, 10));
|
||||
++line_num;
|
||||
} else {
|
||||
rewind(AmbigFile);
|
||||
rewind(ambig_file);
|
||||
}
|
||||
while ((end_offset < 0 || ftell(AmbigFile) < end_offset) &&
|
||||
fgets(buffer, kBufferSize, AmbigFile) != NULL) {
|
||||
while ((end_offset < 0 || ftell(ambig_file) < end_offset) &&
|
||||
fgets(buffer, kBufferSize, ambig_file) != NULL) {
|
||||
chomp_string(buffer);
|
||||
if (debug_level > 2) tprintf("read line %s\n", buffer);
|
||||
++line_num;
|
||||
if (!ParseAmbiguityLine(line_num, version, debug_level, *unicharset,
|
||||
buffer, &TestAmbigPartSize, TestUnicharIds,
|
||||
&ReplacementAmbigPartSize,
|
||||
ReplacementString, &type)) continue;
|
||||
if (!ParseAmbiguityLine(line_num, version, debug_level, encoder_set,
|
||||
buffer, &test_ambig_part_size, test_unichar_ids,
|
||||
&replacement_ambig_part_size,
|
||||
replacement_string, &type)) continue;
|
||||
// Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST.
|
||||
AmbigSpec *ambig_spec = new AmbigSpec();
|
||||
InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_ : dang_ambigs_,
|
||||
TestAmbigPartSize, TestUnicharIds,
|
||||
ReplacementAmbigPartSize, ReplacementString, type,
|
||||
ambig_spec, unicharset);
|
||||
if (!InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_
|
||||
: dang_ambigs_,
|
||||
test_ambig_part_size, test_unichar_ids,
|
||||
replacement_ambig_part_size, replacement_string, type,
|
||||
ambig_spec, unicharset))
|
||||
continue;
|
||||
|
||||
// Update one_to_one_definite_ambigs_.
|
||||
if (TestAmbigPartSize == 1 &&
|
||||
ReplacementAmbigPartSize == 1 && type == DEFINITE_AMBIG) {
|
||||
if (one_to_one_definite_ambigs_[TestUnicharIds[0]] == NULL) {
|
||||
one_to_one_definite_ambigs_[TestUnicharIds[0]] = new UnicharIdVector();
|
||||
if (test_ambig_part_size == 1 &&
|
||||
replacement_ambig_part_size == 1 && type == DEFINITE_AMBIG) {
|
||||
if (one_to_one_definite_ambigs_[test_unichar_ids[0]] == NULL) {
|
||||
one_to_one_definite_ambigs_[test_unichar_ids[0]] = new UnicharIdVector();
|
||||
}
|
||||
one_to_one_definite_ambigs_[TestUnicharIds[0]]->push_back(
|
||||
one_to_one_definite_ambigs_[test_unichar_ids[0]]->push_back(
|
||||
ambig_spec->correct_ngram_id);
|
||||
}
|
||||
// Update ambigs_for_adaption_.
|
||||
if (use_ambigs_for_adaption) {
|
||||
for (i = 0; i < TestAmbigPartSize; ++i) {
|
||||
if (ambigs_for_adaption_[TestUnicharIds[i]] == NULL) {
|
||||
ambigs_for_adaption_[TestUnicharIds[i]] = new UnicharIdVector();
|
||||
}
|
||||
adaption_ambigs_entry = ambigs_for_adaption_[TestUnicharIds[i]];
|
||||
const char *tmp_ptr = ReplacementString;
|
||||
const char *tmp_ptr_end = ReplacementString + strlen(ReplacementString);
|
||||
int step = unicharset->step(tmp_ptr);
|
||||
while (step > 0) {
|
||||
UNICHAR_ID id_to_insert = unicharset->unichar_to_id(tmp_ptr, step);
|
||||
ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID);
|
||||
// Add the new unichar id to adaption_ambigs_entry (only if the
|
||||
// vector does not already contain it) keeping it in sorted order.
|
||||
for (j = 0; j < adaption_ambigs_entry->size() &&
|
||||
(*adaption_ambigs_entry)[j] > id_to_insert; ++j);
|
||||
if (j < adaption_ambigs_entry->size()) {
|
||||
if ((*adaption_ambigs_entry)[j] != id_to_insert) {
|
||||
adaption_ambigs_entry->insert(id_to_insert, j);
|
||||
}
|
||||
} else {
|
||||
adaption_ambigs_entry->push_back(id_to_insert);
|
||||
GenericVector<UNICHAR_ID> encoding;
|
||||
// Silently ignore invalid strings, as before, so it is safe to use a
|
||||
// universal ambigs file.
|
||||
if (unicharset->encode_string(replacement_string, true, &encoding,
|
||||
NULL, NULL)) {
|
||||
for (i = 0; i < test_ambig_part_size; ++i) {
|
||||
if (ambigs_for_adaption_[test_unichar_ids[i]] == NULL) {
|
||||
ambigs_for_adaption_[test_unichar_ids[i]] = new UnicharIdVector();
|
||||
}
|
||||
adaption_ambigs_entry = ambigs_for_adaption_[test_unichar_ids[i]];
|
||||
for (int r = 0; r < encoding.size(); ++r) {
|
||||
UNICHAR_ID id_to_insert = encoding[r];
|
||||
ASSERT_HOST(id_to_insert != INVALID_UNICHAR_ID);
|
||||
// Add the new unichar id to adaption_ambigs_entry (only if the
|
||||
// vector does not already contain it) keeping it in sorted order.
|
||||
for (j = 0; j < adaption_ambigs_entry->size() &&
|
||||
(*adaption_ambigs_entry)[j] > id_to_insert; ++j);
|
||||
if (j < adaption_ambigs_entry->size()) {
|
||||
if ((*adaption_ambigs_entry)[j] != id_to_insert) {
|
||||
adaption_ambigs_entry->insert(id_to_insert, j);
|
||||
}
|
||||
} else {
|
||||
adaption_ambigs_entry->push_back(id_to_insert);
|
||||
}
|
||||
}
|
||||
// Update tmp_ptr and step.
|
||||
tmp_ptr += step;
|
||||
step = tmp_ptr < tmp_ptr_end ? unicharset->step(tmp_ptr) : 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -204,51 +226,96 @@ void UnicharAmbigs::LoadUnicharAmbigs(FILE *AmbigFile,
|
||||
|
||||
bool UnicharAmbigs::ParseAmbiguityLine(
|
||||
int line_num, int version, int debug_level, const UNICHARSET &unicharset,
|
||||
char *buffer, int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
|
||||
int *ReplacementAmbigPartSize, char *ReplacementString, int *type) {
|
||||
char *buffer, int *test_ambig_part_size, UNICHAR_ID *test_unichar_ids,
|
||||
int *replacement_ambig_part_size, char *replacement_string, int *type) {
|
||||
if (version > 1) {
|
||||
// Simpler format is just wrong-string correct-string type\n.
|
||||
STRING input(buffer);
|
||||
GenericVector<STRING> fields;
|
||||
input.split(' ', &fields);
|
||||
if (fields.size() != 3) {
|
||||
if (debug_level) tprintf(kIllegalMsg, line_num);
|
||||
return false;
|
||||
}
|
||||
// Encode wrong-string.
|
||||
GenericVector<UNICHAR_ID> unichars;
|
||||
if (!unicharset.encode_string(fields[0].string(), true, &unichars, NULL,
|
||||
NULL)) {
|
||||
return false;
|
||||
}
|
||||
*test_ambig_part_size = unichars.size();
|
||||
if (*test_ambig_part_size > MAX_AMBIG_SIZE) {
|
||||
if (debug_level)
|
||||
tprintf("Too many unichars in ambiguity on line %d\n", line_num);
|
||||
return false;
|
||||
}
|
||||
// Copy encoded string to output.
|
||||
for (int i = 0; i < unichars.size(); ++i)
|
||||
test_unichar_ids[i] = unichars[i];
|
||||
test_unichar_ids[unichars.size()] = INVALID_UNICHAR_ID;
|
||||
// Encode replacement-string to check validity.
|
||||
if (!unicharset.encode_string(fields[1].string(), true, &unichars, NULL,
|
||||
NULL)) {
|
||||
return false;
|
||||
}
|
||||
*replacement_ambig_part_size = unichars.size();
|
||||
if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) {
|
||||
if (debug_level)
|
||||
tprintf("Too many unichars in ambiguity on line %d\n", line_num);
|
||||
return false;
|
||||
}
|
||||
if (sscanf(fields[2].string(), "%d", type) != 1) {
|
||||
if (debug_level) tprintf(kIllegalMsg, line_num);
|
||||
return false;
|
||||
}
|
||||
snprintf(replacement_string, kMaxAmbigStringSize, "%s", fields[1].string());
|
||||
return true;
|
||||
}
|
||||
int i;
|
||||
char *token;
|
||||
char *next_token;
|
||||
if (!(token = strtok_r(buffer, kAmbigDelimiters, &next_token)) ||
|
||||
!sscanf(token, "%d", TestAmbigPartSize) || TestAmbigPartSize <= 0) {
|
||||
!sscanf(token, "%d", test_ambig_part_size) || test_ambig_part_size <= 0) {
|
||||
if (debug_level) tprintf(kIllegalMsg, line_num);
|
||||
return false;
|
||||
}
|
||||
if (*TestAmbigPartSize > MAX_AMBIG_SIZE) {
|
||||
tprintf("Too many unichars in ambiguity on line %d\n");
|
||||
if (*test_ambig_part_size > MAX_AMBIG_SIZE) {
|
||||
if (debug_level)
|
||||
tprintf("Too many unichars in ambiguity on line %d\n", line_num);
|
||||
return false;
|
||||
}
|
||||
for (i = 0; i < *TestAmbigPartSize; ++i) {
|
||||
for (i = 0; i < *test_ambig_part_size; ++i) {
|
||||
if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break;
|
||||
if (!unicharset.contains_unichar(token)) {
|
||||
if (debug_level) tprintf(kIllegalUnicharMsg, token);
|
||||
break;
|
||||
}
|
||||
TestUnicharIds[i] = unicharset.unichar_to_id(token);
|
||||
test_unichar_ids[i] = unicharset.unichar_to_id(token);
|
||||
}
|
||||
TestUnicharIds[i] = INVALID_UNICHAR_ID;
|
||||
test_unichar_ids[i] = INVALID_UNICHAR_ID;
|
||||
|
||||
if (i != *TestAmbigPartSize ||
|
||||
if (i != *test_ambig_part_size ||
|
||||
!(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) ||
|
||||
!sscanf(token, "%d", ReplacementAmbigPartSize) ||
|
||||
*ReplacementAmbigPartSize <= 0) {
|
||||
!sscanf(token, "%d", replacement_ambig_part_size) ||
|
||||
*replacement_ambig_part_size <= 0) {
|
||||
if (debug_level) tprintf(kIllegalMsg, line_num);
|
||||
return false;
|
||||
}
|
||||
if (*ReplacementAmbigPartSize > MAX_AMBIG_SIZE) {
|
||||
tprintf("Too many unichars in ambiguity on line %d\n");
|
||||
if (*replacement_ambig_part_size > MAX_AMBIG_SIZE) {
|
||||
if (debug_level)
|
||||
tprintf("Too many unichars in ambiguity on line %d\n", line_num);
|
||||
return false;
|
||||
}
|
||||
ReplacementString[0] = '\0';
|
||||
for (i = 0; i < *ReplacementAmbigPartSize; ++i) {
|
||||
replacement_string[0] = '\0';
|
||||
for (i = 0; i < *replacement_ambig_part_size; ++i) {
|
||||
if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break;
|
||||
strcat(ReplacementString, token);
|
||||
strcat(replacement_string, token);
|
||||
if (!unicharset.contains_unichar(token)) {
|
||||
if (debug_level) tprintf(kIllegalUnicharMsg, token);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i != *ReplacementAmbigPartSize) {
|
||||
if (i != *replacement_ambig_part_size) {
|
||||
if (debug_level) tprintf(kIllegalMsg, line_num);
|
||||
return false;
|
||||
}
|
||||
@ -271,20 +338,20 @@ bool UnicharAmbigs::ParseAmbiguityLine(
|
||||
return true;
|
||||
}
|
||||
|
||||
void UnicharAmbigs::InsertIntoTable(
|
||||
UnicharAmbigsVector &table, int TestAmbigPartSize,
|
||||
UNICHAR_ID *TestUnicharIds, int ReplacementAmbigPartSize,
|
||||
const char *ReplacementString, int type,
|
||||
bool UnicharAmbigs::InsertIntoTable(
|
||||
UnicharAmbigsVector &table, int test_ambig_part_size,
|
||||
UNICHAR_ID *test_unichar_ids, int replacement_ambig_part_size,
|
||||
const char *replacement_string, int type,
|
||||
AmbigSpec *ambig_spec, UNICHARSET *unicharset) {
|
||||
ambig_spec->type = static_cast<AmbigType>(type);
|
||||
if (TestAmbigPartSize == 1 && ReplacementAmbigPartSize == 1 &&
|
||||
unicharset->to_lower(TestUnicharIds[0]) ==
|
||||
unicharset->to_lower(unicharset->unichar_to_id(ReplacementString))) {
|
||||
if (test_ambig_part_size == 1 && replacement_ambig_part_size == 1 &&
|
||||
unicharset->to_lower(test_unichar_ids[0]) ==
|
||||
unicharset->to_lower(unicharset->unichar_to_id(replacement_string))) {
|
||||
ambig_spec->type = CASE_AMBIG;
|
||||
}
|
||||
|
||||
ambig_spec->wrong_ngram_size =
|
||||
UnicharIdArrayUtils::copy(TestUnicharIds, ambig_spec->wrong_ngram);
|
||||
UnicharIdArrayUtils::copy(test_unichar_ids, ambig_spec->wrong_ngram);
|
||||
|
||||
// Since we need to maintain a constant number of unichar positions in
|
||||
// order to construct ambig_blob_choices vector in NoDangerousAmbig(), for
|
||||
@ -297,21 +364,21 @@ void UnicharAmbigs::InsertIntoTable(
|
||||
// Insert the corresponding correct ngram into the unicharset.
|
||||
// Unicharset code assumes that the "base" ngram is inserted into
|
||||
// the unicharset before fragments of this ngram are inserted.
|
||||
unicharset->unichar_insert(ReplacementString);
|
||||
unicharset->unichar_insert(replacement_string);
|
||||
ambig_spec->correct_ngram_id =
|
||||
unicharset->unichar_to_id(ReplacementString);
|
||||
if (ReplacementAmbigPartSize > 1) {
|
||||
unicharset->unichar_to_id(replacement_string);
|
||||
if (replacement_ambig_part_size > 1) {
|
||||
unicharset->set_isngram(ambig_spec->correct_ngram_id, true);
|
||||
}
|
||||
// Add the corresponding fragments of the wrong ngram to unicharset.
|
||||
int i;
|
||||
for (i = 0; i < TestAmbigPartSize; ++i) {
|
||||
for (i = 0; i < test_ambig_part_size; ++i) {
|
||||
UNICHAR_ID unichar_id;
|
||||
if (TestAmbigPartSize == 1) {
|
||||
if (test_ambig_part_size == 1) {
|
||||
unichar_id = ambig_spec->correct_ngram_id;
|
||||
} else {
|
||||
STRING frag_str = CHAR_FRAGMENT::to_string(
|
||||
ReplacementString, i, TestAmbigPartSize, false);
|
||||
replacement_string, i, test_ambig_part_size, false);
|
||||
unicharset->unichar_insert(frag_str.string());
|
||||
unichar_id = unicharset->unichar_to_id(frag_str.string());
|
||||
}
|
||||
@ -321,11 +388,14 @@ void UnicharAmbigs::InsertIntoTable(
|
||||
|
||||
// Add AmbigSpec for this ambiguity to the corresponding AmbigSpec_LIST.
|
||||
// Keep AmbigSpec_LISTs sorted by AmbigSpec.wrong_ngram.
|
||||
if (table[TestUnicharIds[0]] == NULL) {
|
||||
table[TestUnicharIds[0]] = new AmbigSpec_LIST();
|
||||
if (table[test_unichar_ids[0]] == NULL) {
|
||||
table[test_unichar_ids[0]] = new AmbigSpec_LIST();
|
||||
}
|
||||
table[TestUnicharIds[0]]->add_sorted(
|
||||
AmbigSpec::compare_ambig_specs, false, ambig_spec);
|
||||
if (table[test_unichar_ids[0]]->add_sorted(
|
||||
AmbigSpec::compare_ambig_specs, true, ambig_spec))
|
||||
return true;
|
||||
delete ambig_spec;
|
||||
return false;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
||||
|
@ -123,7 +123,10 @@ class AmbigSpec : public ELIST_LINK {
|
||||
*reinterpret_cast<const AmbigSpec * const *>(spec1);
|
||||
const AmbigSpec *s2 =
|
||||
*reinterpret_cast<const AmbigSpec * const *>(spec2);
|
||||
return UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram);
|
||||
int result = UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram);
|
||||
if (result != 0) return result;
|
||||
return UnicharIdArrayUtils::compare(s1->correct_fragments,
|
||||
s2->correct_fragments);
|
||||
}
|
||||
|
||||
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
|
||||
@ -150,6 +153,13 @@ class UnicharAmbigs {
|
||||
const UnicharAmbigsVector &dang_ambigs() const { return dang_ambigs_; }
|
||||
const UnicharAmbigsVector &replace_ambigs() const { return replace_ambigs_; }
|
||||
|
||||
// Initializes the ambigs by adding a NULL pointer to each table.
|
||||
void InitUnicharAmbigs(const UNICHARSET& unicharset,
|
||||
bool use_ambigs_for_adaption);
|
||||
|
||||
// Loads the universal ambigs that are useful for any language.
|
||||
void LoadUniversal(const UNICHARSET& encoder_set, UNICHARSET* unicharset);
|
||||
|
||||
// Fills in two ambiguity tables (replaceable and dangerous) with information
|
||||
// read from the ambigs file. An ambiguity table is an array of lists.
|
||||
// The array is indexed by a class id. Each entry in the table provides
|
||||
@ -160,7 +170,10 @@ class UnicharAmbigs {
|
||||
// one_to_one_definite_ambigs_. This vector is also indexed by the class id
|
||||
// of the wrong part of the ambiguity and each entry contains a vector of
|
||||
// unichar ids that are ambiguous to it.
|
||||
void LoadUnicharAmbigs(FILE *ambigs_file, inT64 end_offset, int debug_level,
|
||||
// encoder_set is used to encode the ambiguity strings, undisturbed by new
|
||||
// unichar_ids that may be created by adding the ambigs.
|
||||
void LoadUnicharAmbigs(const UNICHARSET& encoder_set,
|
||||
FILE *ambigs_file, inT64 end_offset, int debug_level,
|
||||
bool use_ambigs_for_adaption, UNICHARSET *unicharset);
|
||||
|
||||
// Returns definite 1-1 ambigs for the given unichar id.
|
||||
@ -191,17 +204,18 @@ class UnicharAmbigs {
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
bool ParseAmbiguityLine(int line_num, int version, int debug_level,
|
||||
const UNICHARSET &unicharset, char *buffer,
|
||||
int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
|
||||
int *ReplacementAmbigPartSize,
|
||||
char *ReplacementString, int *type);
|
||||
void InsertIntoTable(UnicharAmbigsVector &table,
|
||||
int TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
|
||||
int ReplacementAmbigPartSize,
|
||||
const char *ReplacementString, int type,
|
||||
int *test_ambig_part_size,
|
||||
UNICHAR_ID *test_unichar_ids,
|
||||
int *replacement_ambig_part_size,
|
||||
char *replacement_string, int *type);
|
||||
bool InsertIntoTable(UnicharAmbigsVector &table,
|
||||
int test_ambig_part_size, UNICHAR_ID *test_unichar_ids,
|
||||
int replacement_ambig_part_size,
|
||||
const char *replacement_string, int type,
|
||||
AmbigSpec *ambig_spec, UNICHARSET *unicharset);
|
||||
|
||||
UnicharAmbigsVector dang_ambigs_;
|
||||
UnicharAmbigsVector replace_ambigs_;
|
||||
GenericVector<UnicharIdVector *> one_to_one_definite_ambigs_;
|
||||
|
93
ccutil/doubleptr.h
Normal file
93
ccutil/doubleptr.h
Normal file
@ -0,0 +1,93 @@
|
||||
// Copyright 2012 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: doubleptr.h
|
||||
// Description: Double-ended pointer that keeps pointing correctly even
|
||||
// when reallocated or copied.
|
||||
// Author: Ray Smith
|
||||
// Created: Wed Mar 14 12:22:57 PDT 2012
|
||||
//
|
||||
// (C) Copyright 2012, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCUTIL_DOUBLEPTR_H_
|
||||
#define TESSERACT_CCUTIL_DOUBLEPTR_H_
|
||||
|
||||
#include "errcode.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// A smart pointer class that implements a double-ended pointer. Each end
|
||||
// points to the other end. The copy constructor and operator= have MOVE
|
||||
// semantics, meaning that the relationship with the other end moves to the
|
||||
// destination of the copy, leaving the source unattached.
|
||||
// For this reason both the copy constructor and the operator= take a non-const
|
||||
// reference argument, and the const reference versions cannot be used.
|
||||
// DoublePtr is useful to incorporate into structures that are part of a
|
||||
// collection such as GenericVector or STL containers, where reallocs can
|
||||
// relocate the members. DoublePtr is also useful in a GenericHeap, where it
|
||||
// can correctly maintain the pointer to an element of the heap despite it
|
||||
// getting moved around on the heap.
|
||||
class DoublePtr {
|
||||
public:
|
||||
DoublePtr() : other_end_(NULL) {}
|
||||
// Copy constructor steals the partner off src and is therefore a non
|
||||
// const reference arg.
|
||||
// Copying a const DoublePtr generates a compiler error.
|
||||
DoublePtr(DoublePtr& src) {
|
||||
other_end_ = src.other_end_;
|
||||
if (other_end_ != NULL) {
|
||||
other_end_->other_end_ = this;
|
||||
src.other_end_ = NULL;
|
||||
}
|
||||
}
|
||||
// Operator= steals the partner off src, and therefore needs src to be a non-
|
||||
// const reference.
|
||||
// Assigning from a const DoublePtr generates a compiler error.
|
||||
void operator=(DoublePtr& src) {
|
||||
Disconnect();
|
||||
other_end_ = src.other_end_;
|
||||
if (other_end_ != NULL) {
|
||||
other_end_->other_end_ = this;
|
||||
src.other_end_ = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
// Connects this and other, discarding any existing connections.
|
||||
void Connect(DoublePtr* other) {
|
||||
other->Disconnect();
|
||||
Disconnect();
|
||||
other->other_end_ = this;
|
||||
other_end_ = other;
|
||||
}
|
||||
// Disconnects this and other, making OtherEnd() return NULL for both.
|
||||
void Disconnect() {
|
||||
if (other_end_ != NULL) {
|
||||
other_end_->other_end_ = NULL;
|
||||
other_end_ = NULL;
|
||||
}
|
||||
}
|
||||
// Returns the pointer to the other end of the double pointer.
|
||||
DoublePtr* OtherEnd() const {
|
||||
return other_end_;
|
||||
}
|
||||
|
||||
private:
|
||||
// Pointer to the other end of the link. It is always true that either
|
||||
// other_end_ == NULL or other_end_->other_end_ == this.
|
||||
DoublePtr* other_end_;
|
||||
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // THIRD_PARTY_TESSERACT_CCUTIL_DOUBLEPTR_H_
|
@ -90,12 +90,6 @@ const ERRCODE ASSERT_FAILED = "Assert failed";
|
||||
void signal_exit( //
|
||||
int signal_code //Signal which
|
||||
);
|
||||
extern "C"
|
||||
{
|
||||
void err_exit();
|
||||
//The real signal
|
||||
void signal_termination_handler(int sig);
|
||||
};
|
||||
|
||||
void set_global_loc_code(int loc_code);
|
||||
|
||||
|
225
ccutil/genericheap.h
Normal file
225
ccutil/genericheap.h
Normal file
@ -0,0 +1,225 @@
|
||||
// Copyright 2012 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: genericheap.h
|
||||
// Description: Template heap class.
|
||||
// Author: Ray Smith, based on Dan Johnson's original code.
|
||||
// Created: Wed Mar 14 08:13:00 PDT 2012
|
||||
//
|
||||
// (C) Copyright 2012, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "errcode.h"
|
||||
#include "genericvector.h"
|
||||
|
||||
#ifndef TESSERACT_CCUTIL_GENERICHEAP_H_
|
||||
#define TESSERACT_CCUTIL_GENERICHEAP_H_
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// GenericHeap requires 1 template argument:
|
||||
// Pair will normally be either KDPairInc<Key, Data> or KDPairDec<Key, Data>
|
||||
// for some arbitrary Key and scalar, smart pointer, or non-ownership pointer
|
||||
// Data type, according to whether a MIN heap or a MAX heap is desired,
|
||||
// respectively. Using KDPtrPairInc<Key, Data> or KDPtrPairDec<Key, Data>,
|
||||
// GenericHeap can also handle simple Data pointers and own them.
|
||||
// If no additional data is required, Pair can also be a scalar, since
|
||||
// GenericHeap doesn't look inside it except for operator<.
|
||||
//
|
||||
// The heap is stored as a packed binary tree in an array hosted by a
|
||||
// GenericVector<Pair>, with the invariant that the children of each node are
|
||||
// both NOT Pair::operator< the parent node. KDPairInc defines Pair::operator<
|
||||
// to use Key::operator< to generate a MIN heap and KDPairDec defines
|
||||
// Pair::operator< to use Key::operator> to generate a MAX heap by reversing
|
||||
// all the comparisons.
|
||||
// See http://en.wikipedia.org/wiki/Heap_(data_structure) for more detail on
|
||||
// the basic heap implementation.
|
||||
//
|
||||
// Insertion and removal are both O(log n) and, unlike the STL heap, an
|
||||
// explicit Reshuffle function allows a node to be repositioned in time O(log n)
|
||||
// after changing its value.
|
||||
//
|
||||
// Accessing the element for revaluation is a more complex matter, since the
|
||||
// index and pointer can be changed arbitrarily by heap operations.
|
||||
// Revaluation can be done by making the Data type in the Pair derived from or
|
||||
// contain a DoublePtr as its first data element, making it possible to convert
|
||||
// the pointer to a Pair using KDPairInc::RecastDataPointer.
|
||||
template <typename Pair>
|
||||
class GenericHeap {
|
||||
public:
|
||||
GenericHeap() {}
|
||||
// The initial size is only a GenericVector::reserve. It is not enforced as
|
||||
// the size limit of the heap. Caller must implement their own enforcement.
|
||||
explicit GenericHeap(int initial_size) {
|
||||
heap_.reserve(initial_size);
|
||||
}
|
||||
|
||||
// Simple accessors.
|
||||
bool empty() const {
|
||||
return heap_.empty();
|
||||
}
|
||||
int size() const {
|
||||
return heap_.size();
|
||||
}
|
||||
int size_reserved() const {
|
||||
return heap_.size_reserved();
|
||||
}
|
||||
void clear() {
|
||||
// Clear truncates to 0 to keep the number reserved in tact.
|
||||
heap_.truncate(0);
|
||||
}
|
||||
// Provides access to the underlying vector.
|
||||
// Caution! any changes that modify the keys will invalidate the heap!
|
||||
GenericVector<Pair>* heap() {
|
||||
return &heap_;
|
||||
}
|
||||
|
||||
// Add entry to the heap, keeping the smallest item at the top, by operator<.
|
||||
// Note that *entry is used as the source of operator=, but it is non-const
|
||||
// to allow for a smart pointer to be contained within.
|
||||
// Time = O(log n).
|
||||
void Push(Pair* entry) {
|
||||
int hole_index = heap_.size();
|
||||
// Make a hole in the end of heap_ and sift it up to be the correct
|
||||
// location for the new *entry. To avoid needing a default constructor
|
||||
// for primitive types, and to allow for use of DoublePtr in the Pair
|
||||
// somewhere, we have to incur a double copy here.
|
||||
heap_.push_back(*entry);
|
||||
*entry = heap_.back();
|
||||
hole_index = SiftUp(hole_index, *entry);
|
||||
heap_[hole_index] = *entry;
|
||||
}
|
||||
|
||||
// Get the value of the top (smallest, defined by operator< ) element.
|
||||
const Pair& PeekTop() const {
|
||||
return heap_[0];
|
||||
}
|
||||
|
||||
// Removes the top element of the heap. If entry is not NULL, the element
|
||||
// is copied into *entry, otherwise it is discarded.
|
||||
// Returns false if the heap was already empty.
|
||||
// Time = O(log n).
|
||||
bool Pop(Pair* entry) {
|
||||
int new_size = heap_.size() - 1;
|
||||
if (new_size < 0)
|
||||
return false; // Already empty.
|
||||
if (entry != NULL)
|
||||
*entry = heap_[0];
|
||||
if (new_size > 0) {
|
||||
// Sift the hole at the start of the heap_ downwards to match the last
|
||||
// element.
|
||||
Pair hole_pair = heap_[new_size];
|
||||
heap_.truncate(new_size);
|
||||
int hole_index = SiftDown(0, hole_pair);
|
||||
heap_[hole_index] = hole_pair;
|
||||
} else {
|
||||
heap_.truncate(new_size);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Removes the MAXIMUM element of the heap. (MIN from a MAX heap.) If entry is
|
||||
// not NULL, the element is copied into *entry, otherwise it is discarded.
|
||||
// Time = O(n). Returns false if the heap was already empty.
|
||||
bool PopWorst(Pair* entry) {
|
||||
int heap_size = heap_.size();
|
||||
if (heap_size == 0) return false; // It cannot be empty!
|
||||
|
||||
// Find the maximum element. Its index is guaranteed to be greater than
|
||||
// the index of the parent of the last element, since by the heap invariant
|
||||
// the parent must be less than or equal to the children.
|
||||
int worst_index = heap_size - 1;
|
||||
int end_parent = ParentNode(worst_index);
|
||||
for (int i = worst_index - 1; i > end_parent; --i) {
|
||||
if (heap_[worst_index] < heap_[i])
|
||||
worst_index = i;
|
||||
}
|
||||
// Extract the worst element from the heap, leaving a hole at worst_index.
|
||||
if (entry != NULL)
|
||||
*entry = heap_[worst_index];
|
||||
--heap_size;
|
||||
if (heap_size > 0) {
|
||||
// Sift the hole upwards to match the last element of the heap_
|
||||
Pair hole_pair = heap_[heap_size];
|
||||
int hole_index = SiftUp(worst_index, hole_pair);
|
||||
heap_[hole_index] = hole_pair;
|
||||
}
|
||||
heap_.truncate(heap_size);
|
||||
return true;
|
||||
}
|
||||
|
||||
// The pointed-to Pair has changed its key value, so the location of pair
|
||||
// is reshuffled to maintain the heap invariant.
|
||||
// Must be a valid pointer to an element of the heap_!
|
||||
// Caution! Since GenericHeap is based on GenericVector, reallocs may occur
|
||||
// whenever the vector is extended and elements may get shuffled by any
|
||||
// Push or Pop operation. Therefore use this function only if Data in Pair is
|
||||
// of type DoublePtr, derived (first) from DoublePtr, or has a DoublePtr as
|
||||
// its first element. Reshuffles the heap to maintain the invariant.
|
||||
// Time = O(log n).
|
||||
void Reshuffle(Pair* pair) {
|
||||
int index = pair - &heap_[0];
|
||||
Pair hole_pair = heap_[index];
|
||||
index = SiftDown(index, hole_pair);
|
||||
index = SiftUp(index, hole_pair);
|
||||
heap_[index] = hole_pair;
|
||||
}
|
||||
|
||||
private:
|
||||
// A hole in the heap exists at hole_index, and we want to fill it with the
|
||||
// given pair. SiftUp sifts the hole upward to the correct position and
|
||||
// returns the destination index without actually putting pair there.
|
||||
int SiftUp(int hole_index, const Pair& pair) {
|
||||
int parent;
|
||||
while (hole_index > 0 && pair < heap_[parent = ParentNode(hole_index)]) {
|
||||
heap_[hole_index] = heap_[parent];
|
||||
hole_index = parent;
|
||||
}
|
||||
return hole_index;
|
||||
}
|
||||
|
||||
// A hole in the heap exists at hole_index, and we want to fill it with the
|
||||
// given pair. SiftDown sifts the hole downward to the correct position and
|
||||
// returns the destination index without actually putting pair there.
|
||||
int SiftDown(int hole_index, const Pair& pair) {
|
||||
int heap_size = heap_.size();
|
||||
int child;
|
||||
while ((child = LeftChild(hole_index)) < heap_size) {
|
||||
if (child + 1 < heap_size && heap_[child + 1] < heap_[child])
|
||||
++child;
|
||||
if (heap_[child] < pair) {
|
||||
heap_[hole_index] = heap_[child];
|
||||
hole_index = child;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return hole_index;
|
||||
}
|
||||
|
||||
// Functions to navigate the tree. Unlike the original implementation, we
|
||||
// store the root at index 0.
|
||||
int ParentNode(int index) const {
|
||||
return (index + 1) / 2 - 1;
|
||||
}
|
||||
int LeftChild(int index) const {
|
||||
return index * 2 + 1;
|
||||
}
|
||||
|
||||
private:
|
||||
GenericVector<Pair> heap_;
|
||||
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_CCUTIL_GENERICHEAP_H_
|
@ -20,6 +20,7 @@
|
||||
#ifndef TESSERACT_CCUTIL_GENERICVECTOR_H_
|
||||
#define TESSERACT_CCUTIL_GENERICVECTOR_H_
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
@ -34,8 +35,13 @@
|
||||
template <typename T>
|
||||
class GenericVector {
|
||||
public:
|
||||
GenericVector() { this->init(kDefaultVectorSize); }
|
||||
explicit GenericVector(int size) { this->init(size); }
|
||||
GenericVector() {
|
||||
init(kDefaultVectorSize);
|
||||
}
|
||||
GenericVector(int size, T init_val) {
|
||||
init(size);
|
||||
init_to_size(size, init_val);
|
||||
}
|
||||
|
||||
// Copy
|
||||
GenericVector(const GenericVector& other) {
|
||||
@ -45,7 +51,7 @@ class GenericVector {
|
||||
GenericVector<T> &operator+=(const GenericVector& other);
|
||||
GenericVector<T> &operator=(const GenericVector& other);
|
||||
|
||||
virtual ~GenericVector();
|
||||
~GenericVector();
|
||||
|
||||
// Reserve some memory.
|
||||
void reserve(int size);
|
||||
@ -59,6 +65,9 @@ class GenericVector {
|
||||
int size() const {
|
||||
return size_used_;
|
||||
}
|
||||
int size_reserved() const {
|
||||
return size_reserved_;
|
||||
}
|
||||
|
||||
int length() const {
|
||||
return size_used_;
|
||||
@ -73,6 +82,8 @@ class GenericVector {
|
||||
T &get(int index) const;
|
||||
T &back() const;
|
||||
T &operator[](int index) const;
|
||||
// Returns the last object and removes it.
|
||||
T pop_back();
|
||||
|
||||
// Return the index of the T object.
|
||||
// This method NEEDS a compare_callback to be passed to
|
||||
@ -105,11 +116,11 @@ class GenericVector {
|
||||
|
||||
// Removes an element at the given index and
|
||||
// shifts the remaining elements to the left.
|
||||
virtual void remove(int index);
|
||||
void remove(int index);
|
||||
|
||||
// Truncates the array to the given size by removing the end.
|
||||
// If the current size is less, the array is not expanded.
|
||||
virtual void truncate(int size) {
|
||||
void truncate(int size) {
|
||||
if (size < size_used_)
|
||||
size_used_ = size;
|
||||
}
|
||||
@ -126,7 +137,7 @@ class GenericVector {
|
||||
// All the owned callbacks are also deleted.
|
||||
// If you don't want the callbacks to be deleted, before calling clear, set
|
||||
// the callback to NULL.
|
||||
virtual void clear();
|
||||
void clear();
|
||||
|
||||
// Delete objects pointed to by data_[i]
|
||||
void delete_data_pointers();
|
||||
@ -147,12 +158,12 @@ class GenericVector {
|
||||
bool read(FILE* f, TessResultCallback3<bool, FILE*, T*, bool>* cb, bool swap);
|
||||
// Writes a vector of simple types to the given file. Assumes that bitwise
|
||||
// read/write of T will work. Returns false in case of error.
|
||||
virtual bool Serialize(FILE* fp) const;
|
||||
bool Serialize(FILE* fp) const;
|
||||
// Reads a vector of simple types from the given file. Assumes that bitwise
|
||||
// read/write will work with ReverseN according to sizeof(T).
|
||||
// Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
virtual bool DeSerialize(bool swap, FILE* fp);
|
||||
bool DeSerialize(bool swap, FILE* fp);
|
||||
// Writes a vector of classes to the given file. Assumes the existence of
|
||||
// bool T::Serialize(FILE* fp) const that returns false in case of error.
|
||||
// Returns false in case of error.
|
||||
@ -262,7 +273,32 @@ class GenericVector {
|
||||
return result;
|
||||
}
|
||||
|
||||
// Returns the index of what would be the target_index_th item in the array
|
||||
// if the members were sorted, without actually sorting. Members are
|
||||
// shuffled around, but it takes O(n) time.
|
||||
// NOTE: uses operator< and operator== on the members.
|
||||
int choose_nth_item(int target_index) {
|
||||
// Make sure target_index is legal.
|
||||
if (target_index < 0)
|
||||
target_index = 0; // ensure legal
|
||||
else if (target_index >= size_used_)
|
||||
target_index = size_used_ - 1;
|
||||
unsigned int seed = 1;
|
||||
return choose_nth_item(target_index, 0, size_used_, &seed);
|
||||
}
|
||||
|
||||
// Swaps the elements with the given indices.
|
||||
void swap(int index1, int index2) {
|
||||
if (index1 != index2) {
|
||||
T tmp = data_[index1];
|
||||
data_[index1] = data_[index2];
|
||||
data_[index2] = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
// Internal recursive version of choose_nth_item.
|
||||
int choose_nth_item(int target_index, int start, int end, unsigned int* seed);
|
||||
|
||||
// Init the object, allocating size memory.
|
||||
void init(int size);
|
||||
@ -328,7 +364,7 @@ class PointerVector : public GenericVector<T*> {
|
||||
public:
|
||||
PointerVector() : GenericVector<T*>() { }
|
||||
explicit PointerVector(int size) : GenericVector<T*>(size) { }
|
||||
virtual ~PointerVector() {
|
||||
~PointerVector() {
|
||||
// Clear must be called here, even though it is called again by the base,
|
||||
// as the base will call the wrong clear.
|
||||
clear();
|
||||
@ -355,14 +391,14 @@ class PointerVector : public GenericVector<T*> {
|
||||
|
||||
// Removes an element at the given index and
|
||||
// shifts the remaining elements to the left.
|
||||
virtual void remove(int index) {
|
||||
void remove(int index) {
|
||||
delete GenericVector<T*>::data_[index];
|
||||
GenericVector<T*>::remove(index);
|
||||
}
|
||||
|
||||
// Truncates the array to the given size by removing the end.
|
||||
// If the current size is less, the array is not expanded.
|
||||
virtual void truncate(int size) {
|
||||
void truncate(int size) {
|
||||
for (int i = size; i < GenericVector<T*>::size_used_; ++i)
|
||||
delete GenericVector<T*>::data_[i];
|
||||
GenericVector<T*>::truncate(size);
|
||||
@ -394,14 +430,14 @@ class PointerVector : public GenericVector<T*> {
|
||||
// All the owned callbacks are also deleted.
|
||||
// If you don't want the callbacks to be deleted, before calling clear, set
|
||||
// the callback to NULL.
|
||||
virtual void clear() {
|
||||
void clear() {
|
||||
GenericVector<T*>::delete_data_pointers();
|
||||
GenericVector<T*>::clear();
|
||||
}
|
||||
|
||||
// Writes a vector of simple types to the given file. Assumes that bitwise
|
||||
// read/write of T will work. Returns false in case of error.
|
||||
virtual bool Serialize(FILE* fp) const {
|
||||
bool Serialize(FILE* fp) const {
|
||||
inT32 used = GenericVector<T*>::size_used_;
|
||||
if (fwrite(&used, sizeof(used), 1, fp) != 1) return false;
|
||||
for (int i = 0; i < used; ++i) {
|
||||
@ -416,7 +452,7 @@ class PointerVector : public GenericVector<T*> {
|
||||
// Also needs T::T(), as new T is used in this function.
|
||||
// Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
virtual bool DeSerialize(bool swap, FILE* fp) {
|
||||
bool DeSerialize(bool swap, FILE* fp) {
|
||||
inT32 reserved;
|
||||
if (fread(&reserved, sizeof(reserved), 1, fp) != 1) return false;
|
||||
if (swap) Reverse32(&reserved);
|
||||
@ -515,7 +551,8 @@ T &GenericVector<T>::get(int index) const {
|
||||
|
||||
template <typename T>
|
||||
T &GenericVector<T>::operator[](int index) const {
|
||||
return data_[index];
|
||||
assert(index >= 0 && index < size_used_);
|
||||
return data_[index];
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -523,6 +560,12 @@ T &GenericVector<T>::back() const {
|
||||
ASSERT_HOST(size_used_ > 0);
|
||||
return data_[size_used_ - 1];
|
||||
}
|
||||
// Returns the last object and removes it.
|
||||
template <typename T>
|
||||
T GenericVector<T>::pop_back() {
|
||||
ASSERT_HOST(size_used_ > 0);
|
||||
return data_[--size_used_];
|
||||
}
|
||||
|
||||
// Return the object from an index.
|
||||
template <typename T>
|
||||
@ -536,7 +579,7 @@ void GenericVector<T>::set(T t, int index) {
|
||||
// at the specified index.
|
||||
template <typename T>
|
||||
void GenericVector<T>::insert(T t, int index) {
|
||||
ASSERT_HOST(index >= 0 && index < size_used_);
|
||||
ASSERT_HOST(index >= 0 && index <= size_used_);
|
||||
if (size_reserved_ == size_used_)
|
||||
double_the_size();
|
||||
for (int i = size_used_; i > index; --i) {
|
||||
@ -642,7 +685,8 @@ void GenericVector<T>::set_clear_callback(TessCallback1<T>* cb) {
|
||||
// Add a callback to be called to delete the elements when the array took
|
||||
// their ownership.
|
||||
template <typename T>
|
||||
void GenericVector<T>::set_compare_callback(TessResultCallback2<bool, T const &, T const &>* cb) {
|
||||
void GenericVector<T>::set_compare_callback(
|
||||
TessResultCallback2<bool, T const &, T const &>* cb) {
|
||||
compare_cb_ = cb;
|
||||
}
|
||||
|
||||
@ -804,4 +848,61 @@ void GenericVector<T>::sort() {
|
||||
sort(&tesseract::sort_cmp<T>);
|
||||
}
|
||||
|
||||
// Internal recursive version of choose_nth_item.
|
||||
// The algorithm used comes from "Algorithms" by Sedgewick:
|
||||
// http://books.google.com/books/about/Algorithms.html?id=idUdqdDXqnAC
|
||||
// The principle is to choose a random pivot, and move everything less than
|
||||
// the pivot to its left, and everything greater than the pivot to the end
|
||||
// of the array, then recurse on the part that contains the desired index, or
|
||||
// just return the answer if it is in the equal section in the middle.
|
||||
// The random pivot guarantees average linear time for the same reason that
|
||||
// n times vector::push_back takes linear time on average.
|
||||
// target_index, start and and end are all indices into the full array.
|
||||
// Seed is a seed for rand_r for thread safety purposes. Its value is
|
||||
// unimportant as the random numbers do not affect the result except
|
||||
// between equal answers.
|
||||
template <typename T>
|
||||
int GenericVector<T>::choose_nth_item(int target_index, int start, int end,
|
||||
unsigned int* seed) {
|
||||
// Number of elements to process.
|
||||
int num_elements = end - start;
|
||||
// Trivial cases.
|
||||
if (num_elements <= 1)
|
||||
return start;
|
||||
if (num_elements == 2) {
|
||||
if (data_[start] < data_[start + 1]) {
|
||||
return target_index > start ? start + 1 : start;
|
||||
} else {
|
||||
return target_index > start ? start : start + 1;
|
||||
}
|
||||
}
|
||||
// Place the pivot at start.
|
||||
int pivot = rand_r(seed) % num_elements + start;
|
||||
swap(pivot, start);
|
||||
// The invariant condition here is that items [start, next_lesser) are less
|
||||
// than the pivot (which is at index next_lesser) and items
|
||||
// [prev_greater, end) are greater than the pivot, with items
|
||||
// [next_lesser, prev_greater) being equal to the pivot.
|
||||
int next_lesser = start;
|
||||
int prev_greater = end;
|
||||
for (int next_sample = start + 1; next_sample < prev_greater;) {
|
||||
if (data_[next_sample] < data_[next_lesser]) {
|
||||
swap(next_lesser++, next_sample++);
|
||||
} else if (data_[next_sample] == data_[next_lesser]) {
|
||||
++next_sample;
|
||||
} else {
|
||||
swap(--prev_greater, next_sample);
|
||||
}
|
||||
}
|
||||
// Now the invariant is set up, we recurse on just the section that contains
|
||||
// the desired index.
|
||||
if (target_index < next_lesser)
|
||||
return choose_nth_item(target_index, start, next_lesser, seed);
|
||||
else if (target_index < prev_greater)
|
||||
return next_lesser; // In equal bracket.
|
||||
else
|
||||
return choose_nth_item(target_index, prev_greater, end, seed);
|
||||
}
|
||||
|
||||
|
||||
#endif // TESSERACT_CCUTIL_GENERICVECTOR_H_
|
||||
|
@ -18,84 +18,56 @@
|
||||
**********************************************************************/
|
||||
|
||||
#include <signal.h>
|
||||
#ifdef __linux__
|
||||
#include <sys/syscall.h> // For SYS_gettid.
|
||||
#include <unistd.h> // For syscall itself.
|
||||
#endif
|
||||
#include "allheaders.h"
|
||||
#include "errcode.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
/*inT16 global_loc_code = LOC_INIT;//location code
|
||||
inT16 global_subloc_code = SUBLOC_NORM;
|
||||
//pass2 subloc code
|
||||
inT16 global_subsubloc_code = SUBSUBLOC_OTHER;
|
||||
//location code
|
||||
inT16 global_abort_code = NO_ABORT_CODE;
|
||||
//Prog abort code
|
||||
*/
|
||||
void signal_exit( //
|
||||
int signal_code //Signal which
|
||||
) {
|
||||
/*int exit_status;
|
||||
// Size of thread-id array of pixes to keep in case of crash.
|
||||
const int kMaxNumThreadPixes = 32768;
|
||||
|
||||
if ((global_loc_code == LOC_PASS2) || (global_loc_code == LOC_FUZZY_SPACE))
|
||||
global_loc_code += global_subloc_code + global_subsubloc_code;
|
||||
Pix* global_crash_pixes[kMaxNumThreadPixes];
|
||||
|
||||
if (signal_code < 0) {
|
||||
exit_status = global_loc_code * 8 + global_abort_code * 2 + 1;
|
||||
tprintf ("Signal_exit %d ABORT. LocCode: %d AbortCode: %d\n",
|
||||
exit_status, global_loc_code, global_abort_code);
|
||||
void SavePixForCrash(int resolution, Pix* pix) {
|
||||
#ifdef __linux__
|
||||
int thread_id = syscall(SYS_gettid) % kMaxNumThreadPixes;
|
||||
pixDestroy(&global_crash_pixes[thread_id]);
|
||||
if (pix != NULL) {
|
||||
Pix* clone = pixClone(pix);
|
||||
pixSetXRes(clone, resolution);
|
||||
pixSetYRes(clone, resolution);
|
||||
global_crash_pixes[thread_id] = clone;
|
||||
}
|
||||
else {
|
||||
exit_status = global_loc_code * 8 + signal_code * 2;
|
||||
tprintf ("Signal_exit %d SIGNAL ABORT. LocCode: %d SignalCode: %d\n",
|
||||
exit_status, global_loc_code, signal_code);
|
||||
}
|
||||
|
||||
exit(exit_status);*/
|
||||
exit(signal_code);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/*************************************************************************
|
||||
* err_exit()
|
||||
* All program exits should go through this point. It allows a meaningful status
|
||||
* code to be generated for the real exit() call. The status code is made up
|
||||
* as follows:
|
||||
* Bit 0 : 1 = Program Abort 0 = System Abort
|
||||
* Bits 1,2 : IF bit 0 = 1 THEN ERRCODE::abort_code
|
||||
* ELSE 0 = Bus Err or Seg Vi
|
||||
* 1 = Floating point exception
|
||||
* 2 = TimeOut (Signal 15 from command timer)
|
||||
* 3 = Any other signal
|
||||
* Bits 3..7 : Location code NEVER 0 !
|
||||
*************************************************************************/
|
||||
|
||||
//extern "C" {
|
||||
// CALL ONLY from a signal handler! Writes a crash image to stderr.
// On Linux, dumps the per-thread crash image (saved by SavePixForCrash)
// as a PNG between cut markers, then raises SIGILL so the default handler
// produces a useful stack trace. Elsewhere it simply aborts.
void signal_exit(int signal_code) {
  tprintf("Received signal %d!\n", signal_code);
#ifdef __linux__
  // SYS_gettid yields the kernel thread id; the modulo maps it into the
  // fixed-size per-thread image table.
  int thread_id = syscall(SYS_gettid) % kMaxNumThreadPixes;
  if (global_crash_pixes[thread_id] != NULL) {
    fprintf(stderr, "Crash caused by image with resolution %d\n",
            pixGetYRes(global_crash_pixes[thread_id]));
    // The PNG bytes are bracketed by markers so they can be cut out of a
    // captured stderr log and decoded to reproduce the crash.
    fprintf(stderr, "<Cut here>\n");
    pixWriteStreamPng(stderr, global_crash_pixes[thread_id], 0.0);
    fprintf(stderr, "\n<End cut>\n");
  }
  // Raise an uncaught signal, so as to get a useful stack trace.
  raise(SIGILL);
#else
  abort();
#endif
}
|
||||
|
||||
// Reports a fatal error and terminates the process.
// signal_exit(-1) never returns (it raises SIGILL or aborts); the
// ASSERT_HOST below is a belt-and-braces guarantee of termination.
void err_exit() {
  signal_exit (-1);
  ASSERT_HOST("Fatal error encountered!" == NULL);
}
|
||||
|
||||
|
||||
// Handler installed for fatal signals. Logs the signal through the ERRCODE
// mechanism, then terminates via signal_exit(). The switch cases have no
// break statements because signal_exit() never returns; the numeric
// arguments are legacy exit-status codes (see the err_exit() header
// comment block for the historical encoding).
void signal_termination_handler(int sig) {
  const ERRCODE SIGNAL_HANDLER_ERR = "Signal_termination_handler called";
  SIGNAL_HANDLER_ERR.error("signal_termination_handler", ABORT, "Code %d", sig);
  switch (sig) {
    case SIGABRT:
      signal_exit (-1);              //use abort code
    // case SIGBUS:
    case SIGSEGV:
      signal_exit (0);               // bus error / segmentation violation
    case SIGFPE:
      signal_exit (1);               //floating point
    case SIGTERM:
      signal_exit (2);               //timeout by cmdtimer
    default:
      signal_exit (3);               //Anything else
  }
}
|
||||
|
||||
|
||||
//}; //end extern "C"
|
||||
|
||||
|
||||
void set_global_loc_code(int loc_code) {
|
||||
// global_loc_code = loc_code;
|
||||
|
||||
|
@ -22,14 +22,14 @@
|
||||
|
||||
#include "host.h"
|
||||
|
||||
void signal_exit( //
|
||||
int signal_code //Signal which
|
||||
);
|
||||
//extern "C" {
|
||||
// Saves a clone of the given pix, and notes its resolution in thread-specific
|
||||
// data, so that the image can be written prior to a crash.
|
||||
struct Pix;
|
||||
void SavePixForCrash(int resolution, Pix* pix);
|
||||
|
||||
void signal_exit(int signal_code);
|
||||
|
||||
void err_exit();
|
||||
//The real signal
|
||||
void signal_termination_handler(int sig);
|
||||
//};
|
||||
|
||||
void set_global_loc_code(int loc_code);
|
||||
|
||||
|
189
ccutil/kdpair.h
Normal file
189
ccutil/kdpair.h
Normal file
@ -0,0 +1,189 @@
|
||||
// Copyright 2012 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: kdpair.h
|
||||
// Description: Template pair class like STL pair but geared towards
|
||||
// the Key+Data design pattern in which some data needs
|
||||
// to be sorted or kept in a heap sorted on some separate key.
|
||||
// Author: Ray Smith.
|
||||
// Created: Thu Mar 15 14:48:05 PDT 2012
|
||||
//
|
||||
// (C) Copyright 2012, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCUTIL_KDPAIR_H_
|
||||
#define TESSERACT_CCUTIL_KDPAIR_H_
|
||||
|
||||
#include "genericvector.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// A useful base struct to facilitate the common operation of sorting a vector
// of simple or smart-pointer data using a separate key. Similar to STL pair.
template <typename K, typename D>
struct KDPair {
  KDPair() {}
  KDPair(K k, D d) : data(d), key(k) {}

  // Equality considers only the key; the payload is ignored.
  int operator==(const KDPair<K, D>& other) const {
    return key == other.key;
  }

  // WARNING! Keep data as the first element! KDPairInc and KDPairDec depend
  // on the order of these elements so they can downcast pointers appropriately
  // for use by GenericHeap::Reshuffle.
  D data;
  K key;
};
// Specialization of KDPair to provide operator< for sorting in increasing order
// and recasting of data pointers for use with DoublePtr.
template <typename K, typename D>
struct KDPairInc : public KDPair<K, D> {
  KDPairInc() {}
  KDPairInc(K k, D d) : KDPair<K, D>(k, d) {}
  // Sorting a container of KDPairInc yields ascending key order.
  int operator<(const KDPairInc<K, D>& other) const {
    return this->key < other.key;
  }
  // Reinterprets a pointer to the data member (guaranteed to be the first
  // member, see KDPair) as a pointer to the enclosing pair.
  static KDPairInc* RecastDataPointer(D* data_ptr) {
    return reinterpret_cast<KDPairInc*>(data_ptr);
  }
};
// Specialization of KDPair to provide operator< for sorting in decreasing order
// and recasting of data pointers for use with DoublePtr.
template <typename K, typename D>
struct KDPairDec : public KDPair<K, D> {
  KDPairDec() {}
  KDPairDec(K k, D d) : KDPair<K, D>(k, d) {}
  // Sorting a container of KDPairDec yields descending key order, achieved
  // by comparing the keys with operator> inside operator<.
  int operator<(const KDPairDec<K, D>& other) const {
    return this->key > other.key;
  }
  // Reinterprets a pointer to the data member (guaranteed to be the first
  // member, see KDPair) as a pointer to the enclosing pair.
  static KDPairDec* RecastDataPointer(D* data_ptr) {
    return reinterpret_cast<KDPairDec*>(data_ptr);
  }
};
|
||||
|
||||
// A useful base class to facilitate the common operation of sorting a vector
// of owned pointer data using a separate key. This class owns its data pointer,
// deleting it when it has finished with it, and providing copy constructor and
// operator= that have move semantics so that the data does not get copied and
// only a single instance of KDPtrPair holds a specific data pointer.
template <typename Key, typename Data>
class KDPtrPair {
 public:
  KDPtrPair() : data_(NULL) {}
  // Takes ownership of d.
  KDPtrPair(Key k, Data* d) : data_(d), key_(k) {}
  // Copy constructor steals the pointer from src and NULLs it in src, thereby
  // moving the (single) ownership of the data.
  KDPtrPair(KDPtrPair& src) : data_(src.data_), key_(src.key_) {
    src.data_ = NULL;
  }
  // Destructor deletes data, assuming it is the sole owner.
  ~KDPtrPair() {
    delete this->data_;
    this->data_ = NULL;
  }
  // Operator= steals the pointer from src and NULLs it in src, thereby
  // moving the (single) ownership of the data.
  // Self-assignment is a no-op: without the guard, deleting data_ first
  // would leave both sides holding a dangling pointer.
  void operator=(KDPtrPair& src) {
    if (this != &src) {
      delete this->data_;
      this->data_ = src.data_;
      src.data_ = NULL;
      this->key_ = src.key_;
    }
  }

  // Pairs compare equal when their keys do; the data is not compared.
  int operator==(const KDPtrPair<Key, Data>& other) const {
    return key_ == other.key_;
  }

  // Accessors.
  const Key& key() const {
    return key_;
  }
  void set_key(const Key& new_key) {
    key_ = new_key;
  }
  const Data* data() const {
    return data_;
  }
  // Sets the data pointer, taking ownership of the data.
  // Guarded so that re-setting the currently-owned pointer does not delete
  // it before storing it back (use-after-free).
  void set_data(Data* new_data) {
    if (new_data != data_)
      delete data_;
    data_ = new_data;
  }
  // Relinquishes ownership of the data pointer (setting it to NULL).
  // The caller becomes responsible for deleting the returned pointer.
  Data* extract_data() {
    Data* result = data_;
    data_ = NULL;
    return result;
  }

 private:
  // Data members are private to keep deletion of data_ encapsulated.
  Data* data_;
  Key key_;
};
|
||||
// Specialization of KDPtrPair to provide operator< for sorting in increasing
|
||||
// order.
|
||||
template <typename Key, typename Data>
|
||||
struct KDPtrPairInc : public KDPtrPair<Key, Data> {
|
||||
// Since we are doing non-standard stuff we have to duplicate *all* the
|
||||
// constructors and operator=.
|
||||
KDPtrPairInc() : KDPtrPair<Key, Data>() {}
|
||||
KDPtrPairInc(Key k, Data* d) : KDPtrPair<Key, Data>(k, d) {}
|
||||
KDPtrPairInc(KDPtrPairInc& src) : KDPtrPair<Key, Data>(src) {}
|
||||
void operator=(KDPtrPairInc& src) {
|
||||
KDPtrPair<Key, Data>::operator=(src);
|
||||
}
|
||||
// Operator< facilitates sorting in increasing order.
|
||||
int operator<(const KDPtrPairInc<Key, Data>& other) const {
|
||||
return this->key() < other.key();
|
||||
}
|
||||
};
|
||||
// Specialization of KDPtrPair to provide operator< for sorting in decreasing
|
||||
// order.
|
||||
template <typename Key, typename Data>
|
||||
struct KDPtrPairDec : public KDPtrPair<Key, Data> {
|
||||
// Since we are doing non-standard stuff we have to duplicate *all* the
|
||||
// constructors and operator=.
|
||||
KDPtrPairDec() : KDPtrPair<Key, Data>() {}
|
||||
KDPtrPairDec(Key k, Data* d) : KDPtrPair<Key, Data>(k, d) {}
|
||||
KDPtrPairDec(KDPtrPairDec& src) : KDPtrPair<Key, Data>(src) {}
|
||||
void operator=(KDPtrPairDec& src) {
|
||||
KDPtrPair<Key, Data>::operator=(src);
|
||||
}
|
||||
// Operator< facilitates sorting in decreasing order by using operator> on
|
||||
// the key values.
|
||||
int operator<(const KDPtrPairDec<Key, Data>& other) const {
|
||||
return this->key() > other.key();
|
||||
}
|
||||
};
|
||||
|
||||
// Specialization for a pair of ints (key and data), sorted on the key in
// increasing order.
typedef KDPairInc<int, int> IntKDPair;

// Vector of IntKDPair.
class KDVector : public GenericVector<IntKDPair> {
  // TODO(rays) Add some code to manipulate a KDVector. For now there
  // is nothing and this class is effectively a specialization typedef.
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_CCUTIL_KDPAIR_H_
|
125
ccutil/object_cache.h
Normal file
125
ccutil/object_cache.h
Normal file
@ -0,0 +1,125 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: object_cache.h
|
||||
// Description: A string indexed object cache.
|
||||
// Author: David Eger
|
||||
// Created: Fri Jan 27 12:08:00 PST 2012
|
||||
//
|
||||
// (C) Copyright 2012, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCUTIL_OBJECT_CACHE_H_
|
||||
#define TESSERACT_CCUTIL_OBJECT_CACHE_H_
|
||||
|
||||
#include "ccutil.h"
|
||||
#include "errcode.h"
|
||||
#include "genericvector.h"
|
||||
#include "tesscallback.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// A simple object cache which maps a string to an object of type T.
|
||||
// Usually, these are expensive objects that are loaded from disk.
|
||||
// Reference counting is performed, so every Get() needs to be followed later
|
||||
// by a Free(). Actual deletion is accomplished by DeleteUnusedObjects().
|
||||
template<typename T>
|
||||
class ObjectCache {
|
||||
public:
|
||||
ObjectCache() {}
|
||||
~ObjectCache() {
|
||||
mu_.Lock();
|
||||
for (int i = 0; i < cache_.size(); i++) {
|
||||
if (cache_[i].count > 0) {
|
||||
tprintf("ObjectCache(%p)::~ObjectCache(): WARNING! LEAK! object %p "
|
||||
"still has count %d (id %s)\n",
|
||||
this, cache_[i].object, cache_[i].count,
|
||||
cache_[i].id.string());
|
||||
} else {
|
||||
delete cache_[i].object;
|
||||
cache_[i].object = NULL;
|
||||
}
|
||||
}
|
||||
mu_.Unlock();
|
||||
}
|
||||
|
||||
// Return a pointer to the object identified by id.
|
||||
// If we haven't yet loaded the object, use loader to load it.
|
||||
// If loader fails to load it, record a NULL entry in the cache
|
||||
// and return NULL -- further attempts to load will fail (even
|
||||
// with a different loader) until DeleteUnusedObjects() is called.
|
||||
// We delete the given loader.
|
||||
T *Get(STRING id,
|
||||
TessResultCallback<T *> *loader) {
|
||||
T *retval = NULL;
|
||||
mu_.Lock();
|
||||
for (int i = 0; i < cache_.size(); i++) {
|
||||
if (id == cache_[i].id) {
|
||||
retval = cache_[i].object;
|
||||
if (cache_[i].object != NULL) {
|
||||
cache_[i].count++;
|
||||
}
|
||||
mu_.Unlock();
|
||||
delete loader;
|
||||
return retval;
|
||||
}
|
||||
}
|
||||
cache_.push_back(ReferenceCount());
|
||||
ReferenceCount &rc = cache_.back();
|
||||
rc.id = id;
|
||||
retval = rc.object = loader->Run();
|
||||
rc.count = (retval != NULL) ? 1 : 0;
|
||||
mu_.Unlock();
|
||||
return retval;
|
||||
}
|
||||
|
||||
// Decrement the count for t.
|
||||
// Return whether we knew about the given pointer.
|
||||
bool Free(T *t) {
|
||||
if (t == NULL) return false;
|
||||
mu_.Lock();
|
||||
for (int i = 0; i < cache_.size(); i++) {
|
||||
if (cache_[i].object == t) {
|
||||
--cache_[i].count;
|
||||
mu_.Unlock();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
mu_.Unlock();
|
||||
return false;
|
||||
}
|
||||
|
||||
void DeleteUnusedObjects() {
|
||||
mu_.Lock();
|
||||
for (int i = cache_.size() - 1; i >= 0; i--) {
|
||||
if (cache_[i].count <= 0) {
|
||||
delete cache_[i].object;
|
||||
cache_.remove(i);
|
||||
}
|
||||
}
|
||||
mu_.Unlock();
|
||||
}
|
||||
|
||||
private:
|
||||
struct ReferenceCount {
|
||||
STRING id; // A unique ID to identify the object (think path on disk)
|
||||
T *object; // A copy of the object in memory. Can be delete'd.
|
||||
int count; // A count of the number of active users of this object.
|
||||
};
|
||||
|
||||
CCUtilMutex mu_;
|
||||
GenericVector<ReferenceCount> cache_;
|
||||
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
|
||||
#endif // TESSERACT_CCUTIL_OBJECT_CACHE_H_
|
@ -207,4 +207,25 @@ void ParamUtils::PrintParams(FILE *fp, const ParamsVectors *member_params) {
|
||||
}
|
||||
}
|
||||
|
||||
// Resets all parameters back to default values;
|
||||
void ParamUtils::ResetToDefaults(ParamsVectors* member_params) {
|
||||
int v, i;
|
||||
int num_iterations = (member_params == NULL) ? 1 : 2;
|
||||
for (v = 0; v < num_iterations; ++v) {
|
||||
ParamsVectors *vec = (v == 0) ? GlobalParams() : member_params;
|
||||
for (i = 0; i < vec->int_params.size(); ++i) {
|
||||
vec->int_params[i]->ResetToDefault();
|
||||
}
|
||||
for (i = 0; i < vec->bool_params.size(); ++i) {
|
||||
vec->bool_params[i]->ResetToDefault();
|
||||
}
|
||||
for (int i = 0; i < vec->string_params.size(); ++i) {
|
||||
vec->string_params[i]->ResetToDefault();
|
||||
}
|
||||
for (int i = 0; i < vec->double_params.size(); ++i) {
|
||||
vec->double_params[i]->ResetToDefault();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
||||
|
@ -104,6 +104,9 @@ class ParamUtils {
|
||||
|
||||
// Print parameters to the given file.
|
||||
static void PrintParams(FILE *fp, const ParamsVectors *member_params);
|
||||
|
||||
// Resets all parameters back to default values;
|
||||
static void ResetToDefaults(ParamsVectors* member_params);
|
||||
};
|
||||
|
||||
// Definition of various parameter types.
|
||||
@ -142,15 +145,20 @@ class IntParam : public Param {
|
||||
IntParam(inT32 value, const char *name, const char *comment, bool init,
|
||||
ParamsVectors *vec) : Param(name, comment, init) {
|
||||
value_ = value;
|
||||
default_ = value;
|
||||
params_vec_ = &(vec->int_params);
|
||||
vec->int_params.push_back(this);
|
||||
}
|
||||
~IntParam() { ParamUtils::RemoveParam<IntParam>(this, params_vec_); }
|
||||
operator inT32() const { return value_; }
|
||||
void set_value(inT32 value) { value_ = value; }
|
||||
void ResetToDefault() {
|
||||
value_ = default_;
|
||||
}
|
||||
|
||||
private:
|
||||
inT32 value_;
|
||||
inT32 default_;
|
||||
// Pointer to the vector that contains this param (not owned by this class).
|
||||
GenericVector<IntParam *> *params_vec_;
|
||||
};
|
||||
@ -160,15 +168,20 @@ class BoolParam : public Param {
|
||||
BoolParam(bool value, const char *name, const char *comment, bool init,
|
||||
ParamsVectors *vec) : Param(name, comment, init) {
|
||||
value_ = value;
|
||||
default_ = value;
|
||||
params_vec_ = &(vec->bool_params);
|
||||
vec->bool_params.push_back(this);
|
||||
}
|
||||
~BoolParam() { ParamUtils::RemoveParam<BoolParam>(this, params_vec_); }
|
||||
operator BOOL8() const { return value_; }
|
||||
void set_value(BOOL8 value) { value_ = value; }
|
||||
void ResetToDefault() {
|
||||
value_ = default_;
|
||||
}
|
||||
|
||||
private:
|
||||
BOOL8 value_;
|
||||
BOOL8 default_;
|
||||
// Pointer to the vector that contains this param (not owned by this class).
|
||||
GenericVector<BoolParam *> *params_vec_;
|
||||
};
|
||||
@ -179,17 +192,23 @@ class StringParam : public Param {
|
||||
const char *comment, bool init,
|
||||
ParamsVectors *vec) : Param(name, comment, init) {
|
||||
value_ = value;
|
||||
default_ = value;
|
||||
params_vec_ = &(vec->string_params);
|
||||
vec->string_params.push_back(this);
|
||||
}
|
||||
~StringParam() { ParamUtils::RemoveParam<StringParam>(this, params_vec_); }
|
||||
operator STRING &() { return value_; }
|
||||
const char *string() const { return value_.string(); }
|
||||
const char *c_str() const { return value_.string(); }
|
||||
bool empty() { return value_.length() <= 0; }
|
||||
void set_value(const STRING &value) { value_ = value; }
|
||||
void ResetToDefault() {
|
||||
value_ = default_;
|
||||
}
|
||||
|
||||
private:
|
||||
STRING value_;
|
||||
STRING default_;
|
||||
// Pointer to the vector that contains this param (not owned by this class).
|
||||
GenericVector<StringParam *> *params_vec_;
|
||||
};
|
||||
@ -199,15 +218,20 @@ class DoubleParam : public Param {
|
||||
DoubleParam(double value, const char *name, const char *comment,
|
||||
bool init, ParamsVectors *vec) : Param(name, comment, init) {
|
||||
value_ = value;
|
||||
default_ = value;
|
||||
params_vec_ = &(vec->double_params);
|
||||
vec->double_params.push_back(this);
|
||||
}
|
||||
~DoubleParam() { ParamUtils::RemoveParam<DoubleParam>(this, params_vec_); }
|
||||
operator double() const { return value_; }
|
||||
void set_value(double value) { value_ = value; }
|
||||
void ResetToDefault() {
|
||||
value_ = default_;
|
||||
}
|
||||
|
||||
private:
|
||||
double value_;
|
||||
double default_;
|
||||
// Pointer to the vector that contains this param (not owned by this class).
|
||||
GenericVector<DoubleParam *> *params_vec_;
|
||||
};
|
||||
|
@ -20,16 +20,12 @@
|
||||
#ifndef TESSERACT_CCUTIL_PLATFORM_H__
|
||||
#define TESSERACT_CCUTIL_PLATFORM_H__
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#define DLLSYM
|
||||
#ifdef _WIN32
|
||||
#ifdef __GNUC__
|
||||
#define ultoa _ultoa
|
||||
#ifndef __MINGW32__
|
||||
typedef struct _BLOB {
|
||||
unsigned int cbSize;
|
||||
char *pBlobData;
|
||||
} BLOB, *LPBLOB;
|
||||
#endif /* __MINGW32__ */
|
||||
#endif /* __GNUC__ */
|
||||
#define SIGNED
|
||||
#define snprintf _snprintf
|
||||
@ -71,4 +67,12 @@ typedef struct _BLOB {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32) || defined(__CYGWIN__)
|
||||
#define _TESS_FILE_BASENAME_ \
|
||||
(strrchr(__FILE__, '\\') ? strrchr(__FILE__, '\\') + 1 : __FILE__)
|
||||
#else // Unices
|
||||
#define _TESS_FILE_BASENAME_ \
|
||||
(strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
|
||||
#endif
|
||||
|
||||
#endif // TESSERACT_CCUTIL_PLATFORM_H__
|
||||
|
@ -56,7 +56,9 @@ class SortHelper {
|
||||
}
|
||||
|
||||
// Constructor takes a hint of the array size, but it need not be accurate.
|
||||
explicit SortHelper(int sizehint) : counts_(sizehint) {}
|
||||
explicit SortHelper(int sizehint) {
|
||||
counts_.reserve(sizehint);
|
||||
}
|
||||
|
||||
// Add a value that may be a duplicate of an existing value.
|
||||
// Uses a linear search.
|
||||
|
@ -24,8 +24,11 @@
|
||||
|
||||
#include <assert.h>
|
||||
// Size of buffer needed to host the decimal representation of the maximum
|
||||
// possible length of an int (in 64 bits, being -<20 digits>.
|
||||
// possible length of an int (in 64 bits), being -<20 digits>.
|
||||
const int kMaxIntSize = 22;
|
||||
// Size of buffer needed to host the decimal representation of the maximum
|
||||
// possible length of a %.8g being -0.12345678e+999<nul> = 15.
|
||||
const int kMaxDoubleSize = 15;
|
||||
|
||||
/**********************************************************************
|
||||
* STRING_HEADER provides metadata about the allocated buffer,
|
||||
@ -163,6 +166,10 @@ const char* STRING::string() const {
|
||||
return GetCStr();
|
||||
}
|
||||
|
||||
// Identical to string(); provided so STRING can be used where callers
// expect the std::string-style c_str() spelling.
const char* STRING::c_str() const {
  return string();
}
|
||||
|
||||
/******
|
||||
* The STRING_IS_PROTECTED interface adds additional support to migrate
|
||||
* code that needs to modify the STRING in ways not otherwise supported
|
||||
@ -220,6 +227,8 @@ void STRING::erase_range(inT32 index, int len) {
|
||||
|
||||
#else
|
||||
void STRING::truncate_at(inT32 index) {
|
||||
ASSERT_HOST(index >= 0);
|
||||
FixHeader();
|
||||
char* this_cstr = ensure_cstr(index + 1);
|
||||
this_cstr[index] = '\0';
|
||||
GetHeader()->used_ = index + 1;
|
||||
@ -339,6 +348,16 @@ void STRING::add_str_int(const char* str, int number) {
|
||||
num_buffer[kMaxIntSize - 1] = '\0';
|
||||
*this += num_buffer;
|
||||
}
|
||||
// Appends the given string and double (as a %.8g) to this.
|
||||
void STRING::add_str_double(const char* str, double number) {
|
||||
if (str != NULL)
|
||||
*this += str;
|
||||
// Allow space for the maximum possible length of %8g.
|
||||
char num_buffer[kMaxDoubleSize];
|
||||
snprintf(num_buffer, kMaxDoubleSize - 1, "%.8g", number);
|
||||
num_buffer[kMaxDoubleSize - 1] = '\0';
|
||||
*this += num_buffer;
|
||||
}
|
||||
|
||||
STRING & STRING::operator=(const char* cstr) {
|
||||
STRING_HEADER* this_header = GetHeader();
|
||||
|
@ -55,6 +55,7 @@ class TESS_API STRING
|
||||
inT32 length() const;
|
||||
inT32 size() const { return length(); }
|
||||
const char *string() const;
|
||||
const char *c_str() const;
|
||||
|
||||
inline char* strdup() const {
|
||||
inT32 len = length() + 1;
|
||||
@ -94,8 +95,10 @@ class TESS_API STRING
|
||||
// be ambiguous, and ints usually need a string before or between them
|
||||
// anyway.
|
||||
void add_str_int(const char* str, int number);
|
||||
// Appends the given string and double (as a %.8g) to this.
|
||||
void add_str_double(const char* str, double number);
|
||||
|
||||
// ensure capcaity but keep pointer encapsulated
|
||||
// ensure capacity but keep pointer encapsulated
|
||||
inline void ensure(inT32 min_capacity) { ensure_cstr(min_capacity); }
|
||||
|
||||
private:
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -35,6 +35,7 @@ namespace tesseract {
|
||||
bool TessdataManager::Init(const char *data_file_name, int debug_level) {
|
||||
int i;
|
||||
debug_level_ = debug_level;
|
||||
data_file_name_ = data_file_name;
|
||||
data_file_ = fopen(data_file_name, "rb");
|
||||
if (data_file_ == NULL) {
|
||||
tprintf("Error opening data file %s\n", data_file_name);
|
||||
@ -244,7 +245,7 @@ bool TessdataManager::ExtractToFile(const char *filename) {
|
||||
|
||||
FILE *output_file = fopen(filename, "wb");
|
||||
if (output_file == NULL) {
|
||||
tprintf("Error openning %s\n", filename);
|
||||
tprintf("Error opening %s\n", filename);
|
||||
exit(1);
|
||||
}
|
||||
inT64 begin_offset = ftell(GetDataFilePtr());
|
||||
|
@ -21,7 +21,9 @@
|
||||
#define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "host.h"
|
||||
#include "strngs.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
static const char kTrainedDataSuffix[] = "traineddata";
|
||||
@ -44,7 +46,7 @@ static const char kCubeSystemDawgFileSuffix[] = "cube-word-dawg";
|
||||
static const char kShapeTableFileSuffix[] = "shapetable";
|
||||
static const char kBigramDawgFileSuffix[] = "bigram-dawg";
|
||||
static const char kUnambigDawgFileSuffix[] = "unambig-dawg";
|
||||
static const char kParamsTrainingModelFileSuffix[] = "params-training-model";
|
||||
static const char kParamsModelFileSuffix[] = "params-model";
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
@ -59,13 +61,13 @@ enum TessdataType {
|
||||
TESSDATA_SYSTEM_DAWG, // 7
|
||||
TESSDATA_NUMBER_DAWG, // 8
|
||||
TESSDATA_FREQ_DAWG, // 9
|
||||
TESSDATA_FIXED_LENGTH_DAWGS, // 10
|
||||
TESSDATA_FIXED_LENGTH_DAWGS, // 10 // deprecated
|
||||
TESSDATA_CUBE_UNICHARSET, // 11
|
||||
TESSDATA_CUBE_SYSTEM_DAWG, // 12
|
||||
TESSDATA_SHAPE_TABLE, // 13
|
||||
TESSDATA_BIGRAM_DAWG, // 14
|
||||
TESSDATA_UNAMBIG_DAWG, // 15
|
||||
TESSDATA_PARAMS_TRAINING_MODEL, // 16
|
||||
TESSDATA_PARAMS_MODEL, // 16
|
||||
|
||||
TESSDATA_NUM_ENTRIES
|
||||
};
|
||||
@ -85,13 +87,13 @@ static const char * const kTessdataFileSuffixes[] = {
|
||||
kSystemDawgFileSuffix, // 7
|
||||
kNumberDawgFileSuffix, // 8
|
||||
kFreqDawgFileSuffix, // 9
|
||||
kFixedLengthDawgsFileSuffix, // 10
|
||||
kFixedLengthDawgsFileSuffix, // 10 // deprecated
|
||||
kCubeUnicharsetFileSuffix, // 11
|
||||
kCubeSystemDawgFileSuffix, // 12
|
||||
kShapeTableFileSuffix, // 13
|
||||
kBigramDawgFileSuffix, // 14
|
||||
kUnambigDawgFileSuffix, // 15
|
||||
kParamsTrainingModelFileSuffix, // 16
|
||||
kParamsModelFileSuffix, // 16
|
||||
};
|
||||
|
||||
/**
|
||||
@ -109,13 +111,13 @@ static const bool kTessdataFileIsText[] = {
|
||||
false, // 7
|
||||
false, // 8
|
||||
false, // 9
|
||||
false, // 10
|
||||
false, // 10 // deprecated
|
||||
true, // 11
|
||||
false, // 12
|
||||
false, // 13
|
||||
false, // 14
|
||||
false, // 15
|
||||
false, // 16
|
||||
true, // 16
|
||||
};
|
||||
|
||||
/**
|
||||
@ -146,6 +148,9 @@ class TessdataManager {
|
||||
*/
|
||||
bool Init(const char *data_file_name, int debug_level);
|
||||
|
||||
// Return the name of the underlying data file.
|
||||
const STRING &GetDataFileName() const { return data_file_name_; }
|
||||
|
||||
/** Returns data file pointer. */
|
||||
inline FILE *GetDataFilePtr() const { return data_file_; }
|
||||
|
||||
@ -279,6 +284,7 @@ class TessdataManager {
|
||||
* when new tessdata types are introduced.
|
||||
*/
|
||||
inT32 actual_tessdata_num_entries_;
|
||||
STRING data_file_name_; // name of the data file.
|
||||
FILE *data_file_; ///< pointer to the data file.
|
||||
int debug_level_;
|
||||
// True if the bytes need swapping.
|
||||
|
@ -24,43 +24,46 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdarg.h>
|
||||
#include "strngs.h"
|
||||
#include "params.h"
|
||||
#include "tprintf.h"
|
||||
#include "ccutil.h"
|
||||
#include "params.h"
|
||||
#include "strngs.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
#define MAX_MSG_LEN 65536
|
||||
|
||||
#define EXTERN
|
||||
// Since tprintf is protected by a mutex, these parameters can rmain global.
|
||||
// Since tprintf is protected by a mutex, these parameters can remain global.
|
||||
DLLSYM STRING_VAR(debug_file, "", "File to send tprintf output to");
|
||||
|
||||
DLLSYM INT_VAR(FLAGS_v, 0, "Minimum logging level for tlog() output");
|
||||
|
||||
DLLSYM void
|
||||
tprintf( // Trace printf
|
||||
const char *format, ... // special message
|
||||
tprintf_internal( // Trace printf
|
||||
const int level, // Logging level
|
||||
const char *format, ... // Message
|
||||
) {
|
||||
if (FLAGS_v < level) return;
|
||||
tesseract::tprintfMutex.Lock();
|
||||
va_list args; //variable args
|
||||
static FILE *debugfp = NULL; //debug file
|
||||
//debug window
|
||||
inT32 offset = 0; //into message
|
||||
va_list args; // variable args
|
||||
static FILE *debugfp = NULL; // debug file
|
||||
// debug window
|
||||
inT32 offset = 0; // into message
|
||||
static char msg[MAX_MSG_LEN + 1];
|
||||
|
||||
va_start(args, format); //variable list
|
||||
va_start(args, format); // variable list
|
||||
// Format into msg
|
||||
#ifdef _WIN32
|
||||
//Format into msg
|
||||
offset += _vsnprintf (msg + offset, MAX_MSG_LEN - offset, format, args);
|
||||
offset += _vsnprintf(msg + offset, MAX_MSG_LEN - offset, format, args);
|
||||
if (strcmp(debug_file.string(), "/dev/null") == 0)
|
||||
debug_file.set_value("nul");
|
||||
#else
|
||||
//Format into msg
|
||||
offset += vsprintf (msg + offset, format, args);
|
||||
offset += vsnprintf(msg + offset, MAX_MSG_LEN - offset, format, args);
|
||||
#endif
|
||||
va_end(args);
|
||||
|
||||
if (debugfp == NULL && strlen (debug_file.string ()) > 0) {
|
||||
debugfp = fopen (debug_file.string (), "wb");
|
||||
} else if (debugfp != NULL && strlen (debug_file.string ()) == 0) {
|
||||
if (debugfp == NULL && strlen(debug_file.string()) > 0) {
|
||||
debugfp = fopen(debug_file.string(), "wb");
|
||||
} else if (debugfp != NULL && strlen(debug_file.string()) == 0) {
|
||||
fclose(debugfp);
|
||||
debugfp = NULL;
|
||||
}
|
||||
@ -70,46 +73,3 @@ const char *format, ... // special message
|
||||
fprintf(stderr, "%s", msg);
|
||||
tesseract::tprintfMutex.Unlock();
|
||||
}
|
||||
|
||||
|
||||
/*************************************************************************
|
||||
* pause_continue()
|
||||
* UI for a debugging pause - to see an intermediate state
|
||||
* Returns TRUE to continue as normal to the next pause in the current mode;
|
||||
* FALSE to quit the current pausing mode.
|
||||
*************************************************************************/
|
||||
|
||||
DLLSYM BOOL8
|
||||
//special message
|
||||
pause_continue (const char *format, ...
|
||||
) {
|
||||
va_list args; //variable args
|
||||
char msg[1000];
|
||||
STRING str = STRING ("DEBUG PAUSE:\n");
|
||||
|
||||
va_start(args, format); //variable list
|
||||
vsprintf(msg, format, args); //Format into msg
|
||||
va_end(args);
|
||||
|
||||
#ifdef GRAPHICS_DISABLED
|
||||
// No interaction allowed -> simply go on
|
||||
return true;
|
||||
#else
|
||||
|
||||
#ifdef __UNIX__
|
||||
printf ("%s\n", msg);
|
||||
printf ("Type \"c\" to cancel, anything else to continue: ");
|
||||
char c = getchar ();
|
||||
return (c != 'c');
|
||||
#endif
|
||||
|
||||
#ifdef _WIN32
|
||||
str +=
|
||||
STRING (msg) + STRING ("\nUse OK to continue, CANCEL to stop pausing");
|
||||
// return AfxMessageBox( str.string(), MB_OKCANCEL ) == IDOK;
|
||||
return::MessageBox (NULL, msg, "IMGAPP",
|
||||
MB_APPLMODAL | MB_OKCANCEL) == IDOK;
|
||||
#endif
|
||||
|
||||
#endif
|
||||
}
|
||||
|
@ -17,19 +17,29 @@
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef TPRINTF_H
|
||||
#define TPRINTF_H
|
||||
#ifndef TESSERACT_CCUTIL_TPRINTF_H
|
||||
#define TESSERACT_CCUTIL_TPRINTF_H
|
||||
|
||||
#include "params.h"
|
||||
#include "params.h"
|
||||
|
||||
extern DLLSYM STRING_VAR_H (debug_file, "", "File to send tprintf output to");
|
||||
extern DLLSYM BOOL_VAR_H (debug_window_on, TRUE,
|
||||
"Send tprintf to window unless file set");
|
||||
extern DLLSYM STRING_VAR_H(debug_file, "",
|
||||
"File to send tprintf output to");
|
||||
extern DLLSYM BOOL_VAR_H(debug_window_on, TRUE,
|
||||
"Send tprintf to window unless file set");
|
||||
|
||||
extern TESS_API void tprintf( // Trace printf
|
||||
const char *format, ... // special message
|
||||
);
|
||||
// special message
|
||||
DLLSYM BOOL8 pause_continue (const char *format, ...
|
||||
);
|
||||
#endif
|
||||
// Main logging function.
|
||||
#define tprintf(args...) tprintf_internal(0, args)
|
||||
|
||||
// Variant guarded by the numeric logging level parameter FLAGS_v (default 0).
|
||||
// Code using ParseCommandLineFlags() can control its value using the --v
|
||||
// commandline argument. Otherwise it must be specified in a config file like
|
||||
// other params.
|
||||
#define tlog(level, args...) tprintf_internal(level, args)
|
||||
|
||||
#define TLOG_IS_ON(level) (FLAGS_v >= level)
|
||||
|
||||
extern TESS_API void tprintf_internal( // Trace printf
|
||||
const int level, // Logging level
|
||||
const char *format, ...); // Message
|
||||
|
||||
#endif // define TESSERACT_CCUTIL_TPRINTF_H
|
||||
|
21370
ccutil/universalambigs.cpp
Normal file
21370
ccutil/universalambigs.cpp
Normal file
File diff suppressed because it is too large
Load Diff
26
ccutil/universalambigs.h
Normal file
26
ccutil/universalambigs.h
Normal file
@ -0,0 +1,26 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: universalambigs.h
|
||||
// Description: Data for a universal ambigs file that is useful for
|
||||
// any language.
|
||||
// Author: Ray Smith
|
||||
// Created: Mon Mar 18 11:26:00 PDT 2013
|
||||
//
|
||||
// (C) Copyright 2013, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
extern const char kUniversalAmbigsFile[];
|
||||
extern const int ksizeofUniversalAmbigsFile;
|
||||
|
||||
} // namespace tesseract
|
10
configure.ac
10
configure.ac
@ -7,7 +7,7 @@
|
||||
# ----------------------------------------
|
||||
|
||||
AC_PREREQ(2.50)
|
||||
AC_INIT([tesseract], [3.02.03], [http://code.google.com/p/tesseract-ocr/issues/list])
|
||||
AC_INIT([tesseract], [3.03], [http://code.google.com/p/tesseract-ocr/issues/list])
|
||||
AC_CONFIG_MACRO_DIR([m4])
|
||||
AC_REVISION($Id: configure.ac,v 1.4 2007/02/02 22:38:17 theraysmith Exp $)
|
||||
AC_CONFIG_AUX_DIR(config)
|
||||
@ -18,7 +18,7 @@ AC_PREFIX_DEFAULT(/usr/local)
|
||||
# documentation.
|
||||
# TODO(luc) Generate good documentation using doxygen or equivalent
|
||||
PACKAGE_YEAR=2013
|
||||
PACKAGE_DATE="07/03"
|
||||
PACKAGE_DATE="08/13"
|
||||
|
||||
AC_DEFINE_UNQUOTED(PACKAGE_NAME,["${PACKAGE_NAME}"],[Name of package])
|
||||
AC_DEFINE_UNQUOTED(PACKAGE_VERSION,["${PACKAGE_VERSION}"],[Version number])
|
||||
@ -34,8 +34,8 @@ GENERIC_LIBRARY_NAME=tesseract
|
||||
|
||||
# Release versioning
|
||||
GENERIC_MAJOR_VERSION=3
|
||||
GENERIC_MINOR_VERSION=2
|
||||
GENERIC_MICRO_VERSION=3
|
||||
GENERIC_MINOR_VERSION=3
|
||||
GENERIC_MICRO_VERSION=0
|
||||
|
||||
# API version (often = GENERIC_MAJOR_VERSION.GENERIC_MINOR_VERSION)
|
||||
GENERIC_API_VERSION=$GENERIC_MAJOR_VERSION.$GENERIC_MINOR_VERSION
|
||||
@ -248,6 +248,7 @@ AC_HEADER_TIME
|
||||
AC_HEADER_SYS_WAIT
|
||||
AC_CHECK_HEADERS(sys/ipc.h sys/shm.h)
|
||||
AC_CHECK_HEADERS(limits.h malloc.h)
|
||||
AC_CHECK_HEADERS(allheaders.h)
|
||||
# Enable use of system-defined bool type if available:
|
||||
AC_HEADER_STDBOOL
|
||||
|
||||
@ -261,6 +262,7 @@ AC_SYS_LARGEFILE
|
||||
# ----------------------------------------
|
||||
|
||||
AC_CHECK_TYPES(wchar_t)
|
||||
AC_CHECK_TYPES(long long int)
|
||||
AC_CHECK_TYPES(mbstate_t,,,[#include "wchar.h"])
|
||||
|
||||
# ----------------------------------------
|
||||
|
@ -65,13 +65,13 @@ CharSet *CharSet::Create(TessdataManager *tessdata_manager,
|
||||
!tessdata_manager->SeekToStart(TESSDATA_UNICHARSET)) {
|
||||
fprintf(stderr, "Cube ERROR (CharSet::Create): could not find "
|
||||
"either cube or tesseract unicharset\n");
|
||||
return false;
|
||||
return NULL;
|
||||
}
|
||||
FILE *charset_fp = tessdata_manager->GetDataFilePtr();
|
||||
if (!charset_fp) {
|
||||
fprintf(stderr, "Cube ERROR (CharSet::Create): could not load "
|
||||
"a unicharset\n");
|
||||
return false;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// If we found a cube unicharset separate from tesseract's, load it and
|
||||
@ -90,7 +90,7 @@ CharSet *CharSet::Create(TessdataManager *tessdata_manager,
|
||||
}
|
||||
if (!loaded) {
|
||||
delete char_set;
|
||||
return false;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
char_set->init_ = true;
|
||||
|
@ -234,8 +234,8 @@ bool ConvNetCharClassifier::LoadFoldingSets(const string &data_file_path,
|
||||
fclose(fp);
|
||||
|
||||
string fold_sets_str;
|
||||
if (!CubeUtils::ReadFileToString(fold_file_name.c_str(),
|
||||
&fold_sets_str)) {
|
||||
if (!CubeUtils::ReadFileToString(fold_file_name,
|
||||
&fold_sets_str)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -327,7 +327,7 @@ bool ConvNetCharClassifier::LoadNets(const string &data_file_path,
|
||||
fclose(fp);
|
||||
|
||||
// load main net
|
||||
char_net_ = tesseract::NeuralNet::FromFile(char_net_file.c_str());
|
||||
char_net_ = tesseract::NeuralNet::FromFile(char_net_file);
|
||||
if (char_net_ == NULL) {
|
||||
fprintf(stderr, "Cube ERROR (ConvNetCharClassifier::LoadNets): "
|
||||
"could not load %s\n", char_net_file.c_str());
|
||||
|
@ -124,7 +124,7 @@ Pixa *CubeLineSegmenter::CrackLine(Pix *cracked_line_pix,
|
||||
|
||||
if (line_con_comps == NULL) {
|
||||
delete []lines_pixa;
|
||||
return false;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// assign each conn comp to the a line based on its centroid
|
||||
@ -142,7 +142,7 @@ Pixa *CubeLineSegmenter::CrackLine(Pix *cracked_line_pix,
|
||||
delete []lines_pixa;
|
||||
boxaDestroy(&line_con_comps);
|
||||
pixaDestroy(&line_con_comps_pix);
|
||||
return false;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
@ -413,14 +413,14 @@ Pix *CubeLineSegmenter::Pixa2Pix(Pixa *pixa, Box **dest_box,
|
||||
|
||||
(*dest_box) = boxCreate(min_x, min_y, max_x - min_x, max_y - min_y);
|
||||
if ((*dest_box) == NULL) {
|
||||
return false;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// create the union pix
|
||||
Pix *union_pix = pixCreate((*dest_box)->w, (*dest_box)->h, img_->d);
|
||||
if (union_pix == NULL) {
|
||||
boxDestroy(dest_box);
|
||||
return false;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// create a pix corresponding to the union of all pixs
|
||||
|
@ -165,7 +165,7 @@ WordAltList *CubeObject::Recognize(LangModel *lang_mod, bool word_mode) {
|
||||
if (deslanted_beam_obj_ == NULL) {
|
||||
fprintf(stderr, "Cube ERROR (CubeObject::Recognize): could not "
|
||||
"construct deslanted BeamSearch\n");
|
||||
return false;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -230,8 +230,8 @@ bool HybridNeuralNetCharClassifier::LoadFoldingSets(
|
||||
fclose(fp);
|
||||
|
||||
string fold_sets_str;
|
||||
if (!CubeUtils::ReadFileToString(fold_file_name.c_str(),
|
||||
&fold_sets_str)) {
|
||||
if (!CubeUtils::ReadFileToString(fold_file_name,
|
||||
&fold_sets_str)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -323,7 +323,7 @@ bool HybridNeuralNetCharClassifier::LoadNets(const string &data_file_path,
|
||||
fclose(fp);
|
||||
|
||||
string str;
|
||||
if (!CubeUtils::ReadFileToString(hybrid_net_file.c_str(), &str)) {
|
||||
if (!CubeUtils::ReadFileToString(hybrid_net_file, &str)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -348,7 +348,7 @@ bool HybridNeuralNetCharClassifier::LoadNets(const string &data_file_path,
|
||||
}
|
||||
// load the net
|
||||
string net_file_name = data_file_path + tokens_vec[0];
|
||||
nets_[net_idx] = tesseract::NeuralNet::FromFile(net_file_name.c_str());
|
||||
nets_[net_idx] = tesseract::NeuralNet::FromFile(net_file_name);
|
||||
if (nets_[net_idx] == NULL) {
|
||||
return false;
|
||||
}
|
||||
|
@ -107,7 +107,7 @@ int TessLangModEdge::CreateChildren(CubeRecoContext *cntxt,
|
||||
LangModEdge **edge_array) {
|
||||
int edge_cnt = 0;
|
||||
NodeChildVector vec;
|
||||
dawg->unichar_ids_of(parent_node, &vec); // find all children of the parent
|
||||
dawg->unichar_ids_of(parent_node, &vec, false); // find all children
|
||||
for (int i = 0; i < vec.size(); ++i) {
|
||||
const NodeChild &child = vec[i];
|
||||
if (child.unichar_id == INVALID_UNICHAR_ID) continue;
|
||||
|
@ -74,7 +74,7 @@ LangModEdge **WordListLangModel::GetEdges(CharAltList *alt_list,
|
||||
// initialize if necessary
|
||||
if (init_ == false) {
|
||||
if (Init() == false) {
|
||||
return false;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
@ -92,7 +92,7 @@ LangModEdge **WordListLangModel::GetEdges(CharAltList *alt_list,
|
||||
// advance node
|
||||
edge_ref = dawg_->next_node(edge_ref);
|
||||
if (edge_ref == 0) {
|
||||
return 0;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -8,7 +8,7 @@ endif
|
||||
noinst_HEADERS = \
|
||||
bitvec.h callcpp.h const.h cutil.h cutil_class.h danerror.h efio.h \
|
||||
emalloc.h freelist.h globals.h listio.h \
|
||||
oldheap.h oldlist.h structures.h tessarray.h
|
||||
oldlist.h structures.h
|
||||
|
||||
if !USING_MULTIPLELIBS
|
||||
noinst_LTLIBRARIES = libtesseract_cutil.la
|
||||
@ -22,7 +22,7 @@ endif
|
||||
|
||||
libtesseract_cutil_la_SOURCES = \
|
||||
bitvec.cpp callcpp.cpp cutil.cpp cutil_class.cpp danerror.cpp efio.cpp \
|
||||
emalloc.cpp freelist.cpp listio.cpp oldheap.cpp \
|
||||
oldlist.cpp structures.cpp tessarray.cpp
|
||||
emalloc.cpp freelist.cpp listio.cpp \
|
||||
oldlist.cpp structures.cpp
|
||||
|
||||
|
||||
|
@ -73,27 +73,6 @@ void FreeBitVector(BIT_VECTOR BitVector) {
|
||||
} /* FreeBitVector */
|
||||
|
||||
|
||||
/**
|
||||
* hamming_distance(array1,array2,length) computes the hamming distance
|
||||
* between two bit strings.
|
||||
*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
int hamming_distance(uinT32* array1, uinT32* array2, int length) {
|
||||
register uinT32 diff; /*bit difference */
|
||||
register int dist; /*total distance */
|
||||
|
||||
dist = 0;
|
||||
for (; length > 0; length--) {
|
||||
diff = *array1++ ^ *array2++;/*different bits */
|
||||
while (diff) {
|
||||
diff &= diff - 1; /*lose a bit */
|
||||
dist++;
|
||||
}
|
||||
}
|
||||
return dist; /*total distance */
|
||||
}
|
||||
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
/**
|
||||
* Allocate and return a new bit vector large enough to
|
||||
|
@ -70,8 +70,6 @@ BIT_VECTOR ExpandBitVector(BIT_VECTOR Vector, int NewNumBits);
|
||||
|
||||
void FreeBitVector(BIT_VECTOR BitVector);
|
||||
|
||||
int hamming_distance(uinT32* array1, uinT32* array2, int length);
|
||||
|
||||
BIT_VECTOR NewBitVector(int NumBits);
|
||||
|
||||
#endif
|
||||
|
@ -53,5 +53,5 @@ void DoError(int Error, const char *Message) {
|
||||
tprintf("\nError: %s!\n", Message);
|
||||
}
|
||||
|
||||
signal_termination_handler(Error);
|
||||
err_exit();
|
||||
} /* DoError */
|
||||
|
@ -46,7 +46,6 @@ LIST read_list(const char *filename) {
|
||||
FILE *infile;
|
||||
char s[CHARS_PER_LINE];
|
||||
LIST list;
|
||||
char *chopAt250();
|
||||
|
||||
if ((infile = open_file (filename, "r")) == NULL)
|
||||
return (NIL_LIST);
|
||||
|
@ -1,334 +0,0 @@
|
||||
/******************************************************************************
|
||||
** Filename: heap.c
|
||||
** Purpose: Routines for managing heaps (smallest at root)
|
||||
** Author: Dan Johnson
|
||||
** History: 3/13/89, DSJ, Created.
|
||||
**
|
||||
** (c) Copyright Hewlett-Packard Company, 1988.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
******************************************************************************/
|
||||
/*-----------------------------------------------------------------------------
|
||||
Include Files and Type Defines
|
||||
-----------------------------------------------------------------------------*/
|
||||
#include "oldheap.h"
|
||||
#include "freelist.h"
|
||||
#include "danerror.h"
|
||||
#include "emalloc.h"
|
||||
#include <stdio.h>
|
||||
|
||||
#define FATHER(N) ((N)>>1)
|
||||
#define LEFTSON(N) ((N)<<1)
|
||||
#define RIGHTSON(N) ((N)<<1 + 1)
|
||||
|
||||
/*-----------------------------------------------------------------------------
|
||||
Public Code
|
||||
-----------------------------------------------------------------------------*/
|
||||
/*---------------------------------------------------------------------------*/
|
||||
/**
|
||||
* This routine creates and initializes a new heap data
|
||||
* structure containing Size elements. In actuality, Size + 1
|
||||
* elements are allocated. The first element, element 0, is
|
||||
* unused, this makes the index arithmetic easier.
|
||||
*
|
||||
* Globals:
|
||||
* - None
|
||||
*
|
||||
* @param Size maximum number of entries in the heap
|
||||
* @return Pointer to the new heap.
|
||||
* @note Exceptions: None
|
||||
* @note History: 3/13/89, DSJ, Created.
|
||||
*/
|
||||
HEAP *MakeHeap(int Size) {
|
||||
HEAP *NewHeap;
|
||||
|
||||
NewHeap = (HEAP *) Emalloc (sizeof (HEAP) + Size * sizeof (HEAPENTRY));
|
||||
|
||||
NewHeap->Size = Size;
|
||||
NewHeap->FirstFree = 1;
|
||||
return (NewHeap);
|
||||
} /* MakeHeap */
|
||||
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
/**
|
||||
* This routine removes the top item on the heap and places
|
||||
* its contents into Key and Data.
|
||||
*
|
||||
* Globals:
|
||||
* - None
|
||||
*
|
||||
* @param Heap ptr to heap whose top is to be removed and returned
|
||||
* @param Key place to put key of top heap item
|
||||
* @param out_ptr place to put data of top heap item
|
||||
*
|
||||
* @return OK if top entry returned, EMPTY if heap is empty
|
||||
* @note Exceptions: None
|
||||
* @note History: 5/10/91, DSJ, Created (Modified from GetTopOfHeap).
|
||||
*/
|
||||
int HeapPop(HEAP *Heap, FLOAT32 *Key, void *out_ptr) {
|
||||
inT32 Hole;
|
||||
FLOAT32 HoleKey;
|
||||
inT32 Son;
|
||||
void **Data = (void **) out_ptr;
|
||||
|
||||
if (Heap->FirstFree <= 1)
|
||||
return (EMPTY);
|
||||
|
||||
*Key = Heap->Entry[1].Key;
|
||||
*Data = Heap->Entry[1].Data;
|
||||
|
||||
Heap->FirstFree--;
|
||||
|
||||
/* imagine the hole at the root is filled with the last entry in the heap */
|
||||
HoleKey = Heap->Entry[Heap->FirstFree].Key;
|
||||
Hole = 1;
|
||||
|
||||
/* while hole has 2 sons */
|
||||
while ((Son = LEFTSON (Hole)) < Heap->FirstFree) {
|
||||
/* find the son with the smallest key */
|
||||
if (Heap->Entry[Son].Key > Heap->Entry[Son + 1].Key)
|
||||
Son++;
|
||||
|
||||
/* if key for hole is greater than key for son, sift hole down */
|
||||
if (HoleKey > Heap->Entry[Son].Key) {
|
||||
Heap->Entry[Hole].Key = Heap->Entry[Son].Key;
|
||||
Heap->Entry[Hole].Data = Heap->Entry[Son].Data;
|
||||
Hole = Son;
|
||||
}
|
||||
else
|
||||
break;
|
||||
}
|
||||
Heap->Entry[Hole].Key = HoleKey;
|
||||
Heap->Entry[Hole].Data = Heap->Entry[Heap->FirstFree].Data;
|
||||
return (TESS_HEAP_OK);
|
||||
} /* HeapPop */
|
||||
|
||||
|
||||
/**
|
||||
* HeapPopWorst
|
||||
*
|
||||
* Remove the largest item from the heap.
|
||||
*
|
||||
* @param Heap ptr to heap whose top is to be removed and returned
|
||||
* @param Key place to put key of top heap item
|
||||
* @param out_ptr place to put data of top heap item
|
||||
*/
|
||||
int HeapPopWorst(HEAP *Heap, FLOAT32 *Key, void *out_ptr) {
|
||||
inT32 Index; /*current index */
|
||||
inT32 Hole;
|
||||
FLOAT32 HoleKey;
|
||||
inT32 Father;
|
||||
void *HoleData;
|
||||
void **Data = (void **) out_ptr;
|
||||
|
||||
if (Heap->FirstFree <= 1)
|
||||
return (EMPTY);
|
||||
|
||||
HoleKey = Heap->Entry[1].Key;
|
||||
Hole = 1;
|
||||
Heap->FirstFree--;
|
||||
for (Index = Heap->FirstFree, Father = FATHER (Index); Index > Father;
|
||||
Index--)
|
||||
if (Heap->Entry[Index].Key > HoleKey) {
|
||||
/*find biggest */
|
||||
HoleKey = Heap->Entry[Index].Key;
|
||||
Hole = Index;
|
||||
}
|
||||
*Key = HoleKey;
|
||||
*Data = Heap->Entry[Hole].Data;
|
||||
|
||||
HoleKey = Heap->Entry[Heap->FirstFree].Key;
|
||||
Heap->Entry[Hole].Key = HoleKey;
|
||||
HoleData = Heap->Entry[Heap->FirstFree].Data;
|
||||
Heap->Entry[Hole].Data = HoleData;
|
||||
|
||||
/* now sift last entry to its rightful place */
|
||||
Father = FATHER (Hole); /*father of hole */
|
||||
while (Hole > 1 && Heap->Entry[Father].Key > HoleKey) {
|
||||
/*swap entries */
|
||||
Heap->Entry[Hole].Key = Heap->Entry[Father].Key;
|
||||
Heap->Entry[Hole].Data = Heap->Entry[Father].Data;
|
||||
Heap->Entry[Father].Data = HoleData;
|
||||
Heap->Entry[Father].Key = HoleKey;
|
||||
Hole = Father;
|
||||
Father = FATHER (Hole);
|
||||
}
|
||||
return (TESS_HEAP_OK);
|
||||
} /* HeapPop */
|
||||
|
||||
|
||||
// Pushes data onto the heap only if there is free space left.
|
||||
// Returns true if data was added to the heap, false if the heap was full.
|
||||
bool HeapPushCheckSize(HEAP *Heap, FLOAT32 Key, void *Data) {
|
||||
if (Heap->FirstFree > Heap->Size) return false;
|
||||
HeapPush(Heap, Key, Data);
|
||||
return true;
|
||||
}
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
/**
|
||||
* This routine stores Data into Heap and associates it
|
||||
* with Key. The heap is
|
||||
* maintained in such a way that the item with the lowest key
|
||||
* is always at the top of the heap.
|
||||
*
|
||||
* Globals:
|
||||
* - None
|
||||
*
|
||||
* @param Heap ptr to heap to store new item in
|
||||
* @param Key numeric key associated with new item
|
||||
* @param Data ptr to data contents of new item
|
||||
*
|
||||
* @note Exceptions:
|
||||
* - HEAPFULL error if heap size is exceeded
|
||||
*
|
||||
* @note History: 5/10/91, DSJ, Created (Modified version of HeapStore).
|
||||
*/
|
||||
void HeapPush(HEAP *Heap, FLOAT32 Key, void *Data) {
|
||||
inT32 Item;
|
||||
inT32 Father;
|
||||
|
||||
if (Heap->FirstFree > Heap->Size)
|
||||
DoError (HEAPFULL, "Heap size exceeded");
|
||||
|
||||
Item = Heap->FirstFree;
|
||||
Heap->FirstFree++;
|
||||
while (Item != 1) {
|
||||
Father = FATHER (Item);
|
||||
if (Heap->Entry[Father].Key > Key) {
|
||||
Heap->Entry[Item].Key = Heap->Entry[Father].Key;
|
||||
Heap->Entry[Item].Data = Heap->Entry[Father].Data;
|
||||
Item = Father;
|
||||
}
|
||||
else
|
||||
break;
|
||||
}
|
||||
Heap->Entry[Item].Key = Key;
|
||||
Heap->Entry[Item].Data = Data;
|
||||
} /* HeapPush */
|
||||
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
/**
|
||||
* This routine stores Entry into Heap. The heap is
|
||||
* maintained in such a way that the item with the lowest key
|
||||
* is always at the top of the heap.
|
||||
*
|
||||
* Globals:
|
||||
* - None
|
||||
*
|
||||
* @param Heap ptr to heap to store new item in
|
||||
* @param Entry ptr to item to be stored in Heap
|
||||
* @note Exceptions:
|
||||
* - HEAPFULL error if heap size is exceeded
|
||||
* @note History: 3/13/89, DSJ, Created.
|
||||
*/
|
||||
void HeapStore(HEAP *Heap, HEAPENTRY *Entry) {
|
||||
inT32 Item;
|
||||
inT32 Father;
|
||||
|
||||
if (Heap->FirstFree > Heap->Size)
|
||||
DoError (HEAPFULL, "Heap size exceeded");
|
||||
|
||||
Item = Heap->FirstFree;
|
||||
Heap->FirstFree++;
|
||||
while (Item != 1) {
|
||||
Father = FATHER (Item);
|
||||
if (Heap->Entry[Father].Key > Entry->Key) {
|
||||
Heap->Entry[Item].Key = Heap->Entry[Father].Key;
|
||||
Heap->Entry[Item].Data = Heap->Entry[Father].Data;
|
||||
Item = Father;
|
||||
}
|
||||
else
|
||||
break;
|
||||
}
|
||||
Heap->Entry[Item].Key = Entry->Key;
|
||||
Heap->Entry[Item].Data = Entry->Data;
|
||||
} /* HeapStore */
|
||||
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
/**
|
||||
* This routine removes the top item on the heap and copies its
|
||||
* contents into Entry.
|
||||
*
|
||||
* @param Heap ptr to heap whose top is to be removed and returned
|
||||
* @param Entry ptr to heap entry to be filled with top entry on Heap
|
||||
*
|
||||
* Globals:
|
||||
* - None
|
||||
*
|
||||
* @return OK if top entry returned, EMPTY if heap is empty
|
||||
* @note Exceptions: None
|
||||
* @note History: 3/13/89, DSJ, Created.
|
||||
*/
|
||||
int GetTopOfHeap(HEAP *Heap, HEAPENTRY *Entry) {
|
||||
inT32 Hole;
|
||||
FLOAT32 HoleKey;
|
||||
inT32 Son;
|
||||
|
||||
if (Heap->FirstFree <= 1)
|
||||
return (EMPTY);
|
||||
|
||||
Entry->Key = Heap->Entry[1].Key;
|
||||
Entry->Data = Heap->Entry[1].Data;
|
||||
|
||||
Heap->FirstFree--;
|
||||
|
||||
/* imagine the hole at the root is filled with the last entry in the heap */
|
||||
HoleKey = Heap->Entry[Heap->FirstFree].Key;
|
||||
Hole = 1;
|
||||
|
||||
/* while hole has 2 sons */
|
||||
while ((Son = LEFTSON (Hole)) < Heap->FirstFree) {
|
||||
/* find the son with the smallest key */
|
||||
if (Heap->Entry[Son].Key > Heap->Entry[Son + 1].Key)
|
||||
Son++;
|
||||
|
||||
/* if key for hole is greater than key for son, sift hole down */
|
||||
if (HoleKey > Heap->Entry[Son].Key) {
|
||||
Heap->Entry[Hole].Key = Heap->Entry[Son].Key;
|
||||
Heap->Entry[Hole].Data = Heap->Entry[Son].Data;
|
||||
Hole = Son;
|
||||
}
|
||||
else
|
||||
break;
|
||||
}
|
||||
Heap->Entry[Hole].Key = HoleKey;
|
||||
Heap->Entry[Hole].Data = Heap->Entry[Heap->FirstFree].Data;
|
||||
return (TESS_HEAP_OK);
|
||||
} /* GetTopOfHeap */
|
||||
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
/**
|
||||
* This routine is similar to FreeHeap in that it
|
||||
* deallocates the memory consumed by the heap. However, it
|
||||
* also calls Deallocator for each item in the heap so that
|
||||
* this data is also deallocated.
|
||||
*
|
||||
* @param Heap heap whose data is to be freed
|
||||
* @param destructor function to be used to deallocate data
|
||||
*
|
||||
* Globals:
|
||||
* - None
|
||||
*
|
||||
* @note Exceptions: none
|
||||
* @note History: Tue May 15 08:52:04 1990, DSJ, Created.
|
||||
*/
|
||||
void FreeHeapData(HEAP *Heap, void_dest destructor) {
|
||||
HEAPENTRY Entry;
|
||||
|
||||
while (GetTopOfHeap (Heap, &Entry) != EMPTY)
|
||||
destructor (Entry.Data);
|
||||
|
||||
FreeHeap(Heap);
|
||||
} /* FreeHeapData */
|
@ -1,80 +0,0 @@
|
||||
/******************************************************************************
|
||||
** Filename: heap.h
|
||||
** Purpose: Definition of heap access routines.
|
||||
** Author: Dan Johnson
|
||||
** History: 3/13/89, DSJ, Created.
|
||||
**
|
||||
** (c) Copyright Hewlett-Packard Company, 1988.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
******************************************************************************/
|
||||
#ifndef HEAP_H
|
||||
#define HEAP_H
|
||||
|
||||
/*-----------------------------------------------------------------------------
|
||||
Include Files and Type Defines
|
||||
-----------------------------------------------------------------------------*/
|
||||
#include "host.h"
|
||||
#include "cutil.h"
|
||||
|
||||
#define HEAPFULL 3000
|
||||
|
||||
#define EMPTY -1
|
||||
#define TESS_HEAP_OK 0
|
||||
|
||||
struct HEAPENTRY {
|
||||
FLOAT32 Key;
|
||||
void *Data;
|
||||
};
|
||||
|
||||
struct HEAP {
|
||||
inT32 Size;
|
||||
inT32 FirstFree;
|
||||
HEAPENTRY Entry[1];
|
||||
};
|
||||
|
||||
/*-----------------------------------------------------------------------------
|
||||
Macros
|
||||
-----------------------------------------------------------------------------*/
|
||||
#define FreeHeap(H) memfree(H)
|
||||
#define MaxSizeOfHeap(H) (H->Size)
|
||||
#define SizeOfHeap(H) (H->FirstFree - 1)
|
||||
#define InitHeap(H) (H->FirstFree = 1)
|
||||
#define HeapFull(H) ((H)->FirstFree > (H)->Size)
|
||||
#define HeapEmpty(H) ((H)->FirstFree <= 1)
|
||||
|
||||
/* macros for accessing elements in heap by index. The indicies vary from
|
||||
0 to SizeOfHeap-1. No bounds checking is done. Elements accessed in
|
||||
this manner are in random order relative to the Key values. These
|
||||
macros should never be used as the LHS of an assignment statement as this
|
||||
will corrupt the heap.*/
|
||||
#define HeapKeyFor(H,E) ((H)->Entry[(E)+1].Key)
|
||||
#define HeapDataFor(H,E) ((H)->Entry[(E)+1].Data)
|
||||
|
||||
/*-----------------------------------------------------------------------------
|
||||
Public Function Prototypes
|
||||
-----------------------------------------------------------------------------*/
|
||||
HEAP *MakeHeap(int Size);
|
||||
|
||||
int HeapPop(HEAP *Heap, FLOAT32 *Key, void *out_ptr);
|
||||
|
||||
int HeapPopWorst(HEAP *Heap, FLOAT32 *Key, void *out_ptr);
|
||||
|
||||
void HeapPush(HEAP *Heap, FLOAT32 Key, void *Data);
|
||||
|
||||
void HeapStore(HEAP *Heap, HEAPENTRY *Entry);
|
||||
|
||||
int GetTopOfHeap(HEAP *Heap, HEAPENTRY *Entry);
|
||||
|
||||
void FreeHeapData(HEAP *Heap, void_dest destructor);
|
||||
|
||||
bool HeapPushCheckSize(HEAP *Heap, FLOAT32 Key, void *Data);
|
||||
|
||||
#endif
|
@ -1,115 +0,0 @@
|
||||
/* -*-C-*-
|
||||
################################################################################
|
||||
#
|
||||
# File: array.c
|
||||
# Description: Dynamic Array of Strings
|
||||
# Author: Mark Seaman, Software Productivity
|
||||
# Created: Thu Jul 23 13:24:09 1987
|
||||
# Modified: Wed Mar 6 15:18:33 1991 (Mark Seaman) marks@hpgrlt
|
||||
# Language: C
|
||||
# Package: N/A
|
||||
# Status: Reusable Software Component
|
||||
#
|
||||
# (c) Copyright 1987, Hewlett-Packard Company.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
#
|
||||
################################################################################
|
||||
|
||||
This file contains the implentations of a set of dynamic array of string
|
||||
manipulation routines. For the interface definitions and documentation
|
||||
of these routines see the file "das.h".
|
||||
|
||||
***************************************************************************/
|
||||
|
||||
#include "tessarray.h"
|
||||
#include "callcpp.h"
|
||||
#include "freelist.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#ifdef _WIN32
|
||||
#include <process.h>
|
||||
#endif
|
||||
#include <ctype.h>
|
||||
#if MAC_OR_DOS
|
||||
#include <stdlib.h>
|
||||
#endif
|
||||
|
||||
/**********************************************************************
|
||||
* array_insert
|
||||
*
|
||||
* Insert a data element into a particular spot in the array. Move all
|
||||
* the elements in the array (past that spot) down one to make room for
|
||||
* the new element.
|
||||
**********************************************************************/
|
||||
ARRAY array_insert(ARRAY array, int index, void *value) {
|
||||
int x;
|
||||
|
||||
array = array_push (array, NULL);
|
||||
for (x = array_count (array) - 1; x > index; x--)
|
||||
array_value (array, x) = array_value (array, x - 1);
|
||||
array_value (array, index) = value;
|
||||
return (array);
|
||||
}
|
||||
|
||||
|
||||
/**********************************************************************
|
||||
* array_new
|
||||
*
|
||||
* Create a new array with a certain number of elements. If the number
|
||||
* of elements requested is 0 then the default number will be used.
|
||||
**********************************************************************/
|
||||
ARRAY array_new(int num) {
|
||||
ARRAY temp;
|
||||
int x;
|
||||
|
||||
if (num == 0)
|
||||
num = DEFAULT_SIZE;
|
||||
temp = (ARRAY) memalloc ((num - 2) * sizeof (char *) +
|
||||
sizeof (struct array_record));
|
||||
if (!temp) {
|
||||
cprintf ("error: Out of memory in array_new\n");
|
||||
exit (1); //?err_exit ();
|
||||
}
|
||||
array_count (temp) = 0;
|
||||
array_limit (temp) = num;
|
||||
for (x = 0; x < num; x++)
|
||||
array_value (temp, x) = (char *) 0;
|
||||
return (temp);
|
||||
}
|
||||
|
||||
|
||||
/**********************************************************************
|
||||
* array_push
|
||||
*
|
||||
* Add a new element onto the top of the array. If there is not room
|
||||
* more room is made by "realloc"ing the array. This means that the
|
||||
* new array location may change. All previous references to its old
|
||||
* location may no longer be valid.
|
||||
**********************************************************************/
|
||||
ARRAY array_push(ARRAY array, void *value) {
|
||||
if (array_count (array) == array_limit (array)) {
|
||||
array = (ARRAY) memrealloc (array, (array_limit (array) * 2 - 2) *
|
||||
sizeof (char *) +
|
||||
sizeof (struct array_record),
|
||||
(array_limit (array) -
|
||||
2) * sizeof (char *) +
|
||||
sizeof (struct array_record));
|
||||
if (!array) {
|
||||
cprintf ("error: Out of memory in array_push\n");
|
||||
exit (1); //?err_exit ();
|
||||
}
|
||||
array_limit (array) *= 2;
|
||||
}
|
||||
array_count (array)++;
|
||||
array_top (array) = value;
|
||||
return (array);
|
||||
}
|
@ -1,166 +0,0 @@
|
||||
/* -*-C-*-
|
||||
********************************************************************************
|
||||
*
|
||||
* File: array.h (Formerly array.h)
|
||||
* Description: Dynamic Array of String
|
||||
* Author: Mark Seaman, SW Productivity
|
||||
* Created: Fri Oct 16 14:37:00 1987
|
||||
* Modified: Mon Sep 24 14:15:59 1990 (Mark Seaman) marks@hpgrlt
|
||||
* Language: C
|
||||
* Package: N/A
|
||||
* Status: Reusable Software Component
|
||||
*
|
||||
* (c) Copyright 1987, Hewlett-Packard Company.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
*****************************************************************************
|
||||
|
||||
This file contains a set of general purpose dynamic array of string routines.
|
||||
These routines can be used in a wide variety of ways to provide several
|
||||
different popular data structures. A new "das" can be created by declaring
|
||||
a variable of type 'DAS'
|
||||
******************************************************************************/
|
||||
|
||||
#ifndef TESSARRAY_H
|
||||
#define TESSARRAY_H
|
||||
|
||||
/*
|
||||
----------------------------------------------------------------------
|
||||
I n c l u d e s
|
||||
----------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
/*
|
||||
----------------------------------------------------------------------
|
||||
T y p e s
|
||||
----------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
typedef struct array_record
|
||||
{
|
||||
size_t limit;
|
||||
size_t top;
|
||||
void *base[2];
|
||||
} *ARRAY;
|
||||
|
||||
typedef void (*voidProc) ();
|
||||
|
||||
typedef int (*intProc) ();
|
||||
|
||||
/*
|
||||
----------------------------------------------------------------------
|
||||
M a c r o s
|
||||
----------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#define DEFAULT_SIZE 2
|
||||
|
||||
/**********************************************************************
|
||||
* array_count
|
||||
*
|
||||
* Return the value of the number of elements currently in the array.
|
||||
**********************************************************************/
|
||||
|
||||
#define array_count(a) \
|
||||
((a)->top)
|
||||
|
||||
/**********************************************************************
|
||||
* array_free
|
||||
*
|
||||
* Free the memory allocated to this array.
|
||||
**********************************************************************/
|
||||
|
||||
#define array_free \
|
||||
memfree
|
||||
|
||||
/**********************************************************************
|
||||
* array_index
|
||||
*
|
||||
* Check to make sure that the index value is valid. Return the
|
||||
* value of the nth element currently in the array.
|
||||
**********************************************************************/
|
||||
|
||||
#define array_index(a,i) \
|
||||
((i<array_count(a)) ? (a)->base[i] : 0)
|
||||
|
||||
/**********************************************************************
|
||||
* array_limit
|
||||
*
|
||||
* Return the maximum number of elements that could be currently held
|
||||
* in this array without further expansion.
|
||||
**********************************************************************/
|
||||
|
||||
#define array_limit(a) \
|
||||
((a)->limit)
|
||||
|
||||
/**********************************************************************
|
||||
* array_loop
|
||||
*
|
||||
* Iterate through each of the array elements. Each value can then be
|
||||
* accessed by:
|
||||
* array_index (a, x)
|
||||
**********************************************************************/
|
||||
|
||||
#define array_loop(a,x) \
|
||||
for (x=0; x < array_count (a); x++)
|
||||
|
||||
/**********************************************************************
|
||||
* array_top
|
||||
*
|
||||
* Return the last element that was pushed on this array.
|
||||
**********************************************************************/
|
||||
|
||||
#define array_top(a) \
|
||||
((a)->base[array_count (a) - 1])
|
||||
|
||||
/**********************************************************************
|
||||
* array_value
|
||||
*
|
||||
* Return the nth element of the array. Don't do range checking.
|
||||
**********************************************************************/
|
||||
|
||||
#define array_value(a,i) \
|
||||
((a)->base[i])
|
||||
|
||||
/*----------------------------------------------------------------------
|
||||
F u n c t i o n s
|
||||
----------------------------------------------------------------------*/
|
||||
ARRAY array_insert(ARRAY array, int index, void *value);
|
||||
|
||||
ARRAY array_new(int num);
|
||||
|
||||
ARRAY array_push(ARRAY array, void *value);
|
||||
|
||||
/*
|
||||
#if defined(__STDC__) || defined(__cplusplus)
|
||||
# define _ARGS(s) s
|
||||
#else
|
||||
# define _ARGS(s) ()
|
||||
#endif*/
|
||||
|
||||
/* array.c
|
||||
ARRAY array_insert
|
||||
_ARGS((ARRAY array,
|
||||
int index,
|
||||
char *value));
|
||||
|
||||
ARRAY array_new
|
||||
_ARGS((int num));
|
||||
|
||||
ARRAY array_push
|
||||
_ARGS((ARRAY array,
|
||||
char *value));
|
||||
|
||||
#undef _ARGS
|
||||
*/
|
||||
#endif
|
@ -7,8 +7,8 @@ AM_CPPFLAGS += -DTESS_EXPORTS \
|
||||
endif
|
||||
|
||||
noinst_HEADERS = \
|
||||
dawg.h dict.h matchdefs.h \
|
||||
permute.h states.h stopper.h trie.h
|
||||
dawg.h dawg_cache.h dict.h matchdefs.h \
|
||||
stopper.h trie.h
|
||||
|
||||
if !USING_MULTIPLELIBS
|
||||
noinst_LTLIBRARIES = libtesseract_dict.la
|
||||
@ -25,7 +25,7 @@ endif
|
||||
|
||||
libtesseract_dict_la_SOURCES = \
|
||||
context.cpp \
|
||||
dawg.cpp dict.cpp hyphen.cpp \
|
||||
permdawg.cpp permute.cpp states.cpp stopper.cpp trie.cpp
|
||||
dawg.cpp dawg_cache.cpp dict.cpp hyphen.cpp \
|
||||
permdawg.cpp stopper.cpp trie.cpp
|
||||
|
||||
|
||||
|
@ -38,6 +38,7 @@
|
||||
#include "freelist.h"
|
||||
#include "helpers.h"
|
||||
#include "strngs.h"
|
||||
#include "tesscallback.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
/*----------------------------------------------------------------------
|
||||
@ -45,25 +46,29 @@
|
||||
----------------------------------------------------------------------*/
|
||||
namespace tesseract {
|
||||
|
||||
bool Dawg::word_in_dawg(const WERD_CHOICE &word) const {
|
||||
if (word.length() == 0) return false;
|
||||
bool Dawg::prefix_in_dawg(const WERD_CHOICE &word,
|
||||
bool requires_complete) const {
|
||||
if (word.length() == 0) return !requires_complete;
|
||||
NODE_REF node = 0;
|
||||
int end_index = word.length() - 1;
|
||||
for (int i = 0; i <= end_index; i++) {
|
||||
if (debug_level_ > 1) {
|
||||
tprintf("word_in_dawg: exploring node " REFFORMAT ":\n", node);
|
||||
print_node(node, MAX_NODE_EDGES_DISPLAY);
|
||||
tprintf("\n");
|
||||
for (int i = 0; i < end_index; i++) {
|
||||
EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false);
|
||||
if (edge == NO_EDGE) {
|
||||
return false;
|
||||
}
|
||||
EDGE_REF edge = edge_char_of(node, word.unichar_id(i), i == end_index);
|
||||
if (edge != NO_EDGE) {
|
||||
node = next_node(edge);
|
||||
if (node == 0) node = NO_EDGE;
|
||||
} else {
|
||||
if ((node = next_node(edge)) == 0) {
|
||||
// This only happens if all words following this edge terminate --
|
||||
// there are no larger words. See Trie::add_word_to_dawg()
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
// Now check the last character.
|
||||
return edge_char_of(node, word.unichar_id(end_index), requires_complete) !=
|
||||
NO_EDGE;
|
||||
}
|
||||
|
||||
bool Dawg::word_in_dawg(const WERD_CHOICE &word) const {
|
||||
return prefix_in_dawg(word, true);
|
||||
}
|
||||
|
||||
int Dawg::check_for_words(const char *filename,
|
||||
@ -99,23 +104,36 @@ int Dawg::check_for_words(const char *filename,
|
||||
}
|
||||
|
||||
void Dawg::iterate_words(const UNICHARSET &unicharset,
|
||||
TessCallback1<const char *> *cb) const {
|
||||
TessCallback1<const WERD_CHOICE *> *cb) const {
|
||||
WERD_CHOICE word(&unicharset);
|
||||
iterate_words_rec(word, 0, cb);
|
||||
}
|
||||
|
||||
void CallWithUTF8(TessCallback1<const char *> *cb, const WERD_CHOICE *wc) {
|
||||
STRING s;
|
||||
wc->string_and_lengths(&s, NULL);
|
||||
cb->Run(s.string());
|
||||
}
|
||||
|
||||
void Dawg::iterate_words(const UNICHARSET &unicharset,
|
||||
TessCallback1<const char *> *cb) const {
|
||||
TessCallback1<const WERD_CHOICE *> *shim =
|
||||
NewPermanentTessCallback(CallWithUTF8, cb);
|
||||
WERD_CHOICE word(&unicharset);
|
||||
iterate_words_rec(word, 0, shim);
|
||||
delete shim;
|
||||
}
|
||||
|
||||
void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far,
|
||||
NODE_REF to_explore,
|
||||
TessCallback1<const char *> *cb) const {
|
||||
TessCallback1<const WERD_CHOICE *> *cb) const {
|
||||
NodeChildVector children;
|
||||
this->unichar_ids_of(to_explore, &children);
|
||||
this->unichar_ids_of(to_explore, &children, false);
|
||||
for (int i = 0; i < children.size(); i++) {
|
||||
WERD_CHOICE next_word(word_so_far);
|
||||
next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0);
|
||||
if (this->end_of_word(children[i].edge_ref)) {
|
||||
STRING s;
|
||||
next_word.string_and_lengths(&s, NULL);
|
||||
cb->Run(s.string());
|
||||
cb->Run(&next_word);
|
||||
}
|
||||
NODE_REF next = next_node(children[i].edge_ref);
|
||||
if (next != 0) {
|
||||
@ -132,7 +150,7 @@ bool Dawg::match_words(WERD_CHOICE *word, inT32 index,
|
||||
if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) {
|
||||
bool any_matched = false;
|
||||
NodeChildVector vec;
|
||||
this->unichar_ids_of(node, &vec);
|
||||
this->unichar_ids_of(node, &vec, false);
|
||||
for (int i = 0; i < vec.size(); ++i) {
|
||||
word->set_unichar_id(vec[i].unichar_id, index);
|
||||
if (match_words(word, index, node, wildcard))
|
||||
|
108
dict/dawg.h
108
dict/dawg.h
@ -91,10 +91,6 @@ enum DawgType {
|
||||
#define NUM_FLAG_BITS 3
|
||||
#define REFFORMAT "%lld"
|
||||
|
||||
// Set kBeginningDawgsType[i] to true if a Dawg of
|
||||
// DawgType i can contain the beginning of a word.
|
||||
static const bool kBeginningDawgsType[] = { 1, 1, 1, 1 };
|
||||
|
||||
static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = {
|
||||
{ 0, 1, 1, 0 }, // for DAWG_TYPE_PUNCTUATION
|
||||
{ 1, 0, 0, 0 }, // for DAWG_TYPE_WORD
|
||||
@ -137,12 +133,21 @@ class Dawg {
|
||||
/// Returns true if the given word is in the Dawg.
|
||||
bool word_in_dawg(const WERD_CHOICE &word) const;
|
||||
|
||||
// Returns true if the given word prefix is not contraindicated by the dawg.
|
||||
// If requires_complete is true, then the exact complete word must be present.
|
||||
bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const;
|
||||
|
||||
/// Checks the Dawg for the words that are listed in the requested file.
|
||||
/// Returns the number of words in the given file missing from the Dawg.
|
||||
int check_for_words(const char *filename,
|
||||
const UNICHARSET &unicharset,
|
||||
bool enable_wildcard) const;
|
||||
|
||||
// For each word in the Dawg, call the given (permanent) callback with the
|
||||
// text (UTF-8) version of the word.
|
||||
void iterate_words(const UNICHARSET &unicharset,
|
||||
TessCallback1<const WERD_CHOICE *> *cb) const;
|
||||
|
||||
// For each word in the Dawg, call the given (permanent) callback with the
|
||||
// text (UTF-8) version of the word.
|
||||
void iterate_words(const UNICHARSET &unicharset,
|
||||
@ -156,7 +161,8 @@ class Dawg {
|
||||
|
||||
/// Fills the given NodeChildVector with all the unichar ids (and the
|
||||
/// corresponding EDGE_REFs) for which there is an edge out of this node.
|
||||
virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const = 0;
|
||||
virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
|
||||
bool word_end) const = 0;
|
||||
|
||||
/// Returns the next node visited by following the edge
|
||||
/// indicated by the given EDGE_REF.
|
||||
@ -277,7 +283,7 @@ class Dawg {
|
||||
// Recursively iterate over all words in a dawg (see public iterate_words).
|
||||
void iterate_words_rec(const WERD_CHOICE &word_so_far,
|
||||
NODE_REF to_explore,
|
||||
TessCallback1<const char *> *cb) const;
|
||||
TessCallback1<const WERD_CHOICE *> *cb) const;
|
||||
|
||||
// Member Variables.
|
||||
DawgType type_;
|
||||
@ -299,22 +305,71 @@ class Dawg {
|
||||
};
|
||||
|
||||
//
|
||||
/// DawgInfo struct and DawgInfoVector class are used for
|
||||
/// storing information about the current Dawg search state.
|
||||
// DawgPosition keeps track of where we are in the primary dawg we're searching
|
||||
// as well as where we may be in the "punctuation dawg" which may provide
|
||||
// surrounding context.
|
||||
//
|
||||
struct DawgInfo {
|
||||
DawgInfo() : dawg_index(-1), ref(NO_EDGE) {}
|
||||
DawgInfo(int i, EDGE_REF r) : dawg_index(i), ref(r) {}
|
||||
bool operator==(const DawgInfo &other) {
|
||||
return (this->dawg_index == other.dawg_index && this->ref == other.ref);
|
||||
// Example:
|
||||
// punctuation dawg -- space is the "pattern character"
|
||||
// " " // no punctuation
|
||||
// "' '" // leading and trailing apostrophes
|
||||
// " '" // trailing apostrophe
|
||||
// word dawg:
|
||||
// "cat"
|
||||
// "cab"
|
||||
// "cat's"
|
||||
//
|
||||
// DawgPosition(dawg_index, dawg_ref, punc_index, punc_ref, rtp)
|
||||
//
|
||||
// DawgPosition(-1, NO_EDGE, p, pe, false)
|
||||
// We're in the punctuation dawg, no other dawg has been started.
|
||||
// (1) If there's a pattern edge as a punc dawg child of us,
|
||||
// for each punc-following dawg starting with ch, produce:
|
||||
// Result: DawgPosition(k, w, p', false)
|
||||
// (2) If there's a valid continuation in the punc dawg, produce:
|
||||
// Result: DawgPosition(-k, NO_EDGE, p', false)
|
||||
//
|
||||
// DawgPosition(k, w, -1, NO_EDGE, false)
|
||||
// We're in dawg k. Going back to punctuation dawg is not an option.
|
||||
// Follow ch in dawg k.
|
||||
//
|
||||
// DawgPosition(k, w, p, pe, false)
|
||||
// We're in dawg k. Continue in dawg k and/or go back to the punc dawg.
|
||||
// If ending, check that the punctuation dawg is also ok to end here.
|
||||
//
|
||||
// DawgPosition(k, w, p, pe true)
|
||||
// We're back in the punctuation dawg. Continuing there is the only option.
|
||||
struct DawgPosition {
|
||||
DawgPosition()
|
||||
: dawg_index(-1), dawg_ref(NO_EDGE), punc_ref(NO_EDGE),
|
||||
back_to_punc(false) {}
|
||||
DawgPosition(int dawg_idx, EDGE_REF dawgref,
|
||||
int punc_idx, EDGE_REF puncref,
|
||||
bool backtopunc)
|
||||
: dawg_index(dawg_idx), dawg_ref(dawgref),
|
||||
punc_index(punc_idx), punc_ref(puncref),
|
||||
back_to_punc(backtopunc) {
|
||||
}
|
||||
int dawg_index;
|
||||
EDGE_REF ref;
|
||||
bool operator==(const DawgPosition &other) {
|
||||
return dawg_index == other.dawg_index &&
|
||||
dawg_ref == other.dawg_ref &&
|
||||
punc_index == other.punc_index &&
|
||||
punc_ref == other.punc_ref &&
|
||||
back_to_punc == other.back_to_punc;
|
||||
}
|
||||
|
||||
inT8 dawg_index;
|
||||
EDGE_REF dawg_ref;
|
||||
inT8 punc_index;
|
||||
EDGE_REF punc_ref;
|
||||
// Have we returned to the punc dawg at the end of the word?
|
||||
bool back_to_punc;
|
||||
};
|
||||
class DawgInfoVector : public GenericVector<DawgInfo> {
|
||||
|
||||
class DawgPositionVector : public GenericVector<DawgPosition> {
|
||||
public:
|
||||
/// Overload destructor, since clear() does not delete data_[] any more.
|
||||
~DawgInfoVector() {
|
||||
~DawgPositionVector() {
|
||||
if (size_reserved_ > 0) {
|
||||
delete[] data_;
|
||||
size_used_ = 0;
|
||||
@ -327,15 +382,17 @@ class DawgInfoVector : public GenericVector<DawgInfo> {
|
||||
/// Adds an entry for the given dawg_index with the given node to the vec.
|
||||
/// Returns false if the same entry already exists in the vector,
|
||||
/// true otherwise.
|
||||
inline bool add_unique(const DawgInfo &new_info, bool debug,
|
||||
inline bool add_unique(const DawgPosition &new_pos,
|
||||
bool debug,
|
||||
const char *debug_msg) {
|
||||
for (int i = 0; i < size_used_; ++i) {
|
||||
if (data_[i] == new_info) return false;
|
||||
if (data_[i] == new_pos) return false;
|
||||
}
|
||||
push_back(new_info);
|
||||
push_back(new_pos);
|
||||
if (debug) {
|
||||
tprintf("%s[%d, " REFFORMAT "]\n", debug_msg,
|
||||
new_info.dawg_index, new_info.ref);
|
||||
tprintf("%s[%d, " REFFORMAT "] [punc: " REFFORMAT "%s]\n",
|
||||
debug_msg, new_pos.dawg_index, new_pos.dawg_ref,
|
||||
new_pos.punc_ref, new_pos.back_to_punc ? " returned" : "");
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@ -385,12 +442,15 @@ class SquishedDawg : public Dawg {
|
||||
|
||||
/// Fills the given NodeChildVector with all the unichar ids (and the
|
||||
/// corresponding EDGE_REFs) for which there is an edge out of this node.
|
||||
void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const {
|
||||
void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
|
||||
bool word_end) const {
|
||||
EDGE_REF edge = node;
|
||||
if (!edge_occupied(edge) || edge == NO_EDGE) return;
|
||||
assert(forward_edge(edge)); // we don't expect any backward edges to
|
||||
do { // be present when this funciton is called
|
||||
vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
|
||||
if (!word_end || end_of_word_from_edge_rec(edges_[edge])) {
|
||||
vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
|
||||
}
|
||||
} while (!last_edge(edge++));
|
||||
}
|
||||
|
||||
|
102
dict/dawg_cache.cpp
Normal file
102
dict/dawg_cache.cpp
Normal file
@ -0,0 +1,102 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: dawg_cache.h
|
||||
// Description: A class that knows about loading and caching dawgs.
|
||||
// Author: David Eger
|
||||
// Created: Fri Jan 27 12:08:00 PST 2012
|
||||
//
|
||||
// (C) Copyright 2012, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "dawg_cache.h"
|
||||
|
||||
#include "dawg.h"
|
||||
#include "object_cache.h"
|
||||
#include "strngs.h"
|
||||
#include "tessdatamanager.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
struct DawgLoader {
|
||||
DawgLoader(const STRING &lang,
|
||||
const char *data_file_name,
|
||||
TessdataType tessdata_dawg_type,
|
||||
int dawg_debug_level)
|
||||
: lang_(lang),
|
||||
data_file_name_(data_file_name),
|
||||
tessdata_dawg_type_(tessdata_dawg_type),
|
||||
dawg_debug_level_(dawg_debug_level) {}
|
||||
|
||||
Dawg *Load();
|
||||
|
||||
STRING lang_;
|
||||
const char *data_file_name_;
|
||||
TessdataType tessdata_dawg_type_;
|
||||
int dawg_debug_level_;
|
||||
};
|
||||
|
||||
Dawg *DawgCache::GetSquishedDawg(
|
||||
const STRING &lang,
|
||||
const char *data_file_name,
|
||||
TessdataType tessdata_dawg_type,
|
||||
int debug_level) {
|
||||
STRING data_id = data_file_name;
|
||||
data_id += kTessdataFileSuffixes[tessdata_dawg_type];
|
||||
DawgLoader loader(lang, data_file_name, tessdata_dawg_type, debug_level);
|
||||
return dawgs_.Get(data_id, NewTessCallback(&loader, &DawgLoader::Load));
|
||||
}
|
||||
|
||||
Dawg *DawgLoader::Load() {
|
||||
TessdataManager data_loader;
|
||||
if (!data_loader.Init(data_file_name_, dawg_debug_level_)) {
|
||||
return NULL;
|
||||
}
|
||||
if (!data_loader.SeekToStart(tessdata_dawg_type_)) return NULL;
|
||||
FILE *fp = data_loader.GetDataFilePtr();
|
||||
DawgType dawg_type;
|
||||
PermuterType perm_type;
|
||||
switch (tessdata_dawg_type_) {
|
||||
case TESSDATA_PUNC_DAWG:
|
||||
dawg_type = DAWG_TYPE_PUNCTUATION;
|
||||
perm_type = PUNC_PERM;
|
||||
break;
|
||||
case TESSDATA_SYSTEM_DAWG:
|
||||
dawg_type = DAWG_TYPE_WORD;
|
||||
perm_type = SYSTEM_DAWG_PERM;
|
||||
break;
|
||||
case TESSDATA_NUMBER_DAWG:
|
||||
dawg_type = DAWG_TYPE_NUMBER;
|
||||
perm_type = NUMBER_PERM;
|
||||
break;
|
||||
case TESSDATA_BIGRAM_DAWG:
|
||||
dawg_type = DAWG_TYPE_WORD; // doesn't actually matter
|
||||
perm_type = COMPOUND_PERM; // doesn't actually matter
|
||||
break;
|
||||
case TESSDATA_UNAMBIG_DAWG:
|
||||
dawg_type = DAWG_TYPE_WORD;
|
||||
perm_type = SYSTEM_DAWG_PERM;
|
||||
break;
|
||||
case TESSDATA_FREQ_DAWG:
|
||||
dawg_type = DAWG_TYPE_WORD;
|
||||
perm_type = FREQ_DAWG_PERM;
|
||||
break;
|
||||
default:
|
||||
data_loader.End();
|
||||
return NULL;
|
||||
}
|
||||
SquishedDawg *retval =
|
||||
new SquishedDawg(fp, dawg_type, lang_, perm_type, dawg_debug_level_);
|
||||
data_loader.End();
|
||||
return retval;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
56
dict/dawg_cache.h
Normal file
56
dict/dawg_cache.h
Normal file
@ -0,0 +1,56 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: dawg_cache.h
|
||||
// Description: A class that knows about loading and caching dawgs.
|
||||
// Author: David Eger
|
||||
// Created: Fri Jan 27 12:08:00 PST 2012
|
||||
//
|
||||
// (C) Copyright 2012, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_DICT_DAWG_CACHE_H_
|
||||
#define TESSERACT_DICT_DAWG_CACHE_H_
|
||||
|
||||
#include "dawg.h"
|
||||
#include "object_cache.h"
|
||||
#include "strngs.h"
|
||||
#include "tessdatamanager.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class DawgCache {
|
||||
public:
|
||||
Dawg *GetSquishedDawg(
|
||||
const STRING &lang,
|
||||
const char *data_file_name,
|
||||
TessdataType tessdata_dawg_type,
|
||||
int debug_level);
|
||||
|
||||
// If we manage the given dawg, decrement its count,
|
||||
// and possibly delete it if the count reaches zero.
|
||||
// If dawg is unknown to us, return false.
|
||||
bool FreeDawg(Dawg *dawg) {
|
||||
return dawgs_.Free(dawg);
|
||||
}
|
||||
|
||||
// Free up any currently unused dawgs.
|
||||
void DeleteUnusedDawgs() {
|
||||
dawgs_.DeleteUnusedObjects();
|
||||
}
|
||||
|
||||
private:
|
||||
ObjectCache<Dawg> dawgs_;
|
||||
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_DICT_DAWG_CACHE_H_
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user