mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-27 12:49:35 +08:00
Merge branch 'master' of github.com:tesseract-ocr/tesseract
This commit is contained in:
commit
f369585f56
2
.gitignore
vendored
2
.gitignore
vendored
@ -59,6 +59,8 @@ training/wordlist2dawg
|
||||
*.o
|
||||
*.Plo
|
||||
*.a
|
||||
*.class
|
||||
*.jar
|
||||
|
||||
# tessdata
|
||||
*.cube.*
|
||||
|
2
COPYING
2
COPYING
@ -1,5 +1,5 @@
|
||||
This package contains the Tesseract Open Source OCR Engine.
|
||||
Orignally developed at Hewlett Packard Laboratories Bristol and
|
||||
Originally developed at Hewlett Packard Laboratories Bristol and
|
||||
at Hewlett Packard Co, Greeley Colorado, all the code
|
||||
in this distribution is now licensed under the Apache License:
|
||||
|
||||
|
@ -100,7 +100,7 @@ find its data directory. You must either:
|
||||
./autogen.sh
|
||||
./configure
|
||||
make
|
||||
make install
|
||||
sudo make install
|
||||
sudo ldconfig
|
||||
|
||||
to move the data files to the standard place, or:
|
||||
|
@ -1660,7 +1660,7 @@ char* TessBaseAPI::GetUNLVText() {
|
||||
word->word->space() > 0 &&
|
||||
!word->word->flag(W_FUZZY_NON) &&
|
||||
!word->word->flag(W_FUZZY_SP)) {
|
||||
/* Write a space to separate from preceeding good text */
|
||||
/* Write a space to separate from preceding good text */
|
||||
*ptr++ = ' ';
|
||||
last_char_was_tilde = false;
|
||||
}
|
||||
|
@ -178,7 +178,7 @@ void TessPDFRenderer::AppendPDFObject(const char *data) {
|
||||
AppendString((const char *)data);
|
||||
}
|
||||
|
||||
// Helper function to prevent us from accidentaly writing
|
||||
// Helper function to prevent us from accidentally writing
|
||||
// scientific notation to an HOCR or PDF file. Besides, three
|
||||
// decimal places are all you really need.
|
||||
double prec(double x) {
|
||||
|
@ -227,7 +227,7 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
|
||||
// We have 2 possible sources of pagesegmode: a config file and
|
||||
// the command line. For backwards compatability reasons, the
|
||||
// the command line. For backwards compatibility reasons, the
|
||||
// default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
|
||||
// default for this program is tesseract::PSM_AUTO. We will let
|
||||
// the config file take priority, so the command-line default
|
||||
|
@ -1556,7 +1556,7 @@ void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word,
|
||||
word->fix_quotes();
|
||||
if (tessedit_fix_hyphens)
|
||||
word->fix_hyphens();
|
||||
/* Dont trust fix_quotes! - though I think I've fixed the bug */
|
||||
/* Don't trust fix_quotes! - though I think I've fixed the bug */
|
||||
if (word->best_choice->length() != word->box_word->length()) {
|
||||
tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
|
||||
" #Blobs=%d\n",
|
||||
@ -1694,7 +1694,7 @@ ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(
|
||||
goto not_a_word;
|
||||
/*
|
||||
Allow a single hyphen in a lower case word
|
||||
- dont trust upper case - I've seen several cases of "H" -> "I-I"
|
||||
- don't trust upper case - I've seen several cases of "H" -> "I-I"
|
||||
*/
|
||||
if (lengths[i] == 1 && s[offset] == '-') {
|
||||
hyphen_pos = i;
|
||||
|
@ -129,7 +129,7 @@ inT16 Tesseract::count_outline_errs(char c, inT16 outline_count) {
|
||||
int expected_outline_count;
|
||||
|
||||
if (STRING (outlines_odd).contains (c))
|
||||
return 0; //Dont use this char
|
||||
return 0; //Don't use this char
|
||||
else if (STRING (outlines_2).contains (c))
|
||||
expected_outline_count = 2;
|
||||
else
|
||||
@ -157,7 +157,7 @@ void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it,
|
||||
* - Word segmentation is the same as the original image
|
||||
* - All characters have the expected number of outlines
|
||||
* NOTE - the rejection counts are recalculated after unrejection
|
||||
* - CANT do it in a single pass without a bit of fiddling
|
||||
* - CAN'T do it in a single pass without a bit of fiddling
|
||||
* - keep it simple but inefficient
|
||||
*************************************************************************/
|
||||
void Tesseract::unrej_good_quality_words( //unreject potential
|
||||
@ -403,7 +403,7 @@ void Tesseract::doc_and_block_rejection( //reject big chunks
|
||||
|
||||
/*************************************************************************
|
||||
* reject_whole_page()
|
||||
* Dont believe any of it - set the reject map to 00..00 in all words
|
||||
* Don't believe any of it - set the reject map to 00..00 in all words
|
||||
*
|
||||
*************************************************************************/
|
||||
|
||||
|
@ -55,7 +55,7 @@ void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor,
|
||||
WERD_RES *word_res;
|
||||
WERD_RES_LIST fuzzy_space_words;
|
||||
inT16 new_length;
|
||||
BOOL8 prevent_null_wd_fixsp; // DONT process blobless wds
|
||||
BOOL8 prevent_null_wd_fixsp; // DON'T process blobless wds
|
||||
inT32 word_index; // current word
|
||||
|
||||
block_res_it.set_to_list(&page_res->block_res_list);
|
||||
@ -222,7 +222,7 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
|
||||
* fuzzy spaces. The problem with the basic measure is that "561 63" would score
|
||||
* the same as "56163", though given our knowledge that the space is fuzzy, and
|
||||
* that there is a "1" next to the fuzzy space, we need to ensure that "56163"
|
||||
* is prefered.
|
||||
* is preferred.
|
||||
*
|
||||
* The solution is to NOT COUNT the score of any word which has a digit at one
|
||||
* end and a "1Il" as the character the other side of the space.
|
||||
@ -272,8 +272,8 @@ inT16 Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
|
||||
} else {
|
||||
/*
|
||||
Can we add the prev word score and potentially count this word?
|
||||
Yes IF it didnt end in a 1 when the first char of this word is a digit
|
||||
AND it didnt end in a digit when the first char of this word is a 1
|
||||
Yes IF it didn't end in a 1 when the first char of this word is a digit
|
||||
AND it didn't end in a digit when the first char of this word is a 1
|
||||
*/
|
||||
word_len = word->reject_map.length();
|
||||
current_word_ok_so_far = FALSE;
|
||||
@ -507,7 +507,7 @@ BOOL8 Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
|
||||
|
||||
/*
|
||||
Use all the standard pass 2 conditions for mode 5 in set_done() in
|
||||
reject.c BUT DONT REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DONT
|
||||
reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
|
||||
CARE WHETHER WE HAVE of/at on/an etc.
|
||||
*/
|
||||
if (fixsp_done_mode > 0 &&
|
||||
|
@ -297,7 +297,7 @@ UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
|
||||
/*************************************************************************
|
||||
* SUSPECT LEVELS
|
||||
*
|
||||
* 0 - dont reject ANYTHING
|
||||
* 0 - don't reject ANYTHING
|
||||
* 1,2 - partial rejection
|
||||
* 3 - BEST
|
||||
*
|
||||
@ -337,7 +337,7 @@ void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
|
||||
rating_per_ch = word.rating() / word_res->reject_map.length();
|
||||
|
||||
if (rating_per_ch >= suspect_rating_per_ch)
|
||||
return; //Dont touch bad ratings
|
||||
return; //Don't touch bad ratings
|
||||
|
||||
if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
|
||||
/* Unreject any Tess Acceptable word - but NOT tess reject chs*/
|
||||
|
@ -329,13 +329,13 @@ void ParamsEditor::WriteParams(char *filename,
|
||||
fclose(fp);
|
||||
sprintf (msg_str, "Overwrite file " "%s" "? (Y/N)", filename);
|
||||
int a = sv_window_->ShowYesNoDialog(msg_str);
|
||||
if (a == 'n') { return; } // dont write
|
||||
if (a == 'n') { return; } // don't write
|
||||
}
|
||||
|
||||
|
||||
fp = fopen (filename, "wb"); // can we write to it?
|
||||
if (fp == NULL) {
|
||||
sv_window_->AddMessage("Cant write to file " "%s" "", filename);
|
||||
sv_window_->AddMessage("Can't write to file " "%s" "", filename);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -521,7 +521,7 @@ BOOL8 Tesseract::word_contains_non_1_digit(const char *word,
|
||||
|
||||
/*************************************************************************
|
||||
* dont_allow_1Il()
|
||||
* Dont unreject LONE accepted 1Il conflict set chars
|
||||
* Don't unreject LONE accepted 1Il conflict set chars
|
||||
*************************************************************************/
|
||||
void Tesseract::dont_allow_1Il(WERD_RES *word) {
|
||||
int i = 0;
|
||||
@ -633,7 +633,7 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) {
|
||||
next_left = 9999;
|
||||
else
|
||||
next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
|
||||
// Dont touch small or touching blobs - it is too dangerous.
|
||||
// Don't touch small or touching blobs - it is too dangerous.
|
||||
if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
|
||||
(out_box.left() > prev_right) && (out_box.right() < next_left)) {
|
||||
aspect_ratio = out_box.width() / (float) out_box.height();
|
||||
|
@ -136,7 +136,7 @@ Tesseract::Tesseract()
|
||||
BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true,
|
||||
"Try to improve fuzzy spaces", this->params()),
|
||||
BOOL_MEMBER(tessedit_unrej_any_wd, false,
|
||||
"Dont bother with word plausibility", this->params()),
|
||||
"Don't bother with word plausibility", this->params()),
|
||||
BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?",
|
||||
this->params()),
|
||||
BOOL_MEMBER(tessedit_redo_xheight, true, "Check/Correct x-height",
|
||||
@ -310,19 +310,19 @@ Tesseract::Tesseract()
|
||||
this->params()),
|
||||
INT_MEMBER(crunch_pot_indicators, 1,
|
||||
"How many potential indicators needed", this->params()),
|
||||
BOOL_MEMBER(crunch_leave_ok_strings, true, "Dont touch sensible strings",
|
||||
BOOL_MEMBER(crunch_leave_ok_strings, true, "Don't touch sensible strings",
|
||||
this->params()),
|
||||
BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring",
|
||||
this->params()),
|
||||
BOOL_MEMBER(crunch_leave_accept_strings, false,
|
||||
"Dont pot crunch sensible strings", this->params()),
|
||||
"Don't pot crunch sensible strings", this->params()),
|
||||
BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures",
|
||||
this->params()),
|
||||
INT_MEMBER(crunch_leave_lc_strings, 4,
|
||||
"Dont crunch words with long lower case strings",
|
||||
"Don't crunch words with long lower case strings",
|
||||
this->params()),
|
||||
INT_MEMBER(crunch_leave_uc_strings, 4,
|
||||
"Dont crunch words with long lower case strings",
|
||||
"Don't crunch words with long lower case strings",
|
||||
this->params()),
|
||||
INT_MEMBER(crunch_long_repetitions, 3,
|
||||
"Crunch words with long repetitions", this->params()),
|
||||
@ -393,21 +393,21 @@ Tesseract::Tesseract()
|
||||
INT_MEMBER(suspect_space_level, 100,
|
||||
"Min suspect level for rejecting spaces", this->params()),
|
||||
INT_MEMBER(suspect_short_words, 2,
|
||||
"Dont Suspect dict wds longer than this", this->params()),
|
||||
"Don't suspect dict wds longer than this", this->params()),
|
||||
BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected",
|
||||
this->params()),
|
||||
double_MEMBER(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit",
|
||||
double_MEMBER(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit",
|
||||
this->params()),
|
||||
double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit",
|
||||
this->params()),
|
||||
BOOL_MEMBER(tessedit_minimal_rejection, false,
|
||||
"Only reject tess failures", this->params()),
|
||||
BOOL_MEMBER(tessedit_zero_rejection, false, "Dont reject ANYTHING",
|
||||
BOOL_MEMBER(tessedit_zero_rejection, false, "Don't reject ANYTHING",
|
||||
this->params()),
|
||||
BOOL_MEMBER(tessedit_word_for_word, false,
|
||||
"Make output have exactly one word per WERD", this->params()),
|
||||
BOOL_MEMBER(tessedit_zero_kelvin_rejection, false,
|
||||
"Dont reject ANYTHING AT ALL", this->params()),
|
||||
"Don't reject ANYTHING AT ALL", this->params()),
|
||||
BOOL_MEMBER(tessedit_consistent_reps, true,
|
||||
"Force all rep chars the same", this->params()),
|
||||
INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm",
|
||||
@ -424,7 +424,7 @@ Tesseract::Tesseract()
|
||||
"Use DOC dawg in 11l conf. detector", this->params()),
|
||||
BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test",
|
||||
this->params()),
|
||||
BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Dont double check",
|
||||
BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Don't double check",
|
||||
this->params()),
|
||||
BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control",
|
||||
this->params()),
|
||||
|
@ -733,7 +733,7 @@ class Tesseract : public Wordrec {
|
||||
GenericVector<UNICHAR_ID>* class_ids);
|
||||
// Resegments the word to achieve the target_text from the classifier.
|
||||
// Returns false if the re-segmentation fails.
|
||||
// Uses brute-force combination of upto kMaxGroupSize adjacent blobs, and
|
||||
// Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
|
||||
// applies a full search on the classifier results to find the best classified
|
||||
// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
|
||||
// substitutions ARE used.
|
||||
@ -833,7 +833,7 @@ class Tesseract : public Wordrec {
|
||||
BOOL_VAR_H(tessedit_fix_fuzzy_spaces, true,
|
||||
"Try to improve fuzzy spaces");
|
||||
BOOL_VAR_H(tessedit_unrej_any_wd, false,
|
||||
"Dont bother with word plausibility");
|
||||
"Don't bother with word plausibility");
|
||||
BOOL_VAR_H(tessedit_fix_hyphens, true, "Crunch double hyphens?");
|
||||
BOOL_VAR_H(tessedit_redo_xheight, true, "Check/Correct x-height");
|
||||
BOOL_VAR_H(tessedit_enable_doc_dict, true,
|
||||
@ -954,15 +954,15 @@ class Tesseract : public Wordrec {
|
||||
double_VAR_H(crunch_small_outlines_size, 0.6, "Small if lt xht x this");
|
||||
INT_VAR_H(crunch_rating_max, 10, "For adj length in rating per ch");
|
||||
INT_VAR_H(crunch_pot_indicators, 1, "How many potential indicators needed");
|
||||
BOOL_VAR_H(crunch_leave_ok_strings, true, "Dont touch sensible strings");
|
||||
BOOL_VAR_H(crunch_leave_ok_strings, true, "Don't touch sensible strings");
|
||||
BOOL_VAR_H(crunch_accept_ok, true, "Use acceptability in okstring");
|
||||
BOOL_VAR_H(crunch_leave_accept_strings, false,
|
||||
"Dont pot crunch sensible strings");
|
||||
"Don't pot crunch sensible strings");
|
||||
BOOL_VAR_H(crunch_include_numerals, false, "Fiddle alpha figures");
|
||||
INT_VAR_H(crunch_leave_lc_strings, 4,
|
||||
"Dont crunch words with long lower case strings");
|
||||
"Don't crunch words with long lower case strings");
|
||||
INT_VAR_H(crunch_leave_uc_strings, 4,
|
||||
"Dont crunch words with long lower case strings");
|
||||
"Don't crunch words with long lower case strings");
|
||||
INT_VAR_H(crunch_long_repetitions, 3, "Crunch words with long repetitions");
|
||||
INT_VAR_H(crunch_debug, 0, "As it says");
|
||||
INT_VAR_H(fixsp_non_noise_limit, 1,
|
||||
@ -1010,16 +1010,16 @@ class Tesseract : public Wordrec {
|
||||
INT_VAR_H(suspect_space_level, 100,
|
||||
"Min suspect level for rejecting spaces");
|
||||
INT_VAR_H(suspect_short_words, 2,
|
||||
"Dont Suspect dict wds longer than this");
|
||||
"Don't Suspect dict wds longer than this");
|
||||
BOOL_VAR_H(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected");
|
||||
double_VAR_H(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit");
|
||||
double_VAR_H(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit");
|
||||
double_VAR_H(suspect_accept_rating, -999.9, "Accept good rating limit");
|
||||
BOOL_VAR_H(tessedit_minimal_rejection, false, "Only reject tess failures");
|
||||
BOOL_VAR_H(tessedit_zero_rejection, false, "Dont reject ANYTHING");
|
||||
BOOL_VAR_H(tessedit_zero_rejection, false, "Don't reject ANYTHING");
|
||||
BOOL_VAR_H(tessedit_word_for_word, false,
|
||||
"Make output have exactly one word per WERD");
|
||||
BOOL_VAR_H(tessedit_zero_kelvin_rejection, false,
|
||||
"Dont reject ANYTHING AT ALL");
|
||||
"Don't reject ANYTHING AT ALL");
|
||||
BOOL_VAR_H(tessedit_consistent_reps, true, "Force all rep chars the same");
|
||||
INT_VAR_H(tessedit_reject_mode, 0, "Rejection algorithm");
|
||||
BOOL_VAR_H(tessedit_rejection_debug, false, "Adaption debug");
|
||||
@ -1030,7 +1030,7 @@ class Tesseract : public Wordrec {
|
||||
"Aspect ratio dot/hyphen test");
|
||||
BOOL_VAR_H(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector");
|
||||
BOOL_VAR_H(rej_1Il_use_dict_word, false, "Use dictword test");
|
||||
BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Dont double check");
|
||||
BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Don't double check");
|
||||
BOOL_VAR_H(rej_use_tess_accepted, true, "Individual rejection control");
|
||||
BOOL_VAR_H(rej_use_tess_blanks, true, "Individual rejection control");
|
||||
BOOL_VAR_H(rej_use_good_perm, true, "Individual rejection control");
|
||||
|
@ -33,7 +33,7 @@
|
||||
|
||||
ELISTIZE (BLOBNBOX) ELIST2IZE (TO_ROW) ELISTIZE (TO_BLOCK)
|
||||
|
||||
// Upto 30 degrees is allowed for rotations of diacritic blobs.
|
||||
// Up to 30 degrees is allowed for rotations of diacritic blobs.
|
||||
const double kCosSmallAngle = 0.866;
|
||||
// Min aspect ratio for a joined word to indicate an obvious flow direction.
|
||||
const double kDefiniteAspectRatio = 2.0;
|
||||
|
@ -35,7 +35,7 @@ FILE* OpenBoxFile(const STRING& fname) {
|
||||
FILE* box_file = NULL;
|
||||
if (!(box_file = fopen(filename.string(), "rb"))) {
|
||||
CANTOPENFILE.error("read_next_box", TESSEXIT,
|
||||
"Cant open box file %s",
|
||||
"Can't open box file %s",
|
||||
filename.string());
|
||||
}
|
||||
return box_file;
|
||||
|
@ -382,7 +382,7 @@ void DENORM::LocalDenormTransform(const FCOORD& pt, FCOORD* original) const {
|
||||
}
|
||||
|
||||
// Transforms the given coords all the way back to source image space using
|
||||
// the full transformation sequence defined by this and its predecesors
|
||||
// the full transformation sequence defined by this and its predecessors
|
||||
// recursively, shallowest first, and finally any block re_rotation.
|
||||
// If last_denorm is not NULL, then the last transformation used will
|
||||
// be last_denorm, and the block re_rotation will never be executed.
|
||||
|
@ -218,7 +218,7 @@ class DENORM {
|
||||
void LocalDenormTransform(const TPOINT& pt, TPOINT* original) const;
|
||||
void LocalDenormTransform(const FCOORD& pt, FCOORD* original) const;
|
||||
// Transforms the given coords all the way back to source image space using
|
||||
// the full transformation sequence defined by this and its predecesors
|
||||
// the full transformation sequence defined by this and its predecessors
|
||||
// recursively, shallowest first, and finally any block re_rotation.
|
||||
// If last_denorm is not NULL, then the last transformation used will
|
||||
// be last_denorm, and the block re_rotation will never be executed.
|
||||
|
@ -108,7 +108,7 @@ class PDBLK
|
||||
PDBLK & operator= (const PDBLK & source);
|
||||
|
||||
protected:
|
||||
POLY_BLOCK *hand_poly; //< wierd as well
|
||||
POLY_BLOCK *hand_poly; //< weird as well
|
||||
ICOORDELT_LIST leftside; //< left side vertices
|
||||
ICOORDELT_LIST rightside; //< right side vertices
|
||||
TBOX box; //< bounding box
|
||||
|
@ -16,7 +16,7 @@
|
||||
** limitations under the License.
|
||||
*
|
||||
|
||||
This module may look unneccessarily verbose, but here's the philosophy...
|
||||
This module may look unnecessarily verbose, but here's the philosophy...
|
||||
|
||||
ALL processing of the reject map is done in this module. There are lots of
|
||||
separate calls to set reject/accept flags. These have DELIBERATELY been kept
|
||||
@ -51,7 +51,7 @@ OF THIS IMPLIED TEMPORAL ORDERING OF THE FLAGS!!!!
|
||||
enum REJ_FLAGS
|
||||
{
|
||||
/* Reject modes which are NEVER overridden */
|
||||
R_TESS_FAILURE, // PERM Tess didnt classify
|
||||
R_TESS_FAILURE, // PERM Tess didn't classify
|
||||
R_SMALL_XHT, // PERM Xht too small
|
||||
R_EDGE_CHAR, // PERM Too close to edge of image
|
||||
R_1IL_CONFLICT, // PERM 1Il confusion
|
||||
@ -62,7 +62,7 @@ enum REJ_FLAGS
|
||||
|
||||
/* Initial reject modes (pre NN_ACCEPT) */
|
||||
R_POOR_MATCH, // TEMP Ray's original heuristic (Not used)
|
||||
R_NOT_TESS_ACCEPTED, // TEMP Tess didnt accept WERD
|
||||
R_NOT_TESS_ACCEPTED, // TEMP Tess didn't accept WERD
|
||||
R_CONTAINS_BLANKS, // TEMP Tess failed on other chs in WERD
|
||||
R_BAD_PERMUTER, // POTENTIAL Bad permuter for WERD
|
||||
|
||||
@ -82,7 +82,7 @@ enum REJ_FLAGS
|
||||
R_ROW_REJ, // TEMP Row rejection
|
||||
R_UNLV_REJ, // TEMP ~ turned to - or ^ turned to space
|
||||
|
||||
/* Accept modes which occur inbetween the above rejection groups */
|
||||
/* Accept modes which occur between the above rejection groups */
|
||||
R_NN_ACCEPT, //NN acceptance
|
||||
R_HYPHEN_ACCEPT, //Hyphen acceptance
|
||||
R_MM_ACCEPT, //Matrix match acceptance
|
||||
|
@ -204,7 +204,7 @@ double STATS::ile(double frac) const {
|
||||
/**********************************************************************
|
||||
* STATS::min_bucket
|
||||
*
|
||||
* Find REAL minimum bucket - ile(0.0) isnt necessarily correct
|
||||
* Find REAL minimum bucket - ile(0.0) isn't necessarily correct
|
||||
**********************************************************************/
|
||||
inT32 STATS::min_bucket() const { // Find min
|
||||
if (buckets_ == NULL || total_count_ == 0) {
|
||||
@ -219,7 +219,7 @@ inT32 STATS::min_bucket() const { // Find min
|
||||
/**********************************************************************
|
||||
* STATS::max_bucket
|
||||
*
|
||||
* Find REAL maximum bucket - ile(1.0) isnt necessarily correct
|
||||
* Find REAL maximum bucket - ile(1.0) isn't necessarily correct
|
||||
**********************************************************************/
|
||||
|
||||
inT32 STATS::max_bucket() const { // Find max
|
||||
@ -249,7 +249,7 @@ double STATS::median() const { //get median
|
||||
if ((total_count_ > 1) && (pile_count(median_pile) == 0)) {
|
||||
inT32 min_pile;
|
||||
inT32 max_pile;
|
||||
/* Find preceeding non zero pile */
|
||||
/* Find preceding non zero pile */
|
||||
for (min_pile = median_pile; pile_count(min_pile) == 0; min_pile--);
|
||||
/* Find following non zero pile */
|
||||
for (max_pile = median_pile; pile_count(max_pile) == 0; max_pile++);
|
||||
|
@ -23,7 +23,7 @@
|
||||
*
|
||||
********************************************************************************
|
||||
* Revision 5.1 89/07/27 11:47:50 11:47:50 ray ()
|
||||
* Added ratings acces methods.
|
||||
* Added ratings access methods.
|
||||
* This version ready for independent development.
|
||||
*/
|
||||
/*----------------------------------------------------------------------
|
||||
|
@ -190,7 +190,7 @@ const void *, const void *)) {
|
||||
|
||||
// Assuming list has been sorted already, insert new_data to
|
||||
// keep the list sorted according to the same comparison function.
|
||||
// Comparision function is the same as used by sort, i.e. uses double
|
||||
// Comparison function is the same as used by sort, i.e. uses double
|
||||
// indirection. Time is O(1) to add to beginning or end.
|
||||
// Time is linear to add pre-sorted items to an empty list.
|
||||
// If unique, then don't add duplicate entries.
|
||||
@ -513,7 +513,7 @@ CLIST_LINK *CLIST_ITERATOR::extract_sublist( //from
|
||||
|
||||
temp_it.mark_cycle_pt ();
|
||||
do { //walk sublist
|
||||
if (temp_it.cycled_list ()) //cant find end pt
|
||||
if (temp_it.cycled_list ()) //can't find end pt
|
||||
BAD_SUBLIST.error ("CLIST_ITERATOR.extract_sublist", ABORT, NULL);
|
||||
|
||||
if (temp_it.at_last ()) {
|
||||
|
@ -51,11 +51,11 @@ class DLLSYM CLIST_LINK
|
||||
}
|
||||
|
||||
CLIST_LINK( //copy constructor
|
||||
const CLIST_LINK &) { //dont copy link
|
||||
const CLIST_LINK &) { //don't copy link
|
||||
data = next = NULL;
|
||||
}
|
||||
|
||||
void operator= ( //dont copy links
|
||||
void operator= ( //don't copy links
|
||||
const CLIST_LINK &) {
|
||||
data = next = NULL;
|
||||
}
|
||||
@ -89,7 +89,7 @@ class DLLSYM CLIST
|
||||
void internal_deep_clear ( //destroy all links
|
||||
void (*zapper) (void *)); //ptr to zapper functn
|
||||
|
||||
void shallow_clear(); //clear list but dont
|
||||
void shallow_clear(); //clear list but don't
|
||||
//delete data elements
|
||||
|
||||
bool empty() const { //is list empty?
|
||||
@ -117,7 +117,7 @@ class DLLSYM CLIST
|
||||
|
||||
// Assuming list has been sorted already, insert new_data to
|
||||
// keep the list sorted according to the same comparison function.
|
||||
// Comparision function is the same as used by sort, i.e. uses double
|
||||
// Comparison function is the same as used by sort, i.e. uses double
|
||||
// indirection. Time is O(1) to add to beginning or end.
|
||||
// Time is linear to add pre-sorted items to an empty list.
|
||||
// If unique, then don't add duplicate entries.
|
||||
@ -232,7 +232,7 @@ class DLLSYM CLIST_ITERATOR
|
||||
BOOL8 cycled_list(); //Completed a cycle?
|
||||
|
||||
void add_to_end( //add at end &
|
||||
void *new_data); //dont move
|
||||
void *new_data); //don't move
|
||||
|
||||
void exchange( //positions of 2 links
|
||||
CLIST_ITERATOR *other_it); //other iterator
|
||||
@ -437,7 +437,7 @@ inline void CLIST_ITERATOR::add_before_then_move( // element to add
|
||||
/***********************************************************************
|
||||
* CLIST_ITERATOR::add_before_stay_put
|
||||
*
|
||||
* Add a new element to the list before the current element but dont move the
|
||||
* Add a new element to the list before the current element but don't move the
|
||||
* iterator to the new element.
|
||||
**********************************************************************/
|
||||
|
||||
@ -485,7 +485,7 @@ inline void CLIST_ITERATOR::add_before_stay_put( // element to add
|
||||
/***********************************************************************
|
||||
* CLIST_ITERATOR::add_list_after
|
||||
*
|
||||
* Insert another list to this list after the current element but dont move the
|
||||
* Insert another list to this list after the current element but don't move the
|
||||
* iterator.
|
||||
**********************************************************************/
|
||||
|
||||
@ -836,7 +836,7 @@ Replace <parm> with "<parm>". <parm> may be an arbitrary number of tokens
|
||||
|
||||
CLASSNAME is assumed to be the name of a class to be used in a CONS list
|
||||
|
||||
NOTE: Because we dont use virtual functions in the list code, the list code
|
||||
NOTE: Because we don't use virtual functions in the list code, the list code
|
||||
will NOT work correctly for classes derived from this.
|
||||
|
||||
The macro generates:
|
||||
@ -885,7 +885,7 @@ public: \
|
||||
CLASSNAME##_CLIST():CLIST() {} \
|
||||
/* constructor */ \
|
||||
\
|
||||
CLASSNAME##_CLIST( /* dont construct */ \
|
||||
CLASSNAME##_CLIST( /* don't construct */ \
|
||||
const CLASSNAME##_CLIST&) /*by initial assign*/ \
|
||||
{ DONT_CONSTRUCT_LIST_BY_COPY.error( QUOTE_IT( CLASSNAME##_CLIST ), \
|
||||
ABORT, NULL ); } \
|
||||
@ -963,7 +963,7 @@ CLISTIZEH_C( CLASSNAME )
|
||||
* A function which can delete a CLASSNAME element. This is passed to the \
|
||||
* generic deep_clear list member function so that when a list is cleared the \
|
||||
* elements on the list are properly destroyed from the base class, even \
|
||||
* though we dont use a virtual destructor function. \
|
||||
* though we don't use a virtual destructor function. \
|
||||
**********************************************************************/ \
|
||||
\
|
||||
DLLSYM void CLASSNAME##_c1_zapper( /*delete a link*/ \
|
||||
|
@ -117,7 +117,7 @@ inT32 ELIST::length() const { // count elements
|
||||
* ELIST::sort
|
||||
*
|
||||
* Sort elements on list
|
||||
* NB If you dont like the const declarations in the comparator, coerce yours:
|
||||
* NB If you don't like the const declarations in the comparator, coerce yours:
|
||||
* ( int (*)(const void *, const void *)
|
||||
**********************************************************************/
|
||||
|
||||
@ -161,7 +161,7 @@ const void *, const void *)) {
|
||||
|
||||
// Assuming list has been sorted already, insert new_link to
|
||||
// keep the list sorted according to the same comparison function.
|
||||
// Comparision function is the same as used by sort, i.e. uses double
|
||||
// Comparison function is the same as used by sort, i.e. uses double
|
||||
// indirection. Time is O(1) to add to beginning or end.
|
||||
// Time is linear to add pre-sorted items to an empty list.
|
||||
// If unique is set to true and comparator() returns 0 (an entry with the
|
||||
@ -455,7 +455,7 @@ ELIST_LINK *ELIST_ITERATOR::extract_sublist( //from
|
||||
|
||||
temp_it.mark_cycle_pt ();
|
||||
do { //walk sublist
|
||||
if (temp_it.cycled_list ()) //cant find end pt
|
||||
if (temp_it.cycled_list ()) //can't find end pt
|
||||
BAD_SUBLIST.error ("ELIST_ITERATOR.extract_sublist", ABORT, NULL);
|
||||
|
||||
if (temp_it.at_last ()) {
|
||||
|
@ -67,7 +67,7 @@ The implementation of lists is very careful about space and speed overheads.
|
||||
This is why many embedded lists are provided. The same concerns mean that
|
||||
in-line type coercion is done, rather than use virtual functions. This is
|
||||
cumbersome in that each data type to be listed requires its own iterator and
|
||||
list class - though macros can gererate these. It also prevents heterogenous
|
||||
list class - though macros can generate these. It also prevents heterogeneous
|
||||
lists.
|
||||
**********************************************************************/
|
||||
|
||||
@ -98,7 +98,7 @@ class DLLSYM ELIST_LINK
|
||||
next = NULL;
|
||||
}
|
||||
|
||||
void operator= ( //dont copy links
|
||||
void operator= ( //don't copy links
|
||||
const ELIST_LINK &) {
|
||||
next = NULL;
|
||||
}
|
||||
@ -158,7 +158,7 @@ class DLLSYM ELIST
|
||||
|
||||
// Assuming list has been sorted already, insert new_link to
|
||||
// keep the list sorted according to the same comparison function.
|
||||
// Comparision function is the same as used by sort, i.e. uses double
|
||||
// Comparison function is the same as used by sort, i.e. uses double
|
||||
// indirection. Time is O(1) to add to beginning or end.
|
||||
// Time is linear to add pre-sorted items to an empty list.
|
||||
// If unique is set to true and comparator() returns 0 (an entry with the
|
||||
@ -274,7 +274,7 @@ class DLLSYM ELIST_ITERATOR
|
||||
bool cycled_list(); //Completed a cycle?
|
||||
|
||||
void add_to_end( //add at end &
|
||||
ELIST_LINK *new_link); //dont move
|
||||
ELIST_LINK *new_link); //don't move
|
||||
|
||||
void exchange( //positions of 2 links
|
||||
ELIST_ITERATOR *other_it); //other iterator
|
||||
@ -470,7 +470,7 @@ inline void ELIST_ITERATOR::add_before_then_move( // element to add
|
||||
/***********************************************************************
|
||||
* ELIST_ITERATOR::add_before_stay_put
|
||||
*
|
||||
* Add a new element to the list before the current element but dont move the
|
||||
* Add a new element to the list before the current element but don't move the
|
||||
* iterator to the new element.
|
||||
**********************************************************************/
|
||||
|
||||
@ -515,7 +515,7 @@ inline void ELIST_ITERATOR::add_before_stay_put( // element to add
|
||||
/***********************************************************************
|
||||
* ELIST_ITERATOR::add_list_after
|
||||
*
|
||||
* Insert another list to this list after the current element but dont move the
|
||||
* Insert another list to this list after the current element but don't move the
|
||||
* iterator.
|
||||
**********************************************************************/
|
||||
|
||||
@ -868,7 +868,7 @@ Replace <parm> with "<parm>". <parm> may be an arbitrary number of tokens
|
||||
CLASSNAME is assumed to be the name of a class which has a baseclass of
|
||||
ELIST_LINK.
|
||||
|
||||
NOTE: Because we dont use virtual functions in the list code, the list code
|
||||
NOTE: Because we don't use virtual functions in the list code, the list code
|
||||
will NOT work correctly for classes derived from this.
|
||||
|
||||
The macros generate:
|
||||
@ -999,7 +999,7 @@ ELISTIZEH_C( CLASSNAME )
|
||||
* A function which can delete a CLASSNAME element. This is passed to the \
|
||||
* generic clear list member function so that when a list is cleared the \
|
||||
* elements on the list are properly destroyed from the base class, even \
|
||||
* though we dont use a virtual destructor function. \
|
||||
* though we don't use a virtual destructor function. \
|
||||
**********************************************************************/ \
|
||||
\
|
||||
DLLSYM void CLASSNAME##_zapper(ELIST_LINK* link) { \
|
||||
|
@ -118,7 +118,7 @@ inT32 ELIST2::length() const { // count elements
|
||||
* ELIST2::sort
|
||||
*
|
||||
* Sort elements on list
|
||||
* NB If you dont like the const declarations in the comparator, coerce yours:
|
||||
* NB If you don't like the const declarations in the comparator, coerce yours:
|
||||
* ( int (*)(const void *, const void *)
|
||||
**********************************************************************/
|
||||
|
||||
@ -162,7 +162,7 @@ const void *, const void *)) {
|
||||
|
||||
// Assuming list has been sorted already, insert new_link to
|
||||
// keep the list sorted according to the same comparison function.
|
||||
// Comparision function is the same as used by sort, i.e. uses double
|
||||
// Comparison function is the same as used by sort, i.e. uses double
|
||||
// indirection. Time is O(1) to add to beginning or end.
|
||||
// Time is linear to add pre-sorted items to an empty list.
|
||||
void ELIST2::add_sorted(int comparator(const void*, const void*),
|
||||
@ -475,7 +475,7 @@ ELIST2_LINK *ELIST2_ITERATOR::extract_sublist( //fr
|
||||
|
||||
temp_it.mark_cycle_pt ();
|
||||
do { //walk sublist
|
||||
if (temp_it.cycled_list ()) //cant find end pt
|
||||
if (temp_it.cycled_list ()) //can't find end pt
|
||||
BAD_SUBLIST.error ("ELIST2_ITERATOR.extract_sublist", ABORT, NULL);
|
||||
|
||||
if (temp_it.at_last ()) {
|
||||
|
@ -69,11 +69,11 @@ class DLLSYM ELIST2_LINK
|
||||
}
|
||||
|
||||
ELIST2_LINK( //copy constructor
|
||||
const ELIST2_LINK &) { //dont copy link
|
||||
const ELIST2_LINK &) { //don't copy link
|
||||
prev = next = NULL;
|
||||
}
|
||||
|
||||
void operator= ( //dont copy links
|
||||
void operator= ( //don't copy links
|
||||
const ELIST2_LINK &) {
|
||||
prev = next = NULL;
|
||||
}
|
||||
@ -133,7 +133,7 @@ class DLLSYM ELIST2
|
||||
|
||||
// Assuming list has been sorted already, insert new_link to
|
||||
// keep the list sorted according to the same comparison function.
|
||||
// Comparision function is the same as used by sort, i.e. uses double
|
||||
// Comparison function is the same as used by sort, i.e. uses double
|
||||
// indirection. Time is O(1) to add to beginning or end.
|
||||
// Time is linear to add pre-sorted items to an empty list.
|
||||
void add_sorted(int comparator(const void*, const void*),
|
||||
@ -241,7 +241,7 @@ class DLLSYM ELIST2_ITERATOR
|
||||
BOOL8 cycled_list(); //Completed a cycle?
|
||||
|
||||
void add_to_end( //add at end &
|
||||
ELIST2_LINK *new_link); //dont move
|
||||
ELIST2_LINK *new_link); //don't move
|
||||
|
||||
void exchange( //positions of 2 links
|
||||
ELIST2_ITERATOR *other_it); //other iterator
|
||||
@ -450,7 +450,7 @@ inline void ELIST2_ITERATOR::add_before_then_move( // element to add
|
||||
/***********************************************************************
|
||||
* ELIST2_ITERATOR::add_before_stay_put
|
||||
*
|
||||
* Add a new element to the list before the current element but dont move the
|
||||
* Add a new element to the list before the current element but don't move the
|
||||
* iterator to the new element.
|
||||
**********************************************************************/
|
||||
|
||||
@ -500,7 +500,7 @@ inline void ELIST2_ITERATOR::add_before_stay_put( // element to add
|
||||
/***********************************************************************
|
||||
* ELIST2_ITERATOR::add_list_after
|
||||
*
|
||||
* Insert another list to this list after the current element but dont move the
|
||||
* Insert another list to this list after the current element but don't move the
|
||||
* iterator.
|
||||
**********************************************************************/
|
||||
|
||||
@ -883,7 +883,7 @@ Replace <parm> with "<parm>". <parm> may be an arbitrary number of tokens
|
||||
CLASSNAME is assumed to be the name of a class which has a baseclass of
|
||||
ELIST2_LINK.
|
||||
|
||||
NOTE: Because we dont use virtual functions in the list code, the list code
|
||||
NOTE: Because we don't use virtual functions in the list code, the list code
|
||||
will NOT work correctly for classes derived from this.
|
||||
|
||||
The macro generates:
|
||||
@ -927,7 +927,7 @@ public: \
|
||||
CLASSNAME##_LIST():ELIST2() {} \
|
||||
/* constructor */ \
|
||||
\
|
||||
CLASSNAME##_LIST( /* dont construct */ \
|
||||
CLASSNAME##_LIST( /* don't construct */ \
|
||||
const CLASSNAME##_LIST&) /*by initial assign*/\
|
||||
{ DONT_CONSTRUCT_LIST_BY_COPY.error( QUOTE_IT( CLASSNAME##_LIST ), \
|
||||
ABORT, NULL ); } \
|
||||
@ -1015,7 +1015,7 @@ ELIST2IZEH_C( CLASSNAME )
|
||||
* A function which can delete a CLASSNAME element. This is passed to the \
|
||||
* generic clear list member function so that when a list is cleared the \
|
||||
* elements on the list are properly destroyed from the base class, even \
|
||||
* though we dont use a virtual destructor function. \
|
||||
* though we don't use a virtual destructor function. \
|
||||
**********************************************************************/ \
|
||||
\
|
||||
DLLSYM void CLASSNAME##_zapper( /*delete a link*/ \
|
||||
|
@ -53,7 +53,7 @@ enum TessErrorLogCode {
|
||||
#define LOC_DOC_BLK_REJ 22
|
||||
#define LOC_WRITE_RESULTS 23
|
||||
#define LOC_ADAPTIVE 24
|
||||
/* DONT DEFINE ANY LOCATION > 31 !!! */
|
||||
/* DON'T DEFINE ANY LOCATION > 31 !!! */
|
||||
|
||||
/* Sub location determines whether pass2 was in normal mode or fix xht mode*/
|
||||
#define SUBLOC_NORM 0
|
||||
|
@ -949,7 +949,7 @@ bool GenericVector<T>::SerializeClasses(tesseract::TFile* fp) const {
|
||||
|
||||
// Reads a vector of classes from the given file. Assumes the existence of
|
||||
// bool T::Deserialize(bool swap, FILE* fp) that returns false in case of
|
||||
// error. Alse needs T::T() and T::T(constT&), as init_to_size is used in
|
||||
// error. Also needs T::T() and T::T(constT&), as init_to_size is used in
|
||||
// this function. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
template <typename T>
|
||||
|
@ -61,8 +61,8 @@ class TRand {
|
||||
private:
|
||||
// Steps the generator to the next value.
|
||||
void Iterate() {
|
||||
seed_ *= 6364136223846793005;
|
||||
seed_ += 1442695040888963407;
|
||||
seed_ *= 6364136223846793005ULL;
|
||||
seed_ += 1442695040888963407ULL;
|
||||
}
|
||||
|
||||
// The current value of the seed.
|
||||
|
@ -38,6 +38,6 @@ const ERRCODE NULL_PREV = "Previous element on the list is NULL";
|
||||
const ERRCODE EMPTY_LIST = "List is empty";
|
||||
const ERRCODE BAD_PARAMETER = "List parameter error";
|
||||
const ERRCODE STILL_LINKED =
|
||||
"Attemting to add an element with non NULL links, to a list";
|
||||
"Attempting to add an element with non NULL links, to a list";
|
||||
#endif
|
||||
#endif
|
||||
|
@ -21,7 +21,7 @@
|
||||
* the HP OCR interface.
|
||||
* The code is designed to be used with either a C or C++ compiler.
|
||||
* The structures are designed to allow them to be used with any
|
||||
* structure alignment upto 8.
|
||||
* structure alignment up to 8.
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef CCUTIL_OCRCLASS_H_
|
||||
|
@ -45,7 +45,7 @@ const int kMaxDoubleSize = 15;
|
||||
*
|
||||
* The collection of MACROS provide different implementations depending
|
||||
* on whether the string keeps track of its strlen or not so that this
|
||||
* feature can be added in later when consumers dont modifify the string
|
||||
* feature can be added in later when consumers don't modify the string
|
||||
**********************************************************************/
|
||||
|
||||
// Smallest string to allocate by default
|
||||
@ -339,7 +339,7 @@ STRING& STRING::operator=(const STRING& str) {
|
||||
const STRING_HEADER* str_header = str.GetHeader();
|
||||
int str_used = str_header->used_;
|
||||
|
||||
GetHeader()->used_ = 0; // clear since ensure doesnt need to copy data
|
||||
GetHeader()->used_ = 0; // clear since ensure doesn't need to copy data
|
||||
char* this_cstr = ensure_cstr(str_used);
|
||||
STRING_HEADER* this_header = GetHeader();
|
||||
|
||||
@ -398,7 +398,7 @@ STRING & STRING::operator=(const char* cstr) {
|
||||
if (cstr) {
|
||||
int len = strlen(cstr) + 1;
|
||||
|
||||
this_header->used_ = 0; // dont bother copying data if need to realloc
|
||||
this_header->used_ = 0; // don't bother copying data if need to realloc
|
||||
char* this_cstr = ensure_cstr(len);
|
||||
this_header = GetHeader(); // for realloc
|
||||
memcpy(this_cstr, cstr, len);
|
||||
@ -416,7 +416,7 @@ STRING & STRING::operator=(const char* cstr) {
|
||||
|
||||
void STRING::assign(const char *cstr, int len) {
|
||||
STRING_HEADER* this_header = GetHeader();
|
||||
this_header->used_ = 0; // dont bother copying data if need to realloc
|
||||
this_header->used_ = 0; // don't bother copying data if need to realloc
|
||||
char* this_cstr = ensure_cstr(len + 1); // +1 for '\0'
|
||||
|
||||
this_header = GetHeader(); // for realloc
|
||||
|
@ -51,7 +51,7 @@ bool TessdataManager::Init(const char *data_file_name, int debug_level) {
|
||||
sizeof(actual_tessdata_num_entries_));
|
||||
}
|
||||
if (actual_tessdata_num_entries_ > TESSDATA_NUM_ENTRIES) {
|
||||
// For forward compatability, truncate to the number we can handle.
|
||||
// For forward compatibility, truncate to the number we can handle.
|
||||
actual_tessdata_num_entries_ = TESSDATA_NUM_ENTRIES;
|
||||
}
|
||||
fread(offset_table_, sizeof(inT64),
|
||||
|
@ -282,7 +282,7 @@ class TessdataManager {
|
||||
* same or smaller than TESSDATA_NUM_ENTRIES, but can never be larger,
|
||||
* since then it would be impossible to interpret the type of tessdata at
|
||||
* indices same and higher than TESSDATA_NUM_ENTRIES.
|
||||
* This parameter is used to allow for backward compatiblity
|
||||
* This parameter is used to allow for backward compatibility
|
||||
* when new tessdata types are introduced.
|
||||
*/
|
||||
inT32 actual_tessdata_num_entries_;
|
||||
|
@ -515,7 +515,7 @@ void Classify::EndAdaptiveClassifier() {
|
||||
* load_pre_trained_templates Indicates whether the pre-trained
|
||||
* templates (inttemp, normproto and pffmtable components)
|
||||
* should be loaded. Should only be set to true if the
|
||||
* necesary classifier components are present in the
|
||||
* necessary classifier components are present in the
|
||||
* [lang].traineddata file.
|
||||
* Globals:
|
||||
* BuiltInTemplatesFile file to get built-in temps from
|
||||
@ -1720,7 +1720,7 @@ bool Classify::LooksLikeGarbage(TBLOB *blob) {
|
||||
*
|
||||
* Globals:
|
||||
*
|
||||
* @return Number of features extracted or 0 if an error occured.
|
||||
* @return Number of features extracted or 0 if an error occurred.
|
||||
* @note Exceptions: none
|
||||
* @note History: Tue May 28 10:40:52 1991, DSJ, Created.
|
||||
*/
|
||||
@ -2082,7 +2082,7 @@ void Classify::PrintAdaptiveMatchResults(const ADAPT_RESULTS& results) {
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
/**
|
||||
* This routine steps thru each matching class in Results
|
||||
* This routine steps through each matching class in Results
|
||||
* and removes it from the match list if its rating
|
||||
* is worse than the BestRating plus a pad. In other words,
|
||||
* all good matches get moved to the front of the classes
|
||||
|
@ -151,7 +151,7 @@ Classify::Classify()
|
||||
INT_MEMBER(classify_integer_matcher_multiplier, 10,
|
||||
"Integer Matcher Multiplier 0-255: ", this->params()),
|
||||
EnableLearning(true),
|
||||
INT_MEMBER(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word",
|
||||
INT_MEMBER(il1_adaption_test, 0, "Don't adapt to i/I at beginning of word",
|
||||
this->params()),
|
||||
BOOL_MEMBER(classify_bln_numeric_mode, 0,
|
||||
"Assume the input is numbers [0-9].", this->params()),
|
||||
|
@ -495,7 +495,7 @@ class Classify : public CCStruct {
|
||||
// font combinations that the shape represents.
|
||||
UnicityTable<FontSet> fontset_table_;
|
||||
|
||||
INT_VAR_H(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word");
|
||||
INT_VAR_H(il1_adaption_test, 0, "Don't adapt to i/I at beginning of word");
|
||||
BOOL_VAR_H(classify_bln_numeric_mode, 0,
|
||||
"Assume the input is numbers [0-9].");
|
||||
double_VAR_H(speckle_large_max_size, 0.30, "Max large speckle size");
|
||||
|
@ -182,7 +182,7 @@ struct BUCKETS {
|
||||
FLOAT64 ChiSquared; // test threshold
|
||||
uinT16 NumberOfBuckets; // number of cells in histogram
|
||||
uinT16 Bucket[BUCKETTABLESIZE];// mapping to histogram buckets
|
||||
uinT32 *Count; // frequency of occurence histogram
|
||||
uinT32 *Count; // frequency of occurrence histogram
|
||||
FLOAT32 *ExpectedCount; // expected histogram
|
||||
};
|
||||
|
||||
|
@ -24,7 +24,7 @@
|
||||
#include <stdio.h>
|
||||
|
||||
/*-------------------------------------------------------------------------
|
||||
Public Funtion Prototype
|
||||
Public Function Prototype
|
||||
--------------------------------------------------------------------------*/
|
||||
uinT16 ReadSampleSize(FILE *File);
|
||||
|
||||
|
@ -285,7 +285,7 @@ CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs,
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
/**
|
||||
* Search thru all features currently defined and return
|
||||
* Search through all features currently defined and return
|
||||
* the feature type for the feature with the specified short
|
||||
* name. Trap an error if the specified name is not found.
|
||||
*
|
||||
|
@ -44,7 +44,7 @@ using tesseract::TrainingSample;
|
||||
// The entries are in binary degrees where a full circle is 256 binary degrees.
|
||||
static float cos_table[INT_CHAR_NORM_RANGE];
|
||||
static float sin_table[INT_CHAR_NORM_RANGE];
|
||||
// Guards write access to AtanTable so we dont create it more than once.
|
||||
// Guards write access to AtanTable so we don't create it more than once.
|
||||
tesseract::CCUtilMutex atan_table_mutex;
|
||||
|
||||
|
||||
|
@ -521,7 +521,7 @@ bool KDTreeSearch::BoxIntersectsSearch(FLOAT32 *lower, FLOAT32 *upper) {
|
||||
* Walk a tree, calling action once on each node.
|
||||
*
|
||||
* Operation:
|
||||
* This routine walks thru the specified sub_tree and invokes action
|
||||
* This routine walks through the specified sub_tree and invokes action
|
||||
* at each node as follows:
|
||||
* action(context, data, level)
|
||||
* data the data contents of the node being visited,
|
||||
|
@ -104,7 +104,7 @@ LIST ConvertOutlines(TESSLINE *outline,
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
/**
|
||||
* This routine searches thru the specified outline, computes
|
||||
* This routine searches through the specified outline, computes
|
||||
* a slope for each vector in the outline, and marks each
|
||||
* vector as having one of the following directions:
|
||||
* N, S, E, W, NE, NW, SE, SW
|
||||
@ -182,7 +182,7 @@ void FreeOutlines(LIST Outlines) {
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
/**
|
||||
* This routine searches thru the specified outline and finds
|
||||
* This routine searches through the specified outline and finds
|
||||
* the points at which the outline changes direction. These
|
||||
* points are then marked as "extremities". This routine is
|
||||
* used as an alternative to FindExtremities(). It forces the
|
||||
|
@ -147,7 +147,7 @@ void ConvertSegmentToPicoFeat(FPOINT *Start,
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
/**
|
||||
* This routine steps thru the specified outline and cuts it
|
||||
* This routine steps through the specified outline and cuts it
|
||||
* up into pieces of equal length. These pieces become the
|
||||
* desired pico-features. Each segment in the outline
|
||||
* is converted into an integral number of pico-features.
|
||||
|
@ -93,7 +93,7 @@ void BeamSearch::CreateChildren(SearchColumn *out_col, LangModel *lang_mod,
|
||||
} // lm_edges
|
||||
}
|
||||
|
||||
// Performs a beam seach in the specified search using the specified
|
||||
// Performs a beam search in the specified search using the specified
|
||||
// language model; returns an alternate list of possible words as a result.
|
||||
WordAltList * BeamSearch::Search(SearchObject *srch_obj, LangModel *lang_mod) {
|
||||
// verifications
|
||||
|
@ -45,7 +45,7 @@ class BeamSearch {
|
||||
public:
|
||||
explicit BeamSearch(CubeRecoContext *cntxt, bool word_mode = true);
|
||||
~BeamSearch();
|
||||
// Performs a beam seach in the specified search using the specified
|
||||
// Performs a beam search in the specified search using the specified
|
||||
// language model; returns an alternate list of possible words as a result.
|
||||
WordAltList *Search(SearchObject *srch_obj, LangModel *lang_mod = NULL);
|
||||
// Returns the best node in the last column of last performed search.
|
||||
|
@ -72,7 +72,7 @@ bool ConvNetCharClassifier::Train(CharSamp *char_samp, int ClassID) {
|
||||
|
||||
/**
|
||||
* A secondary function needed for training. Allows the trainer to set the
|
||||
* value of any train-time paramter. This function is currently not
|
||||
* value of any train-time parameter. This function is currently not
|
||||
* implemented. TODO(ahmadab): implement end-2-end training
|
||||
*/
|
||||
bool ConvNetCharClassifier::SetLearnParam(char *var_name, float val) {
|
||||
|
@ -55,7 +55,7 @@ class ConvNetCharClassifier : public CharClassifier {
|
||||
// is currently not implemented. TODO(ahmadab): implement end-2-end training
|
||||
virtual bool Train(CharSamp *char_samp, int ClassID);
|
||||
// A secondary function needed for training. Allows the trainer to set the
|
||||
// value of any train-time paramter. This function is currently not
|
||||
// value of any train-time parameter. This function is currently not
|
||||
// implemented. TODO(ahmadab): implement end-2-end training
|
||||
virtual bool SetLearnParam(char *var_name, float val);
|
||||
// Externally sets the Neural Net used by the classifier. Used for training
|
||||
|
@ -247,7 +247,7 @@ int CubeLineObject::ComputeWordBreakThreshold(int con_comp_cnt,
|
||||
word_break_threshold--;
|
||||
} while (!valid && word_break_threshold > 0);
|
||||
|
||||
// failed to find a threshold that acheives the target aspect ratio.
|
||||
// failed to find a threshold that achieves the target aspect ratio.
|
||||
// Just use the default threshold
|
||||
return static_cast<int>(line_pix_->h *
|
||||
cntxt_->Params()->MaxSpaceHeightRatio());
|
||||
|
@ -237,7 +237,7 @@ Pixa *CubeLineSegmenter::CrackLine(Pix *cracked_line_pix,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// split a line continously until valid or fail
|
||||
// split a line continuously until valid or fail
|
||||
Pixa *CubeLineSegmenter::SplitLine(Pix *line_mask_pix, Box *line_box) {
|
||||
// clone the line mask
|
||||
Pix *line_pix = pixClone(line_mask_pix);
|
||||
@ -739,7 +739,7 @@ bool CubeLineSegmenter::LineSegment() {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Estimate the paramters of the font(s) used in the page
|
||||
// Estimate the parameters of the font(s) used in the page
|
||||
bool CubeLineSegmenter::EstimateFontParams() {
|
||||
int hgt_hist[kHgtBins];
|
||||
int max_hgt;
|
||||
|
@ -212,7 +212,7 @@ CharSamp *CubeSearchObject::CharSample(int start_pt, int end_pt) {
|
||||
samp->SetLastChar(last_char ? 255 : 0);
|
||||
} else {
|
||||
// for non cursive languages, these features correspond
|
||||
// to whether the charsamp is at the begining or end of the word
|
||||
// to whether the charsamp is at the beginning or end of the word
|
||||
samp->SetFirstChar((start_pt == -1) ? 255 : 0);
|
||||
samp->SetLastChar((end_pt == (segment_cnt_ - 1)) ? 255 : 0);
|
||||
}
|
||||
|
@ -114,7 +114,7 @@ class CubeSearchObject : public SearchObject {
|
||||
end_pt <= (start_pt + max_seg_per_char_));
|
||||
}
|
||||
// computes the space and no space costs at gaps between segments
|
||||
// return true on sucess
|
||||
// return true on success
|
||||
bool ComputeSpaceCosts();
|
||||
};
|
||||
}
|
||||
|
@ -72,7 +72,7 @@ bool HybridNeuralNetCharClassifier::Train(CharSamp *char_samp, int ClassID) {
|
||||
}
|
||||
|
||||
// A secondary function needed for training. Allows the trainer to set the
|
||||
// value of any train-time paramter. This function is currently not
|
||||
// value of any train-time parameter. This function is currently not
|
||||
// implemented. TODO(ahmadab): implement end-2-end training
|
||||
bool HybridNeuralNetCharClassifier::SetLearnParam(char *var_name, float val) {
|
||||
// TODO(ahmadab): implementation of parameter initializing.
|
||||
@ -151,7 +151,7 @@ bool HybridNeuralNetCharClassifier::RunNets(CharSamp *char_samp) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// go thru all the nets
|
||||
// go through all the nets
|
||||
memset(net_output_, 0, class_cnt * sizeof(*net_output_));
|
||||
float *inputs = net_input_;
|
||||
for (int net_idx = 0; net_idx < nets_.size(); net_idx++) {
|
||||
|
@ -48,7 +48,7 @@ class HybridNeuralNetCharClassifier : public CharClassifier {
|
||||
// is currently not implemented. TODO(ahmadab): implement end-2-end training
|
||||
virtual bool Train(CharSamp *char_samp, int ClassID);
|
||||
// A secondary function needed for training. Allows the trainer to set the
|
||||
// value of any train-time paramter. This function is currently not
|
||||
// value of any train-time parameter. This function is currently not
|
||||
// implemented. TODO(ahmadab): implement end-2-end training
|
||||
virtual bool SetLearnParam(char *var_name, float val);
|
||||
// Externally sets the Neural Net used by the classifier. Used for training
|
||||
|
@ -397,7 +397,7 @@ int TessLangModel::NumberEdges(EDGE_REF edge_ref, LangModEdge **edge_array) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// go thru all valid transitions from the state
|
||||
// go through all valid transitions from the state
|
||||
int edge_cnt = 0;
|
||||
|
||||
EDGE_REF new_edge_ref;
|
||||
|
@ -37,7 +37,7 @@
|
||||
#include "oldlist.h"
|
||||
|
||||
/*----------------------------------------------------------------------------
|
||||
Public Funtion Prototypes
|
||||
Public Function Prototypes
|
||||
--------------------------------------------------------------------------*/
|
||||
LIST read_list(const char *filename);
|
||||
#endif
|
||||
|
@ -407,7 +407,7 @@ LIST s_adjoin(LIST var_list, void *variable, int_compare compare) {
|
||||
*
|
||||
* Search list, return NIL_LIST if not found. Return the list starting from
|
||||
* the item if found. The compare routine "is_equal" is passed in as
|
||||
* the third paramter to this routine. If the value NULL is supplied
|
||||
* the third parameter to this routine. If the value NULL is supplied
|
||||
* for is_equal, the is_key routine will be used.
|
||||
**********************************************************************/
|
||||
LIST search(LIST list, void *key, int_compare is_equal) {
|
||||
|
@ -234,7 +234,7 @@ first_node (list_rest (l))
|
||||
first_node (list_rest (list_rest (l)))
|
||||
|
||||
/*----------------------------------------------------------------------
|
||||
Public Funtion Prototypes
|
||||
Public Function Prototypes
|
||||
----------------------------------------------------------------------*/
|
||||
int count(LIST var_list);
|
||||
|
||||
|
@ -33,7 +33,7 @@ static const int kMinAbsoluteGarbageWordLength = 10;
|
||||
static const float kMinAbsoluteGarbageAlphanumFrac = 0.5f;
|
||||
|
||||
const int case_state_table[6][4] = { {
|
||||
/* 0. Begining of word */
|
||||
/* 0. Beginning of word */
|
||||
/* P U L D */
|
||||
/* -1. Error on case */
|
||||
0, 1, 5, 4
|
||||
|
@ -447,7 +447,7 @@ class SquishedDawg : public Dawg {
|
||||
EDGE_REF edge = node;
|
||||
if (!edge_occupied(edge) || edge == NO_EDGE) return;
|
||||
assert(forward_edge(edge)); // we don't expect any backward edges to
|
||||
do { // be present when this funciton is called
|
||||
do { // be present when this function is called
|
||||
if (!word_end || end_of_word_from_edge_rec(edges_[edge])) {
|
||||
vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
|
||||
}
|
||||
|
@ -127,7 +127,7 @@ Dict::Dict(CCUtil* ccutil)
|
||||
" when there is a need to explore all segmentations",
|
||||
getCCUtil()->params()),
|
||||
BOOL_MEMBER(save_raw_choices, false,
|
||||
"Deprecated- backward compatablity only",
|
||||
"Deprecated- backward compatibility only",
|
||||
getCCUtil()->params()),
|
||||
INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
|
||||
"Max words to keep in list",
|
||||
|
@ -614,7 +614,7 @@ class Dict {
|
||||
"Make AcceptableChoice() always return false. Useful"
|
||||
" when there is a need to explore all segmentations");
|
||||
BOOL_VAR_H(save_raw_choices, false,
|
||||
"Deprecated- backward compatability only");
|
||||
"Deprecated- backward compatibility only");
|
||||
INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
|
||||
STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information"
|
||||
" should be printed to stdout");
|
||||
|
@ -303,7 +303,7 @@ void Dict::append_choices(
|
||||
*
|
||||
* The given prev_char_frag_info contains:
|
||||
* - fragment: if not NULL contains information about immediately
|
||||
* preceeding fragmented character choice
|
||||
* preceding fragmented character choice
|
||||
* - num_fragments: number of fragments that have been used so far
|
||||
* to construct a character
|
||||
* - certainty: certainty of the current choice or minimum
|
||||
|
@ -1657,7 +1657,7 @@ EXTRA_PACKAGES =
|
||||
# following commands have a special meaning inside the header: $title,
|
||||
# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
|
||||
# $projectbrief, $projectlogo. Doxygen will replace $title with the empty string,
|
||||
# for the replacement values of the other commands the user is refered to
|
||||
# for the replacement values of the other commands the user is referred to
|
||||
# HTML_HEADER.
|
||||
# This tag requires that the tag GENERATE_LATEX is set to YES.
|
||||
|
||||
|
@ -42,18 +42,22 @@ SCROLLVIEW_LIBS = \
|
||||
CLASSPATH = $(srcdir)/piccolo2d-core-3.0.jar:$(srcdir)/piccolo2d-extras-3.0.jar
|
||||
|
||||
ScrollView.jar : $(SCROLLVIEW_CLASSES)
|
||||
$(JAR) cf $@ com/google/scrollview/*.class \
|
||||
$(JAR) cfm $@ Manifest.txt com/google/scrollview/*.class \
|
||||
com/google/scrollview/events/*.class com/google/scrollview/ui/*.class
|
||||
|
||||
$(SCROLLVIEW_CLASSES) : $(SCROLLVIEW_FILES)
|
||||
$(JAVAC) -encoding UTF8 -sourcepath $(srcdir) -classpath $(CLASSPATH) $(SCROLLVIEW_FILES) -d $(builddir)
|
||||
|
||||
fetch-jars :
|
||||
curl -L http://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-core/3.0/piccolo2d-core-3.0.jar > piccolo2d-core-3.0.jar
|
||||
curl -L http://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-extras/3.0/piccolo2d-extras-3.0.jar > piccolo2d-extras-3.0.jar
|
||||
|
||||
.PHONY: install-jars
|
||||
install-jars : ScrollView.jar
|
||||
@if [ ! -d $(scrollview_path) ]; then mkdir -p $(scrollview_path); fi;
|
||||
$(INSTALL) -m 644 $(SCROLLVIEW_LIBS) $(scrollview_path);
|
||||
$(INSTALL) -m 644 ScrollView.jar $(scrollview_path);
|
||||
@echo "Don't forget to set eviroment variable SCROLLVIEW_PATH to $(scrollview_path)";
|
||||
@echo "Don't forget to set environment variable SCROLLVIEW_PATH to $(scrollview_path)";
|
||||
|
||||
uninstall:
|
||||
rm -f $(scrollview_path)/*.jar
|
||||
|
2
java/Manifest.txt
Normal file
2
java/Manifest.txt
Normal file
@ -0,0 +1,2 @@
|
||||
Main-Class: com/google/scrollview/ScrollView
|
||||
Class-Path: ScrollView.jar piccolo2d-core-3.0.jar piccolo2d-extras-3.0.jar
|
@ -50,7 +50,7 @@ public class SVMenuBar implements ActionListener {
|
||||
|
||||
|
||||
/**
|
||||
* A click on one of the items in our menubar has occured. Forward it
|
||||
* A click on one of the items in our menubar has occurred. Forward it
|
||||
* to the item itself to let it decide what happens.
|
||||
*/
|
||||
public void actionPerformed(ActionEvent e) {
|
||||
@ -111,7 +111,7 @@ public class SVMenuBar implements ActionListener {
|
||||
* @param name The caption of the new entry.
|
||||
* @param id The Id of the new entry. If it is -1, the entry will be treated
|
||||
* as a menu.
|
||||
* @param b Whether the entry is initally flagged.
|
||||
* @param b Whether the entry is initially flagged.
|
||||
*
|
||||
*/
|
||||
|
||||
|
@ -123,7 +123,7 @@ public class SVPopupMenu implements ActionListener {
|
||||
|
||||
|
||||
/**
|
||||
* A click on one of the items in our menubar has occured. Forward it
|
||||
* A click on one of the items in our menubar has occurred. Forward it
|
||||
* to the item itself to let it decide what happens.
|
||||
*/
|
||||
public void actionPerformed(ActionEvent e) {
|
||||
|
@ -298,7 +298,7 @@ public class SVWindow extends JFrame {
|
||||
ta.setEditable(false);
|
||||
getContentPane().add(ta, BorderLayout.SOUTH);
|
||||
}
|
||||
// We need to make the window bigger to accomodate the message box.
|
||||
// We need to make the window bigger to accommodate the message box.
|
||||
winSizeY += DEF_MESSAGEBOX_HEIGHT;
|
||||
setSize(winSizeX, winSizeY);
|
||||
}
|
||||
|
@ -780,7 +780,7 @@ VERTICAL_FONTS=( \
|
||||
# holds the text corpus file for the language, used in phase F
|
||||
# ${FONTS[@]}
|
||||
# holds a sequence of applicable fonts for the language, used in
|
||||
# phase F & I
|
||||
# phase F & I. only set if not already set, i.e. from command line
|
||||
# ${TRAINING_DATA_ARGUMENTS}
|
||||
# non-default arguments to the training_data program used in phase T
|
||||
# ${FILTER_ARGUMENTS} -
|
||||
@ -794,7 +794,6 @@ set_lang_specific_parameters() {
|
||||
local lang=$1
|
||||
# The default text location is now given directly from the language code.
|
||||
TEXT_CORPUS="${FLAGS_webtext_prefix}/${lang}.corpus.txt"
|
||||
FONTS=( "${LATIN_FONTS[@]}" )
|
||||
FILTER_ARGUMENTS=""
|
||||
WORDLIST2DAWG_ARGUMENTS=""
|
||||
# These dawg factors represent the fraction of the corpus not covered by the
|
||||
@ -816,30 +815,30 @@ set_lang_specific_parameters() {
|
||||
case ${lang} in
|
||||
# Latin languages.
|
||||
enm ) TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported
|
||||
FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
|
||||
test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
|
||||
frm ) TEXT_CORPUS="${FLAGS_webtext_prefix}/fra.corpus.txt"
|
||||
# Make long-s substitutions for Middle French text
|
||||
FILTER_ARGUMENTS="--make_early_language_variant=fra"
|
||||
TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported.
|
||||
FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
|
||||
test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
|
||||
frk ) TEXT_CORPUS="${FLAGS_webtext_prefix}/deu.corpus.txt"
|
||||
FONTS=( "${FRAKTUR_FONTS[@]}" );;
|
||||
test -z "$FONTS" && FONTS=( "${FRAKTUR_FONTS[@]}" );;
|
||||
ita_old )
|
||||
TEXT_CORPUS="${FLAGS_webtext_prefix}/ita.corpus.txt"
|
||||
# Make long-s substitutions for Early Italian text
|
||||
FILTER_ARGUMENTS="--make_early_language_variant=ita"
|
||||
TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported.
|
||||
FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
|
||||
test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
|
||||
spa_old )
|
||||
TEXT_CORPUS="${FLAGS_webtext_prefix}/spa.corpus.txt"
|
||||
# Make long-s substitutions for Early Spanish text
|
||||
FILTER_ARGUMENTS="--make_early_language_variant=spa"
|
||||
TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported.
|
||||
FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
|
||||
test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
|
||||
srp_latn )
|
||||
TEXT_CORPUS=${FLAGS_webtext_prefix}/srp.corpus.txt ;;
|
||||
vie ) TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
|
||||
FONTS=( "${VIETNAMESE_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${VIETNAMESE_FONTS[@]}" ) ;;
|
||||
# Highly inflective languages get a bigger dawg size.
|
||||
# TODO(rays) Add more here!
|
||||
hun ) WORD_DAWG_SIZE=1000000 ;;
|
||||
@ -899,14 +898,14 @@ set_lang_specific_parameters() {
|
||||
# Strip unrenderable words as not all fonts will render the extended
|
||||
# latin symbols found in Vietnamese text.
|
||||
WORD_DAWG_SIZE=1000000
|
||||
FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
|
||||
test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
|
||||
|
||||
# Cyrillic script-based languages.
|
||||
rus ) FONTS=( "${RUSSIAN_FONTS[@]}" )
|
||||
rus ) test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" )
|
||||
NUMBER_DAWG_FACTOR=0.05
|
||||
WORD_DAWG_SIZE=1000000 ;;
|
||||
aze_cyrl | bel | bul | kaz | mkd | srp | tgk | ukr | uzb_cyrl )
|
||||
FONTS=( "${RUSSIAN_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" ) ;;
|
||||
|
||||
# Special code for performing Cyrillic language-id that is trained on
|
||||
# Russian, Serbian, Ukranian, Belarusian, Macedonian, Tajik and Mongolian
|
||||
@ -916,70 +915,70 @@ set_lang_specific_parameters() {
|
||||
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
|
||||
GENERATE_WORD_BIGRAMS=0
|
||||
WORD_DAWG_SIZE=1000000
|
||||
FONTS=( "${RUSSIAN_FONTS[@]}" );;
|
||||
test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" );;
|
||||
|
||||
# South Asian scripts mostly have a lot of different graphemes, so trim
|
||||
# down the MEAN_COUNT so as not to get a huge amount of text.
|
||||
asm | ben )
|
||||
MEAN_COUNT="15"
|
||||
WORD_DAWG_FACTOR=0.15
|
||||
FONTS=( "${BENGALI_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${BENGALI_FONTS[@]}" ) ;;
|
||||
bih | hin | mar | nep | san )
|
||||
MEAN_COUNT="15"
|
||||
WORD_DAWG_FACTOR=0.15
|
||||
FONTS=( "${DEVANAGARI_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${DEVANAGARI_FONTS[@]}" ) ;;
|
||||
bod ) MEAN_COUNT="15"
|
||||
WORD_DAWG_FACTOR=0.15
|
||||
FONTS=( "${TIBETAN_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${TIBETAN_FONTS[@]}" ) ;;
|
||||
dzo )
|
||||
WORD_DAWG_FACTOR=0.01
|
||||
FONTS=( "${TIBETAN_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${TIBETAN_FONTS[@]}" ) ;;
|
||||
guj ) MEAN_COUNT="15"
|
||||
WORD_DAWG_FACTOR=0.15
|
||||
FONTS=( "${GUJARATI_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${GUJARATI_FONTS[@]}" ) ;;
|
||||
kan ) MEAN_COUNT="15"
|
||||
WORD_DAWG_FACTOR=0.15
|
||||
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
|
||||
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
|
||||
FONTS=( "${KANNADA_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${KANNADA_FONTS[@]}" ) ;;
|
||||
mal ) MEAN_COUNT="15"
|
||||
WORD_DAWG_FACTOR=0.15
|
||||
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
|
||||
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
|
||||
FONTS=( "${MALAYALAM_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${MALAYALAM_FONTS[@]}" ) ;;
|
||||
ori )
|
||||
WORD_DAWG_FACTOR=0.01
|
||||
FONTS=( "${ORIYA_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${ORIYA_FONTS[@]}" ) ;;
|
||||
pan ) MEAN_COUNT="15"
|
||||
WORD_DAWG_FACTOR=0.01
|
||||
FONTS=( "${PUNJABI_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${PUNJABI_FONTS[@]}" ) ;;
|
||||
sin ) MEAN_COUNT="15"
|
||||
WORD_DAWG_FACTOR=0.01
|
||||
FONTS=( "${SINHALA_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${SINHALA_FONTS[@]}" ) ;;
|
||||
tam ) MEAN_COUNT="30"
|
||||
WORD_DAWG_FACTOR=0.15
|
||||
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
|
||||
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
|
||||
FONTS=( "${TAMIL_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${TAMIL_FONTS[@]}" ) ;;
|
||||
tel ) MEAN_COUNT="15"
|
||||
WORD_DAWG_FACTOR=0.15
|
||||
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
|
||||
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
|
||||
FONTS=( "${TELUGU_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${TELUGU_FONTS[@]}" ) ;;
|
||||
|
||||
# SouthEast Asian scripts.
|
||||
khm ) MEAN_COUNT="15"
|
||||
WORD_DAWG_FACTOR=0.15
|
||||
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
|
||||
FONTS=( "${KHMER_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${KHMER_FONTS[@]}" ) ;;
|
||||
lao ) MEAN_COUNT="15"
|
||||
WORD_DAWG_FACTOR=0.15
|
||||
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
|
||||
FONTS=( "${LAOTHIAN_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${LAOTHIAN_FONTS[@]}" ) ;;
|
||||
mya ) MEAN_COUNT="12"
|
||||
WORD_DAWG_FACTOR=0.15
|
||||
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
|
||||
FONTS=( "${BURMESE_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${BURMESE_FONTS[@]}" ) ;;
|
||||
tha ) MEAN_COUNT="30"
|
||||
WORD_DAWG_FACTOR=0.01
|
||||
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
|
||||
@ -987,7 +986,7 @@ set_lang_specific_parameters() {
|
||||
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
|
||||
AMBIGS_FILTER_DENOMINATOR="1000"
|
||||
LEADING=48
|
||||
FONTS=( "${THAI_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${THAI_FONTS[@]}" ) ;;
|
||||
|
||||
# CJK
|
||||
chi_sim )
|
||||
@ -998,7 +997,7 @@ set_lang_specific_parameters() {
|
||||
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
|
||||
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
|
||||
FILTER_ARGUMENTS="--charset_filter=chi_sim --segmenter_lang=chi_sim"
|
||||
FONTS=( "${CHI_SIM_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${CHI_SIM_FONTS[@]}" ) ;;
|
||||
chi_tra )
|
||||
MEAN_COUNT="15"
|
||||
WORD_DAWG_FACTOR=0.015
|
||||
@ -1006,14 +1005,14 @@ set_lang_specific_parameters() {
|
||||
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
|
||||
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
|
||||
FILTER_ARGUMENTS="--charset_filter=chi_tra --segmenter_lang=chi_tra"
|
||||
FONTS=( "${CHI_TRA_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${CHI_TRA_FONTS[@]}" ) ;;
|
||||
jpn ) MEAN_COUNT="15"
|
||||
WORD_DAWG_FACTOR=0.015
|
||||
GENERATE_WORD_BIGRAMS=0
|
||||
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
|
||||
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
|
||||
FILTER_ARGUMENTS="--charset_filter=jpn --segmenter_lang=jpn"
|
||||
FONTS=( "${JPN_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${JPN_FONTS[@]}" ) ;;
|
||||
kor ) MEAN_COUNT="20"
|
||||
WORD_DAWG_FACTOR=0.015
|
||||
NUMBER_DAWG_FACTOR=0.05
|
||||
@ -1021,38 +1020,38 @@ set_lang_specific_parameters() {
|
||||
TRAINING_DATA_ARGUMENTS+=" --desired_bigrams="
|
||||
GENERATE_WORD_BIGRAMS=0
|
||||
FILTER_ARGUMENTS="--charset_filter=kor --segmenter_lang=kor"
|
||||
FONTS=( "${KOREAN_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${KOREAN_FONTS[@]}" ) ;;
|
||||
|
||||
# Middle-Eastern scripts.
|
||||
ara ) FONTS=( "${ARABIC_FONTS[@]}" ) ;;
|
||||
div ) FONTS=( "${THAANA_FONTS[@]}" ) ;;
|
||||
ara ) test -z "$FONTS" && FONTS=( "${ARABIC_FONTS[@]}" ) ;;
|
||||
div ) test -z "$FONTS" && FONTS=( "${THAANA_FONTS[@]}" ) ;;
|
||||
fas | pus | snd | uig | urd )
|
||||
FONTS=( "${PERSIAN_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${PERSIAN_FONTS[@]}" ) ;;
|
||||
heb | yid )
|
||||
NUMBER_DAWG_FACTOR=0.05
|
||||
WORD_DAWG_FACTOR=0.08
|
||||
FONTS=( "${HEBREW_FONTS[@]}" ) ;;
|
||||
syr ) FONTS=( "${SYRIAC_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${HEBREW_FONTS[@]}" ) ;;
|
||||
syr ) test -z "$FONTS" && FONTS=( "${SYRIAC_FONTS[@]}" ) ;;
|
||||
|
||||
# Other scripts.
|
||||
amh | tir)
|
||||
FONTS=( "${AMHARIC_FONTS[@]}" ) ;;
|
||||
chr ) FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" \
|
||||
test -z "$FONTS" && FONTS=( "${AMHARIC_FONTS[@]}" ) ;;
|
||||
chr ) test -z "$FONTS" && FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" \
|
||||
"Noto Sans Cherokee" \
|
||||
) ;;
|
||||
ell | grc )
|
||||
NUMBER_DAWG_FACTOR=0.05
|
||||
WORD_DAWG_FACTOR=0.08
|
||||
FONTS=( "${GREEK_FONTS[@]}" ) ;;
|
||||
hye ) FONTS=( "${ARMENIAN_FONTS[@]}" ) ;;
|
||||
iku ) FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" ) ;;
|
||||
kat) FONTS=( "${GEORGIAN_FONTS[@]}" ) ;;
|
||||
test -z "$FONTS" && FONTS=( "${GREEK_FONTS[@]}" ) ;;
|
||||
hye ) test -z "$FONTS" && FONTS=( "${ARMENIAN_FONTS[@]}" ) ;;
|
||||
iku ) test -z "$FONTS" && FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" ) ;;
|
||||
kat) test -z "$FONTS" && FONTS=( "${GEORGIAN_FONTS[@]}" ) ;;
|
||||
kat_old)
|
||||
TEXT_CORPUS="${FLAGS_webtext_prefix}/kat.corpus.txt"
|
||||
FONTS=( "${OLD_GEORGIAN_FONTS[@]}" ) ;;
|
||||
kir ) FONTS=( "${KYRGYZ_FONTS[@]}" )
|
||||
test -z "$FONTS" && FONTS=( "${OLD_GEORGIAN_FONTS[@]}" ) ;;
|
||||
kir ) test -z "$FONTS" && FONTS=( "${KYRGYZ_FONTS[@]}" )
|
||||
TRAINING_DATA_ARGUMENTS=" --infrequent_ratio=100" ;;
|
||||
kur ) FONTS=( "${KURDISH_FONTS[@]}" ) ;;
|
||||
kur ) test -z "$FONTS" && FONTS=( "${KURDISH_FONTS[@]}" ) ;;
|
||||
|
||||
*) err "Error: ${lang} is not a valid language code"
|
||||
esac
|
||||
@ -1061,6 +1060,8 @@ set_lang_specific_parameters() {
|
||||
elif [[ ! -z ${MEAN_COUNT} ]]; then
|
||||
TRAINING_DATA_ARGUMENTS+=" --mean_count=${MEAN_COUNT}"
|
||||
fi
|
||||
# Default to Latin fonts if none have been set
|
||||
test -z "$FONTS" && test -z "$FONTS" && FONTS=( "${LATIN_FONTS[@]}" )
|
||||
}
|
||||
|
||||
#=============================================================================
|
||||
|
@ -17,7 +17,6 @@
|
||||
# USAGE:
|
||||
#
|
||||
# tesstrain.sh
|
||||
# --bin_dir PATH # Location of training program.
|
||||
# --fontlist FONTS_STR # A plus-separated list of fontnames to train on.
|
||||
# --fonts_dir FONTS_PATH # Path to font files.
|
||||
# --lang LANG_CODE # ISO 639 code.
|
||||
@ -25,6 +24,7 @@
|
||||
# --output_dir OUTPUTDIR # Location of output traineddata file.
|
||||
# --overwrite # Safe to overwrite files in output_dir.
|
||||
# --run_shape_clustering # Run shape clustering (use for Indic langs).
|
||||
# --exposures EXPOSURES # A list of exposure levels to use (e.g. "-1 0 1").
|
||||
#
|
||||
# OPTIONAL flags for input data. If unspecified we will look for them in
|
||||
# the langdata_dir directory.
|
||||
@ -49,11 +49,8 @@ source `dirname $0`/tesstrain_utils.sh
|
||||
ARGV=("$@")
|
||||
parse_flags
|
||||
|
||||
tlog "\n=== Starting training for language '${LANG_CODE}'"
|
||||
|
||||
tlog "Cleaning workspace directory ${TRAINING_DIR}..."
|
||||
mkdir -p ${TRAINING_DIR}
|
||||
rm -fr ${TRAINING_DIR}/*
|
||||
tlog "\n=== Starting training for language '${LANG_CODE}'"
|
||||
|
||||
source `dirname $0`/language-specific.sh
|
||||
set_lang_specific_parameters ${LANG_CODE}
|
||||
|
@ -16,10 +16,6 @@
|
||||
#
|
||||
# USAGE: source tesstrain_utils.sh
|
||||
|
||||
FONTS=(
|
||||
"Arial" \
|
||||
"Times New Roman," \
|
||||
)
|
||||
if [ "$(uname)" == "Darwin" ];then
|
||||
FONTS_DIR="/Library/Fonts/"
|
||||
else
|
||||
@ -29,7 +25,8 @@ OUTPUT_DIR="/tmp/tesstrain/tessdata"
|
||||
OVERWRITE=0
|
||||
RUN_SHAPE_CLUSTERING=0
|
||||
EXTRACT_FONT_PROPERTIES=1
|
||||
WORKSPACE_DIR="/tmp/tesstrain"
|
||||
WORKSPACE_DIR=`mktemp -d`
|
||||
EXPOSURES=0
|
||||
|
||||
# Logging helper functions.
|
||||
tlog() {
|
||||
@ -45,11 +42,11 @@ err_exit() {
|
||||
# if the program file is not found.
|
||||
# Usage: run_command CMD ARG1 ARG2...
|
||||
run_command() {
|
||||
local cmd=$1
|
||||
shift
|
||||
if [[ ! -x ${cmd} ]]; then
|
||||
err_exit "File ${cmd} not found"
|
||||
local cmd=`which $1`
|
||||
if [[ -z ${cmd} ]]; then
|
||||
err_exit "$1 not found"
|
||||
fi
|
||||
shift
|
||||
tlog "[$(date)] ${cmd} $@"
|
||||
${cmd} "$@" 2>&1 1>&2 | tee -a ${LOG_FILE}
|
||||
# check completion status
|
||||
@ -69,22 +66,6 @@ check_file_readable() {
|
||||
done
|
||||
}
|
||||
|
||||
# Set global path variables that are based on parsed flags.
|
||||
set_prog_paths() {
|
||||
if [[ -z ${BINDIR} ]]; then
|
||||
err_exit "Need to specify location of program files"
|
||||
fi
|
||||
CN_TRAINING_EXE=${BINDIR}/cntraining
|
||||
COMBINE_TESSDATA_EXE=${BINDIR}/combine_tessdata
|
||||
MF_TRAINING_EXE=${BINDIR}/mftraining
|
||||
SET_UNICHARSET_PROPERTIES_EXE=${BINDIR}/set_unicharset_properties
|
||||
SHAPE_TRAINING_EXE=${BINDIR}/shapeclustering
|
||||
TESSERACT_EXE=${BINDIR}/tesseract
|
||||
TEXT2IMAGE_EXE=${BINDIR}/text2image
|
||||
UNICHARSET_EXTRACTOR_EXE=${BINDIR}/unicharset_extractor
|
||||
WORDLIST2DAWG_EXE=${BINDIR}/wordlist2dawg
|
||||
}
|
||||
|
||||
# Sets the named variable to given value. Aborts if the value is missing or
|
||||
# if it looks like a flag.
|
||||
# Usage: parse_value VAR_NAME VALUE
|
||||
@ -109,9 +90,6 @@ parse_flags() {
|
||||
case ${ARGV[$i]} in
|
||||
--)
|
||||
break;;
|
||||
--bin_dir)
|
||||
parse_value "BINDIR" ${ARGV[$j]}
|
||||
i=$j ;;
|
||||
--fontlist) # Expect a plus-separated list of names
|
||||
if [[ -z ${ARGV[$j]} ]] || [[ ${ARGV[$j]:0:2} == "--" ]]; then
|
||||
err_exit "Invalid value passed to --fontlist"
|
||||
@ -121,6 +99,16 @@ parse_flags() {
|
||||
FONTS=( ${ARGV[$j]} )
|
||||
IFS=$ofs
|
||||
i=$j ;;
|
||||
--exposures)
|
||||
exp=""
|
||||
while test $j -lt ${#ARGV[@]}; do
|
||||
test -z ${ARGV[$j]} && break
|
||||
test `echo ${ARGV[$j]} | cut -c -2` = "--" && break
|
||||
exp="$exp ${ARGV[$j]}"
|
||||
j=$((j+1))
|
||||
done
|
||||
parse_value "EXPOSURES" "$exp"
|
||||
i=$((j-1)) ;;
|
||||
--fonts_dir)
|
||||
parse_value "FONTS_DIR" ${ARGV[$j]}
|
||||
i=$j ;;
|
||||
@ -156,9 +144,6 @@ parse_flags() {
|
||||
if [[ -z ${LANG_CODE} ]]; then
|
||||
err_exit "Need to specify a language --lang"
|
||||
fi
|
||||
if [[ -z ${BINDIR} ]]; then
|
||||
err_exit "Need to specify path to built binaries --bin_dir"
|
||||
fi
|
||||
if [[ -z ${LANGDATA_ROOT} ]]; then
|
||||
err_exit "Need to specify path to language files --langdata_dir"
|
||||
fi
|
||||
@ -171,8 +156,6 @@ parse_flags() {
|
||||
fi
|
||||
fi
|
||||
|
||||
set_prog_paths
|
||||
|
||||
# Location where intermediate files will be created.
|
||||
TRAINING_DIR=${WORKSPACE_DIR}/${LANG_CODE}
|
||||
# Location of log file for the whole run.
|
||||
@ -200,8 +183,8 @@ initialize_fontconfig() {
|
||||
export FONT_CONFIG_CACHE=$(mktemp -d --tmpdir font_tmp.XXXXXXXXXX)
|
||||
local sample_path=${FONT_CONFIG_CACHE}/sample_text.txt
|
||||
echo "Text" >${sample_path}
|
||||
run_command ${TEXT2IMAGE_EXE} --fonts_dir=${FONTS_DIR} \
|
||||
--font="Arial" --outputbase=${sample_path} --text=${sample_path} \
|
||||
run_command text2image --fonts_dir=${FONTS_DIR} \
|
||||
--font="${FONTS[0]}" --outputbase=${sample_path} --text=${sample_path} \
|
||||
--fontconfig_tmpdir=${FONT_CONFIG_CACHE}
|
||||
}
|
||||
|
||||
@ -228,14 +211,14 @@ generate_font_image() {
|
||||
fi
|
||||
done
|
||||
|
||||
run_command ${TEXT2IMAGE_EXE} ${common_args} --font="${font}" \
|
||||
run_command text2image ${common_args} --font="${font}" \
|
||||
--text=${TRAINING_TEXT} ${TEXT2IMAGE_EXTRA_ARGS}
|
||||
check_file_readable ${outbase}.box ${outbase}.tif
|
||||
|
||||
if (( ${EXTRACT_FONT_PROPERTIES} )) &&
|
||||
[[ -r ${TRAIN_NGRAMS_FILE} ]]; then
|
||||
tlog "Extracting font properties of ${font}"
|
||||
run_command ${TEXT2IMAGE_EXE} ${common_args} --font="${font}" \
|
||||
run_command text2image ${common_args} --font="${font}" \
|
||||
--ligatures=false --text=${TRAIN_NGRAMS_FILE} \
|
||||
--only_extract_font_properties --ptsize=32
|
||||
check_file_readable ${outbase}.fontinfo
|
||||
@ -254,35 +237,36 @@ phase_I_generate_image() {
|
||||
err_exit "Could not find training text file ${TRAINING_TEXT}"
|
||||
fi
|
||||
CHAR_SPACING="0.0"
|
||||
EXPOSURE="0"
|
||||
|
||||
if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then
|
||||
# Parse .bigram_freqs file and compose a .train_ngrams file with text
|
||||
# for tesseract to recognize during training. Take only the ngrams whose
|
||||
# combined weight accounts for 95% of all the bigrams in the language.
|
||||
NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \
|
||||
| awk '{s=s+$2}; END {print (s/100)*p}' p=99)
|
||||
cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \
|
||||
| awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \
|
||||
x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE}
|
||||
check_file_readable ${TRAIN_NGRAMS_FILE}
|
||||
fi
|
||||
|
||||
local counter=0
|
||||
for font in "${FONTS[@]}"; do
|
||||
generate_font_image "${font}" &
|
||||
let counter=counter+1
|
||||
let rem=counter%par_factor
|
||||
if [[ "${rem}" -eq 0 ]]; then
|
||||
wait
|
||||
for EXPOSURE in $EXPOSURES; do
|
||||
if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then
|
||||
# Parse .bigram_freqs file and compose a .train_ngrams file with text
|
||||
# for tesseract to recognize during training. Take only the ngrams whose
|
||||
# combined weight accounts for 95% of all the bigrams in the language.
|
||||
NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \
|
||||
| awk '{s=s+$2}; END {print (s/100)*p}' p=99)
|
||||
cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \
|
||||
| awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \
|
||||
x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE}
|
||||
check_file_readable ${TRAIN_NGRAMS_FILE}
|
||||
fi
|
||||
done
|
||||
wait
|
||||
# Check that each process was successful.
|
||||
for font in "${FONTS[@]}"; do
|
||||
local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
|
||||
local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
|
||||
check_file_readable ${outbase}.box ${outbase}.tif
|
||||
|
||||
local counter=0
|
||||
for font in "${FONTS[@]}"; do
|
||||
generate_font_image "${font}" &
|
||||
let counter=counter+1
|
||||
let rem=counter%par_factor
|
||||
if [[ "${rem}" -eq 0 ]]; then
|
||||
wait
|
||||
fi
|
||||
done
|
||||
wait
|
||||
# Check that each process was successful.
|
||||
for font in "${FONTS[@]}"; do
|
||||
local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
|
||||
local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
|
||||
check_file_readable ${outbase}.box ${outbase}.tif
|
||||
done
|
||||
done
|
||||
}
|
||||
|
||||
@ -291,7 +275,7 @@ phase_UP_generate_unicharset() {
|
||||
tlog "\n=== Phase UP: Generating unicharset and unichar properties files ==="
|
||||
|
||||
local box_files=$(ls ${TRAINING_DIR}/*.box)
|
||||
run_command ${UNICHARSET_EXTRACTOR_EXE} -D "${TRAINING_DIR}/" ${box_files}
|
||||
run_command unicharset_extractor -D "${TRAINING_DIR}/" ${box_files}
|
||||
local outfile=${TRAINING_DIR}/unicharset
|
||||
UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset"
|
||||
check_file_readable ${outfile}
|
||||
@ -299,7 +283,7 @@ phase_UP_generate_unicharset() {
|
||||
|
||||
XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights"
|
||||
check_file_readable ${UNICHARSET_FILE}
|
||||
run_command ${SET_UNICHARSET_PROPERTIES_EXE} \
|
||||
run_command set_unicharset_properties \
|
||||
-U ${UNICHARSET_FILE} -O ${UNICHARSET_FILE} -X ${XHEIGHTS_FILE} \
|
||||
--script_dir=${LANGDATA_ROOT}
|
||||
check_file_readable ${XHEIGHTS_FILE}
|
||||
@ -327,7 +311,7 @@ phase_D_generate_dawg() {
|
||||
if [[ -s ${WORDLIST_FILE} ]]; then
|
||||
tlog "Generating word Dawg"
|
||||
check_file_readable ${UNICHARSET_FILE}
|
||||
run_command ${WORDLIST2DAWG_EXE} -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \
|
||||
run_command wordlist2dawg -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \
|
||||
${UNICHARSET_FILE}
|
||||
check_file_readable ${WORD_DAWG}
|
||||
|
||||
@ -339,13 +323,13 @@ phase_D_generate_dawg() {
|
||||
if [[ -s ${freq_wordlist_file} ]]; then
|
||||
check_file_readable ${UNICHARSET_FILE}
|
||||
tlog "Generating frequent-word Dawg"
|
||||
run_command ${WORDLIST2DAWG_EXE} -r 1 ${freq_wordlist_file} \
|
||||
run_command wordlist2dawg -r 1 ${freq_wordlist_file} \
|
||||
${FREQ_DAWG} ${UNICHARSET_FILE}
|
||||
check_file_readable ${FREQ_DAWG}
|
||||
fi
|
||||
|
||||
# Punctuation DAWG
|
||||
# -r arguments to WORDLIST2DAWG_EXE denote RTL reverse policy
|
||||
# -r arguments to wordlist2dawg denote RTL reverse policy
|
||||
# (see Trie::RTLReversePolicy enum in third_party/tesseract/dict/trie.h).
|
||||
# We specify 0/RRP_DO_NO_REVERSE when generating number DAWG,
|
||||
# 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS,
|
||||
@ -360,20 +344,20 @@ phase_D_generate_dawg() {
|
||||
PUNC_FILE="${LANGDATA_ROOT}/common.punc"
|
||||
fi
|
||||
check_file_readable ${PUNC_FILE}
|
||||
run_command ${WORDLIST2DAWG_EXE} -r ${punc_reverse_policy} \
|
||||
run_command wordlist2dawg -r ${punc_reverse_policy} \
|
||||
${PUNC_FILE} ${PUNC_DAWG} ${UNICHARSET_FILE}
|
||||
check_file_readable ${PUNC_DAWG}
|
||||
|
||||
# Numbers DAWG
|
||||
if [[ -s ${NUMBERS_FILE} ]]; then
|
||||
run_command ${WORDLIST2DAWG_EXE} -r 0 \
|
||||
run_command wordlist2dawg -r 0 \
|
||||
${NUMBERS_FILE} ${NUMBER_DAWG} ${UNICHARSET_FILE}
|
||||
check_file_readable ${NUMBER_DAWG}
|
||||
fi
|
||||
|
||||
# Bigram dawg
|
||||
if [[ -s ${WORD_BIGRAMS_FILE} ]]; then
|
||||
run_command ${WORDLIST2DAWG_EXE} -r 1 \
|
||||
run_command wordlist2dawg -r 1 \
|
||||
${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE}
|
||||
check_file_readable ${BIGRAM_DAWG}
|
||||
fi
|
||||
@ -387,10 +371,9 @@ phase_E_extract_features() {
|
||||
par_factor=1
|
||||
fi
|
||||
tlog "\n=== Phase E: Extracting features ==="
|
||||
TRAIN_EXPOSURES='0'
|
||||
|
||||
local img_files=""
|
||||
for exposure in ${TRAIN_EXPOSURES}; do
|
||||
for exposure in ${EXPOSURES}; do
|
||||
img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif)
|
||||
done
|
||||
|
||||
@ -405,7 +388,7 @@ phase_E_extract_features() {
|
||||
tlog "Using TESSDATA_PREFIX=${TESSDATA_PREFIX}"
|
||||
local counter=0
|
||||
for img_file in ${img_files}; do
|
||||
run_command ${TESSERACT_EXE} ${img_file} ${img_file%.*} \
|
||||
run_command tesseract ${img_file} ${img_file%.*} \
|
||||
${box_config} ${config} &
|
||||
let counter=counter+1
|
||||
let rem=counter%par_factor
|
||||
@ -427,7 +410,7 @@ phase_C_cluster_prototypes() {
|
||||
tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ==="
|
||||
local out_normproto=$1
|
||||
|
||||
run_command ${CN_TRAINING_EXE} -D "${TRAINING_DIR}/" \
|
||||
run_command cntraining -D "${TRAINING_DIR}/" \
|
||||
$(ls ${TRAINING_DIR}/*.tr)
|
||||
|
||||
check_file_readable ${TRAINING_DIR}/normproto
|
||||
@ -447,7 +430,7 @@ phase_S_cluster_shapes() {
|
||||
font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
|
||||
fi
|
||||
|
||||
run_command ${SHAPE_TRAINING_EXE} \
|
||||
run_command shapeclustering \
|
||||
-D "${TRAINING_DIR}/" \
|
||||
-U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
|
||||
-O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
|
||||
@ -468,7 +451,7 @@ phase_M_cluster_microfeatures() {
|
||||
font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
|
||||
fi
|
||||
|
||||
run_command ${MF_TRAINING_EXE} \
|
||||
run_command mftraining \
|
||||
-D "${TRAINING_DIR}/" \
|
||||
-U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
|
||||
-O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
|
||||
@ -528,7 +511,7 @@ make__traineddata() {
|
||||
fi
|
||||
|
||||
# Compose the traineddata file.
|
||||
run_command ${COMBINE_TESSDATA_EXE} ${TRAINING_DIR}/${LANG_CODE}.
|
||||
run_command combine_tessdata ${TRAINING_DIR}/${LANG_CODE}.
|
||||
|
||||
# Copy it to the output dir, overwriting only if allowed by the cmdline flag.
|
||||
if [[ ! -d ${OUTPUT_DIR} ]]; then
|
||||
|
@ -127,7 +127,7 @@ SVSemaphore::SVSemaphore() {
|
||||
semaphore_ = CreateSemaphore(0, 0, 10, 0);
|
||||
#elif defined(__APPLE__)
|
||||
char name[50];
|
||||
snprintf(name, sizeof(name), "%d", random());
|
||||
snprintf(name, sizeof(name), "%ld", random());
|
||||
sem_unlink(name);
|
||||
semaphore_ = sem_open(name, O_CREAT , S_IWUSR, 0);
|
||||
if (semaphore_ == SEM_FAILED) {
|
||||
@ -296,14 +296,11 @@ static std::string ScrollViewCommand(std::string scrollview_path) {
|
||||
// this unnecessary.
|
||||
// Also the path has to be separated by ; on windows and : otherwise.
|
||||
#ifdef _WIN32
|
||||
const char* cmd_template = "-Djava.library.path=%s -cp %s/ScrollView.jar;"
|
||||
"%s/piccolo2d-core-3.0.jar:%s/piccolo2d-extras-3.0.jar"
|
||||
" com.google.scrollview.ScrollView";
|
||||
const char* cmd_template = "-Djava.library.path=%s -jar %s/ScrollView.jar";
|
||||
|
||||
#else
|
||||
const char* cmd_template = "-c \"trap 'kill %%1' 0 1 2 ; java "
|
||||
"-Xms1024m -Xmx2048m -Djava.library.path=%s -cp %s/ScrollView.jar:"
|
||||
"%s/piccolo2d-core-3.0.jar:%s/piccolo2d-extras-3.0.jar"
|
||||
" com.google.scrollview.ScrollView"
|
||||
"-Xms1024m -Xmx2048m -jar %s/ScrollView.jar"
|
||||
" & wait\"";
|
||||
#endif
|
||||
int cmdlen = strlen(cmd_template) + 4*strlen(scrollview_path.c_str()) + 1;
|
||||
@ -374,7 +371,7 @@ static int GetAddrInfo(const char* hostname, int port,
|
||||
struct addrinfo** address) {
|
||||
#if defined(__linux__)
|
||||
char port_str[40];
|
||||
snprintf(port_str, 40, "%d", port);
|
||||
snprintf(port_str, 40, "%ld", port);
|
||||
return getaddrinfo(hostname, port_str, NULL, address);
|
||||
#else
|
||||
return GetAddrInfoNonLinux(hostname, port, address);
|
||||
|
@ -177,11 +177,11 @@ struct ViterbiStateEntry : public ELIST_LINK {
|
||||
/// the smallest rating or lower/upper case letters).
|
||||
LanguageModelFlagsType top_choice_flags;
|
||||
|
||||
/// Extra information maintained by Dawg laguage model component
|
||||
/// Extra information maintained by Dawg language model component
|
||||
/// (owned by ViterbiStateEntry).
|
||||
LanguageModelDawgInfo *dawg_info;
|
||||
|
||||
/// Extra information maintained by Ngram laguage model component
|
||||
/// Extra information maintained by Ngram language model component
|
||||
/// (owned by ViterbiStateEntry).
|
||||
LanguageModelNgramInfo *ngram_info;
|
||||
|
||||
|
@ -273,7 +273,7 @@ void Wordrec::merge_and_put_fragment_lists(inT16 row, inT16 column,
|
||||
*
|
||||
* Recursively go through the ratings matrix to find lists of fragments
|
||||
* to be merged in the function merge_and_put_fragment_lists.
|
||||
* current_frag is the postion of the piece we are looking for.
|
||||
* current_frag is the position of the piece we are looking for.
|
||||
* current_row is the row in the rating matrix we are currently at.
|
||||
* start is the row we started initially, so that we can know where
|
||||
* to append the results to the matrix. num_frag_parts is the total
|
||||
|
@ -375,7 +375,7 @@ class Wordrec : public Classify {
|
||||
inT16 num_blobs);
|
||||
// Recursively go through the ratings matrix to find lists of fragments
|
||||
// to be merged in the function merge_and_put_fragment_lists.
|
||||
// current_frag is the postion of the piece we are looking for.
|
||||
// current_frag is the position of the piece we are looking for.
|
||||
// current_row is the row in the rating matrix we are currently at.
|
||||
// start is the row we started initially, so that we can know where
|
||||
// to append the results to the matrix. num_frag_parts is the total
|
||||
|
Loading…
Reference in New Issue
Block a user