Merge branch 'master' of github.com:egorpugin/tesseract

This commit is contained in:
Egor Pugin 2015-10-05 22:33:24 +03:00
commit a614edbe94
83 changed files with 468 additions and 362 deletions

2
.gitignore vendored
View File

@ -59,6 +59,8 @@ training/wordlist2dawg
*.o
*.Plo
*.a
*.class
*.jar
# tessdata
*.cube.*

45
.travis.yml Normal file
View File

@ -0,0 +1,45 @@
# Travis CI configuration: builds Leptonica from source, then configures and
# builds this project with CMake on Linux and OS X.
# NOTE(review): nesting indentation is not visible in this view; confirm the
# YAML structure against the real file.
language: cpp
notifications:
email: false
sudo: required
os:
- linux
- osx
# Restrict CI builds to the master branch.
branches:
only:
- master
# Extra APT source and packages providing gcc/g++ 4.8.
addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- gcc-4.8
- g++-4.8
before_install:
# Derive LINUX / OSX shell flags from TRAVIS_OS_NAME for the conditionals below.
- if [[ $TRAVIS_OS_NAME == linux ]]; then LINUX=true; fi
- if [[ $TRAVIS_OS_NAME == osx ]]; then OSX=true; fi
- if [[ $OSX ]]; then brew update; fi
install:
# OS X only: install icu4c and pango via Homebrew and point ICU_ROOT at icu4c.
- if [[ $OSX ]]; then brew install icu4c pango; brew link --force gettext; fi
- if [[ $OSX ]]; then export ICU_ROOT=/usr/local/opt/icu4c ; fi
# Install CMake 3.3.1 into /usr from the Linux x86_64 installer script.
# NOTE(review): these two steps are not guarded by an OS check although osx is
# also listed under os: - confirm this is intended.
- wget http://www.cmake.org/files/v3.3/cmake-3.3.1-Linux-x86_64.sh
- sudo sh cmake-3.3.1-Linux-x86_64.sh --skip-license --prefix=/usr
# Download the Leptonica master archive and build it in leptonica-master/build.
- wget -O leptonica.zip https://github.com/egorpugin/leptonica/archive/master.zip
- unzip leptonica.zip -d .
- cmake -Hleptonica-master -Bleptonica-master/build
- make -C leptonica-master/build
# On Linux, when CXX is plain g++, switch to the 4.8 toolchain.
# Note this export happens after Leptonica has already been built above.
- if [[ $LINUX && "$CXX" = "g++" ]]; then export CXX="g++-4.8" CC="gcc-4.8"; fi
script:
# Out-of-source build of this project against the Leptonica build tree.
# NOTE(review): Leptonica_DIR is a relative path while the working directory is
# build/ - presumably resolved by find_package against the source dir; verify.
- mkdir build
- cd build
- cmake .. -DLeptonica_DIR=leptonica-master/build
- make

View File

@ -8,7 +8,7 @@
#
###############################################################################
cmake_minimum_required(VERSION 2.8.12)
cmake_minimum_required(VERSION 2.8.11)
# In-source builds are disabled.
if (${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR})
@ -47,15 +47,8 @@ set(VERSION_PLAIN ${VERSION_MAJOR}.${VERSION_MINOR})
find_package(Leptonica 1.71 REQUIRED)
find_package(ICU COMPONENTS uc i18n)
find_package(PkgConfig QUIET)
pkg_check_modules(Pango pango)
pkg_check_modules(Cairo cairo)
pkg_check_modules(PangoFt2 pangoft2)
pkg_check_modules(PangoCairo pangocairo)
pkg_check_modules(FontConfig fontconfig)
include_directories(${Pango_INCLUDE_DIRS})
include_directories(${Cairo_INCLUDE_DIRS})
find_package(OpenCL QUIET)
find_package(PkgConfig)
###############################################################################
#
@ -80,6 +73,10 @@ if (WIN32)
set(LIB_Ws2_32 Ws2_32)
endif()
if (CYGWIN)
add_definitions(-D__CYGWIN__)
endif()
if (UNIX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")
@ -190,7 +187,9 @@ set(tesseract_src ${tesseract_src}
)
add_library (tesseract ${LIBRARY_TYPE} ${tesseract_src} ${tesseract_hdr})
if (NOT STATIC)
target_compile_definitions (tesseract PUBLIC -DTESS_EXPORTS)
endif()
target_link_libraries (tesseract ${Leptonica_LIBRARIES} ${LIB_Ws2_32} ${LIB_pthread})
set_target_properties (tesseract PROPERTIES OUTPUT_NAME tesseract${VERSION_MAJOR}${VERSION_MINOR})
set_target_properties (tesseract PROPERTIES DEBUG_OUTPUT_NAME tesseract${VERSION_MAJOR}${VERSION_MINOR}d)

View File

@ -1,5 +1,5 @@
This package contains the Tesseract Open Source OCR Engine.
Orignally developed at Hewlett Packard Laboratories Bristol and
Originally developed at Hewlett Packard Laboratories Bristol and
at Hewlett Packard Co, Greeley Colorado, all the code
in this distribution is now licensed under the Apache License:

View File

@ -1,3 +1,6 @@
[![Build Status](https://travis-ci.org/egorpugin/tesseract.svg?branch=master)](https://travis-ci.org/egorpugin/tesseract)
[![Build status](https://ci.appveyor.com/api/projects/status/34s8gu4md3i9s93k?svg=true)](https://ci.appveyor.com/project/egorpugin/tesseract)
Note that this is a possibly out-of-date version of the wiki ReadMe,
which is located at:
@ -97,7 +100,7 @@ find its data directory. You must either:
./autogen.sh
./configure
make
make install
sudo make install
sudo ldconfig
to move the data files to the standard place, or:

View File

@ -1660,7 +1660,7 @@ char* TessBaseAPI::GetUNLVText() {
word->word->space() > 0 &&
!word->word->flag(W_FUZZY_NON) &&
!word->word->flag(W_FUZZY_SP)) {
/* Write a space to separate from preceeding good text */
/* Write a space to separate from preceding good text */
*ptr++ = ' ';
last_char_was_tilde = false;
}

View File

@ -178,7 +178,7 @@ void TessPDFRenderer::AppendPDFObject(const char *data) {
AppendString((const char *)data);
}
// Helper function to prevent us from accidentaly writing
// Helper function to prevent us from accidentally writing
// scientific notation to an HOCR or PDF file. Besides, three
// decimal places are all you really need.
double prec(double x) {

View File

@ -227,7 +227,7 @@ int main(int argc, char **argv) {
}
// We have 2 possible sources of pagesegmode: a config file and
// the command line. For backwards compatability reasons, the
// the command line. For backwards compatibility reasons, the
// default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
// default for this program is tesseract::PSM_AUTO. We will let
// the config file take priority, so the command-line default

24
appveyor.yml Normal file
View File

@ -0,0 +1,24 @@
# AppVeyor configuration: Debug builds for the Win32 and Win64 platforms on the
# "Visual Studio 2015" image. Leptonica is built first, then this project.
# NOTE(review): nesting indentation is not visible in this view; confirm the
# YAML structure against the real file.
os: Visual Studio 2015
platform:
- Win32
- Win64
configuration:
- Debug
before_build:
# Map the AppVeyor platform name to a CMake generator and an MSBuild platform.
- if %platform%==Win32 set generator=Visual Studio 14
- if %platform%==Win64 set generator=Visual Studio 14 Win64
- if %platform%==Win32 set vcplatform=Win32
- if %platform%==Win64 set vcplatform=x64
# Download and unpack the Leptonica master archive, then generate and build its solution.
- ps: Start-FileDownload 'https://github.com/egorpugin/leptonica/archive/master.zip' -FileName leptonica.zip
- 7z x leptonica.zip
- cmake -Hleptonica-master -Bleptonica-master/build -G "%generator%"
- msbuild leptonica-master/build/leptonica.sln /p:Platform=%vcplatform% /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
build_script:
# Configure this project in build/ against the Leptonica build tree with
# -DSTATIC=1, then build the generated solution with the AppVeyor MSBuild logger.
- mkdir build
- cd build
- cmake .. -G "%generator%" -DLeptonica_DIR=leptonica-master/build -DSTATIC=1
- msbuild tesseract.sln /p:Platform=%vcplatform% /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"

View File

@ -1556,7 +1556,7 @@ void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word,
word->fix_quotes();
if (tessedit_fix_hyphens)
word->fix_hyphens();
/* Dont trust fix_quotes! - though I think I've fixed the bug */
/* Don't trust fix_quotes! - though I think I've fixed the bug */
if (word->best_choice->length() != word->box_word->length()) {
tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
" #Blobs=%d\n",
@ -1694,7 +1694,7 @@ ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(
goto not_a_word;
/*
Allow a single hyphen in a lower case word
- dont trust upper case - I've seen several cases of "H" -> "I-I"
- don't trust upper case - I've seen several cases of "H" -> "I-I"
*/
if (lengths[i] == 1 && s[offset] == '-') {
hyphen_pos = i;

View File

@ -129,7 +129,7 @@ inT16 Tesseract::count_outline_errs(char c, inT16 outline_count) {
int expected_outline_count;
if (STRING (outlines_odd).contains (c))
return 0; //Dont use this char
return 0; //Don't use this char
else if (STRING (outlines_2).contains (c))
expected_outline_count = 2;
else
@ -157,7 +157,7 @@ void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it,
* - Word segmentation is the same as the original image
* - All characters have the expected number of outlines
* NOTE - the rejection counts are recalculated after unrejection
* - CANT do it in a single pass without a bit of fiddling
* - CAN'T do it in a single pass without a bit of fiddling
* - keep it simple but inefficient
*************************************************************************/
void Tesseract::unrej_good_quality_words( //unreject potential
@ -403,7 +403,7 @@ void Tesseract::doc_and_block_rejection( //reject big chunks
/*************************************************************************
* reject_whole_page()
* Dont believe any of it - set the reject map to 00..00 in all words
* Don't believe any of it - set the reject map to 00..00 in all words
*
*************************************************************************/

View File

@ -55,7 +55,7 @@ void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor,
WERD_RES *word_res;
WERD_RES_LIST fuzzy_space_words;
inT16 new_length;
BOOL8 prevent_null_wd_fixsp; // DONT process blobless wds
BOOL8 prevent_null_wd_fixsp; // DON'T process blobless wds
inT32 word_index; // current word
block_res_it.set_to_list(&page_res->block_res_list);
@ -222,7 +222,7 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
* fuzzy spaces. The problem with the basic measure is that "561 63" would score
* the same as "56163", though given our knowledge that the space is fuzzy, and
* that there is a "1" next to the fuzzy space, we need to ensure that "56163"
* is prefered.
* is preferred.
*
* The solution is to NOT COUNT the score of any word which has a digit at one
* end and a "1Il" as the character the other side of the space.
@ -272,8 +272,8 @@ inT16 Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
} else {
/*
Can we add the prev word score and potentially count this word?
Yes IF it didnt end in a 1 when the first char of this word is a digit
AND it didnt end in a digit when the first char of this word is a 1
Yes IF it didn't end in a 1 when the first char of this word is a digit
AND it didn't end in a digit when the first char of this word is a 1
*/
word_len = word->reject_map.length();
current_word_ok_so_far = FALSE;
@ -507,7 +507,7 @@ BOOL8 Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
/*
Use all the standard pass 2 conditions for mode 5 in set_done() in
reject.c BUT DONT REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DONT
reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
CARE WHETHER WE HAVE of/at on/an etc.
*/
if (fixsp_done_mode > 0 &&

View File

@ -297,7 +297,7 @@ UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
/*************************************************************************
* SUSPECT LEVELS
*
* 0 - dont reject ANYTHING
* 0 - don't reject ANYTHING
* 1,2 - partial rejection
* 3 - BEST
*
@ -337,7 +337,7 @@ void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
rating_per_ch = word.rating() / word_res->reject_map.length();
if (rating_per_ch >= suspect_rating_per_ch)
return; //Dont touch bad ratings
return; //Don't touch bad ratings
if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
/* Unreject any Tess Acceptable word - but NOT tess reject chs*/

View File

@ -329,13 +329,13 @@ void ParamsEditor::WriteParams(char *filename,
fclose(fp);
sprintf (msg_str, "Overwrite file " "%s" "? (Y/N)", filename);
int a = sv_window_->ShowYesNoDialog(msg_str);
if (a == 'n') { return; } // dont write
if (a == 'n') { return; } // don't write
}
fp = fopen (filename, "wb"); // can we write to it?
if (fp == NULL) {
sv_window_->AddMessage("Cant write to file " "%s" "", filename);
sv_window_->AddMessage("Can't write to file " "%s" "", filename);
return;
}

View File

@ -521,7 +521,7 @@ BOOL8 Tesseract::word_contains_non_1_digit(const char *word,
/*************************************************************************
* dont_allow_1Il()
* Dont unreject LONE accepted 1Il conflict set chars
* Don't unreject LONE accepted 1Il conflict set chars
*************************************************************************/
void Tesseract::dont_allow_1Il(WERD_RES *word) {
int i = 0;
@ -633,7 +633,7 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) {
next_left = 9999;
else
next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
// Dont touch small or touching blobs - it is too dangerous.
// Don't touch small or touching blobs - it is too dangerous.
if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
(out_box.left() > prev_right) && (out_box.right() < next_left)) {
aspect_ratio = out_box.width() / (float) out_box.height();

View File

@ -136,7 +136,7 @@ Tesseract::Tesseract()
BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true,
"Try to improve fuzzy spaces", this->params()),
BOOL_MEMBER(tessedit_unrej_any_wd, false,
"Dont bother with word plausibility", this->params()),
"Don't bother with word plausibility", this->params()),
BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?",
this->params()),
BOOL_MEMBER(tessedit_redo_xheight, true, "Check/Correct x-height",
@ -310,19 +310,19 @@ Tesseract::Tesseract()
this->params()),
INT_MEMBER(crunch_pot_indicators, 1,
"How many potential indicators needed", this->params()),
BOOL_MEMBER(crunch_leave_ok_strings, true, "Dont touch sensible strings",
BOOL_MEMBER(crunch_leave_ok_strings, true, "Don't touch sensible strings",
this->params()),
BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring",
this->params()),
BOOL_MEMBER(crunch_leave_accept_strings, false,
"Dont pot crunch sensible strings", this->params()),
"Don't pot crunch sensible strings", this->params()),
BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures",
this->params()),
INT_MEMBER(crunch_leave_lc_strings, 4,
"Dont crunch words with long lower case strings",
"Don't crunch words with long lower case strings",
this->params()),
INT_MEMBER(crunch_leave_uc_strings, 4,
"Dont crunch words with long lower case strings",
"Don't crunch words with long lower case strings",
this->params()),
INT_MEMBER(crunch_long_repetitions, 3,
"Crunch words with long repetitions", this->params()),
@ -393,21 +393,21 @@ Tesseract::Tesseract()
INT_MEMBER(suspect_space_level, 100,
"Min suspect level for rejecting spaces", this->params()),
INT_MEMBER(suspect_short_words, 2,
"Dont Suspect dict wds longer than this", this->params()),
"Don't suspect dict wds longer than this", this->params()),
BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected",
this->params()),
double_MEMBER(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit",
double_MEMBER(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit",
this->params()),
double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit",
this->params()),
BOOL_MEMBER(tessedit_minimal_rejection, false,
"Only reject tess failures", this->params()),
BOOL_MEMBER(tessedit_zero_rejection, false, "Dont reject ANYTHING",
BOOL_MEMBER(tessedit_zero_rejection, false, "Don't reject ANYTHING",
this->params()),
BOOL_MEMBER(tessedit_word_for_word, false,
"Make output have exactly one word per WERD", this->params()),
BOOL_MEMBER(tessedit_zero_kelvin_rejection, false,
"Dont reject ANYTHING AT ALL", this->params()),
"Don't reject ANYTHING AT ALL", this->params()),
BOOL_MEMBER(tessedit_consistent_reps, true,
"Force all rep chars the same", this->params()),
INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm",
@ -424,7 +424,7 @@ Tesseract::Tesseract()
"Use DOC dawg in 11l conf. detector", this->params()),
BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test",
this->params()),
BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Dont double check",
BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Don't double check",
this->params()),
BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control",
this->params()),

View File

@ -733,7 +733,7 @@ class Tesseract : public Wordrec {
GenericVector<UNICHAR_ID>* class_ids);
// Resegments the word to achieve the target_text from the classifier.
// Returns false if the re-segmentation fails.
// Uses brute-force combination of upto kMaxGroupSize adjacent blobs, and
// Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
// applies a full search on the classifier results to find the best classified
// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
// substitutions ARE used.
@ -833,7 +833,7 @@ class Tesseract : public Wordrec {
BOOL_VAR_H(tessedit_fix_fuzzy_spaces, true,
"Try to improve fuzzy spaces");
BOOL_VAR_H(tessedit_unrej_any_wd, false,
"Dont bother with word plausibility");
"Don't bother with word plausibility");
BOOL_VAR_H(tessedit_fix_hyphens, true, "Crunch double hyphens?");
BOOL_VAR_H(tessedit_redo_xheight, true, "Check/Correct x-height");
BOOL_VAR_H(tessedit_enable_doc_dict, true,
@ -954,15 +954,15 @@ class Tesseract : public Wordrec {
double_VAR_H(crunch_small_outlines_size, 0.6, "Small if lt xht x this");
INT_VAR_H(crunch_rating_max, 10, "For adj length in rating per ch");
INT_VAR_H(crunch_pot_indicators, 1, "How many potential indicators needed");
BOOL_VAR_H(crunch_leave_ok_strings, true, "Dont touch sensible strings");
BOOL_VAR_H(crunch_leave_ok_strings, true, "Don't touch sensible strings");
BOOL_VAR_H(crunch_accept_ok, true, "Use acceptability in okstring");
BOOL_VAR_H(crunch_leave_accept_strings, false,
"Dont pot crunch sensible strings");
"Don't pot crunch sensible strings");
BOOL_VAR_H(crunch_include_numerals, false, "Fiddle alpha figures");
INT_VAR_H(crunch_leave_lc_strings, 4,
"Dont crunch words with long lower case strings");
"Don't crunch words with long lower case strings");
INT_VAR_H(crunch_leave_uc_strings, 4,
"Dont crunch words with long lower case strings");
"Don't crunch words with long lower case strings");
INT_VAR_H(crunch_long_repetitions, 3, "Crunch words with long repetitions");
INT_VAR_H(crunch_debug, 0, "As it says");
INT_VAR_H(fixsp_non_noise_limit, 1,
@ -1010,16 +1010,16 @@ class Tesseract : public Wordrec {
INT_VAR_H(suspect_space_level, 100,
"Min suspect level for rejecting spaces");
INT_VAR_H(suspect_short_words, 2,
"Dont Suspect dict wds longer than this");
"Don't Suspect dict wds longer than this");
BOOL_VAR_H(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected");
double_VAR_H(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit");
double_VAR_H(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit");
double_VAR_H(suspect_accept_rating, -999.9, "Accept good rating limit");
BOOL_VAR_H(tessedit_minimal_rejection, false, "Only reject tess failures");
BOOL_VAR_H(tessedit_zero_rejection, false, "Dont reject ANYTHING");
BOOL_VAR_H(tessedit_zero_rejection, false, "Don't reject ANYTHING");
BOOL_VAR_H(tessedit_word_for_word, false,
"Make output have exactly one word per WERD");
BOOL_VAR_H(tessedit_zero_kelvin_rejection, false,
"Dont reject ANYTHING AT ALL");
"Don't reject ANYTHING AT ALL");
BOOL_VAR_H(tessedit_consistent_reps, true, "Force all rep chars the same");
INT_VAR_H(tessedit_reject_mode, 0, "Rejection algorithm");
BOOL_VAR_H(tessedit_rejection_debug, false, "Adaption debug");
@ -1030,7 +1030,7 @@ class Tesseract : public Wordrec {
"Aspect ratio dot/hyphen test");
BOOL_VAR_H(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector");
BOOL_VAR_H(rej_1Il_use_dict_word, false, "Use dictword test");
BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Dont double check");
BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Don't double check");
BOOL_VAR_H(rej_use_tess_accepted, true, "Individual rejection control");
BOOL_VAR_H(rej_use_tess_blanks, true, "Individual rejection control");
BOOL_VAR_H(rej_use_good_perm, true, "Individual rejection control");

View File

@ -33,7 +33,7 @@
ELISTIZE (BLOBNBOX) ELIST2IZE (TO_ROW) ELISTIZE (TO_BLOCK)
// Upto 30 degrees is allowed for rotations of diacritic blobs.
// Up to 30 degrees is allowed for rotations of diacritic blobs.
const double kCosSmallAngle = 0.866;
// Min aspect ratio for a joined word to indicate an obvious flow direction.
const double kDefiniteAspectRatio = 2.0;

View File

@ -35,7 +35,7 @@ FILE* OpenBoxFile(const STRING& fname) {
FILE* box_file = NULL;
if (!(box_file = fopen(filename.string(), "rb"))) {
CANTOPENFILE.error("read_next_box", TESSEXIT,
"Cant open box file %s",
"Can't open box file %s",
filename.string());
}
return box_file;

View File

@ -382,7 +382,7 @@ void DENORM::LocalDenormTransform(const FCOORD& pt, FCOORD* original) const {
}
// Transforms the given coords all the way back to source image space using
// the full transformation sequence defined by this and its predecesors
// the full transformation sequence defined by this and its predecessors
// recursively, shallowest first, and finally any block re_rotation.
// If last_denorm is not NULL, then the last transformation used will
// be last_denorm, and the block re_rotation will never be executed.

View File

@ -218,7 +218,7 @@ class DENORM {
void LocalDenormTransform(const TPOINT& pt, TPOINT* original) const;
void LocalDenormTransform(const FCOORD& pt, FCOORD* original) const;
// Transforms the given coords all the way back to source image space using
// the full transformation sequence defined by this and its predecesors
// the full transformation sequence defined by this and its predecessors
// recursively, shallowest first, and finally any block re_rotation.
// If last_denorm is not NULL, then the last transformation used will
// be last_denorm, and the block re_rotation will never be executed.

View File

@ -108,7 +108,7 @@ class PDBLK
PDBLK & operator= (const PDBLK & source);
protected:
POLY_BLOCK *hand_poly; //< wierd as well
POLY_BLOCK *hand_poly; //< weird as well
ICOORDELT_LIST leftside; //< left side vertices
ICOORDELT_LIST rightside; //< right side vertices
TBOX box; //< bounding box

View File

@ -16,7 +16,7 @@
** limitations under the License.
*
This module may look unneccessarily verbose, but here's the philosophy...
This module may look unnecessarily verbose, but here's the philosophy...
ALL processing of the reject map is done in this module. There are lots of
separate calls to set reject/accept flags. These have DELIBERATELY been kept
@ -51,7 +51,7 @@ OF THIS IMPLIED TEMPORAL ORDERING OF THE FLAGS!!!!
enum REJ_FLAGS
{
/* Reject modes which are NEVER overridden */
R_TESS_FAILURE, // PERM Tess didnt classify
R_TESS_FAILURE, // PERM Tess didn't classify
R_SMALL_XHT, // PERM Xht too small
R_EDGE_CHAR, // PERM Too close to edge of image
R_1IL_CONFLICT, // PERM 1Il confusion
@ -62,7 +62,7 @@ enum REJ_FLAGS
/* Initial reject modes (pre NN_ACCEPT) */
R_POOR_MATCH, // TEMP Ray's original heuristic (Not used)
R_NOT_TESS_ACCEPTED, // TEMP Tess didnt accept WERD
R_NOT_TESS_ACCEPTED, // TEMP Tess didn't accept WERD
R_CONTAINS_BLANKS, // TEMP Tess failed on other chs in WERD
R_BAD_PERMUTER, // POTENTIAL Bad permuter for WERD
@ -82,7 +82,7 @@ enum REJ_FLAGS
R_ROW_REJ, // TEMP Row rejection
R_UNLV_REJ, // TEMP ~ turned to - or ^ turned to space
/* Accept modes which occur inbetween the above rejection groups */
/* Accept modes which occur between the above rejection groups */
R_NN_ACCEPT, //NN acceptance
R_HYPHEN_ACCEPT, //Hyphen acceptance
R_MM_ACCEPT, //Matrix match acceptance

View File

@ -204,7 +204,7 @@ double STATS::ile(double frac) const {
/**********************************************************************
* STATS::min_bucket
*
* Find REAL minimum bucket - ile(0.0) isnt necessarily correct
* Find REAL minimum bucket - ile(0.0) isn't necessarily correct
**********************************************************************/
inT32 STATS::min_bucket() const { // Find min
if (buckets_ == NULL || total_count_ == 0) {
@ -219,7 +219,7 @@ inT32 STATS::min_bucket() const { // Find min
/**********************************************************************
* STATS::max_bucket
*
* Find REAL maximum bucket - ile(1.0) isnt necessarily correct
* Find REAL maximum bucket - ile(1.0) isn't necessarily correct
**********************************************************************/
inT32 STATS::max_bucket() const { // Find max
@ -249,7 +249,7 @@ double STATS::median() const { //get median
if ((total_count_ > 1) && (pile_count(median_pile) == 0)) {
inT32 min_pile;
inT32 max_pile;
/* Find preceeding non zero pile */
/* Find preceding non zero pile */
for (min_pile = median_pile; pile_count(min_pile) == 0; min_pile--);
/* Find following non zero pile */
for (max_pile = median_pile; pile_count(max_pile) == 0; max_pile++);

View File

@ -23,7 +23,7 @@
*
********************************************************************************
* Revision 5.1 89/07/27 11:47:50 11:47:50 ray ()
* Added ratings acces methods.
* Added ratings access methods.
* This version ready for independent development.
*/
/*----------------------------------------------------------------------

View File

@ -190,7 +190,7 @@ const void *, const void *)) {
// Assuming list has been sorted already, insert new_data to
// keep the list sorted according to the same comparison function.
// Comparision function is the same as used by sort, i.e. uses double
// Comparison function is the same as used by sort, i.e. uses double
// indirection. Time is O(1) to add to beginning or end.
// Time is linear to add pre-sorted items to an empty list.
// If unique, then don't add duplicate entries.
@ -513,7 +513,7 @@ CLIST_LINK *CLIST_ITERATOR::extract_sublist( //from
temp_it.mark_cycle_pt ();
do { //walk sublist
if (temp_it.cycled_list ()) //cant find end pt
if (temp_it.cycled_list ()) //can't find end pt
BAD_SUBLIST.error ("CLIST_ITERATOR.extract_sublist", ABORT, NULL);
if (temp_it.at_last ()) {

View File

@ -51,11 +51,11 @@ class DLLSYM CLIST_LINK
}
CLIST_LINK( //copy constructor
const CLIST_LINK &) { //dont copy link
const CLIST_LINK &) { //don't copy link
data = next = NULL;
}
void operator= ( //dont copy links
void operator= ( //don't copy links
const CLIST_LINK &) {
data = next = NULL;
}
@ -89,7 +89,7 @@ class DLLSYM CLIST
void internal_deep_clear ( //destroy all links
void (*zapper) (void *)); //ptr to zapper functn
void shallow_clear(); //clear list but dont
void shallow_clear(); //clear list but don't
//delete data elements
bool empty() const { //is list empty?
@ -117,7 +117,7 @@ class DLLSYM CLIST
// Assuming list has been sorted already, insert new_data to
// keep the list sorted according to the same comparison function.
// Comparision function is the same as used by sort, i.e. uses double
// Comparison function is the same as used by sort, i.e. uses double
// indirection. Time is O(1) to add to beginning or end.
// Time is linear to add pre-sorted items to an empty list.
// If unique, then don't add duplicate entries.
@ -232,7 +232,7 @@ class DLLSYM CLIST_ITERATOR
BOOL8 cycled_list(); //Completed a cycle?
void add_to_end( //add at end &
void *new_data); //dont move
void *new_data); //don't move
void exchange( //positions of 2 links
CLIST_ITERATOR *other_it); //other iterator
@ -437,7 +437,7 @@ inline void CLIST_ITERATOR::add_before_then_move( // element to add
/***********************************************************************
* CLIST_ITERATOR::add_before_stay_put
*
* Add a new element to the list before the current element but dont move the
* Add a new element to the list before the current element but don't move the
* iterator to the new element.
**********************************************************************/
@ -485,7 +485,7 @@ inline void CLIST_ITERATOR::add_before_stay_put( // element to add
/***********************************************************************
* CLIST_ITERATOR::add_list_after
*
* Insert another list to this list after the current element but dont move the
* Insert another list to this list after the current element but don't move the
* iterator.
**********************************************************************/
@ -836,7 +836,7 @@ Replace <parm> with "<parm>". <parm> may be an arbitrary number of tokens
CLASSNAME is assumed to be the name of a class to be used in a CONS list
NOTE: Because we dont use virtual functions in the list code, the list code
NOTE: Because we don't use virtual functions in the list code, the list code
will NOT work correctly for classes derived from this.
The macro generates:
@ -885,7 +885,7 @@ public: \
CLASSNAME##_CLIST():CLIST() {} \
/* constructor */ \
\
CLASSNAME##_CLIST( /* dont construct */ \
CLASSNAME##_CLIST( /* don't construct */ \
const CLASSNAME##_CLIST&) /*by initial assign*/ \
{ DONT_CONSTRUCT_LIST_BY_COPY.error( QUOTE_IT( CLASSNAME##_CLIST ), \
ABORT, NULL ); } \
@ -963,7 +963,7 @@ CLISTIZEH_C( CLASSNAME )
* A function which can delete a CLASSNAME element. This is passed to the \
* generic deep_clear list member function so that when a list is cleared the \
* elements on the list are properly destroyed from the base class, even \
* though we dont use a virtual destructor function. \
* though we don't use a virtual destructor function. \
**********************************************************************/ \
\
DLLSYM void CLASSNAME##_c1_zapper( /*delete a link*/ \

View File

@ -117,7 +117,7 @@ inT32 ELIST::length() const { // count elements
* ELIST::sort
*
* Sort elements on list
* NB If you dont like the const declarations in the comparator, coerce yours:
* NB If you don't like the const declarations in the comparator, coerce yours:
* ( int (*)(const void *, const void *)
**********************************************************************/
@ -161,7 +161,7 @@ const void *, const void *)) {
// Assuming list has been sorted already, insert new_link to
// keep the list sorted according to the same comparison function.
// Comparision function is the same as used by sort, i.e. uses double
// Comparison function is the same as used by sort, i.e. uses double
// indirection. Time is O(1) to add to beginning or end.
// Time is linear to add pre-sorted items to an empty list.
// If unique is set to true and comparator() returns 0 (an entry with the
@ -455,7 +455,7 @@ ELIST_LINK *ELIST_ITERATOR::extract_sublist( //from
temp_it.mark_cycle_pt ();
do { //walk sublist
if (temp_it.cycled_list ()) //cant find end pt
if (temp_it.cycled_list ()) //can't find end pt
BAD_SUBLIST.error ("ELIST_ITERATOR.extract_sublist", ABORT, NULL);
if (temp_it.at_last ()) {

View File

@ -67,7 +67,7 @@ The implementation of lists is very careful about space and speed overheads.
This is why many embedded lists are provided. The same concerns mean that
in-line type coercion is done, rather than use virtual functions. This is
cumbersome in that each data type to be listed requires its own iterator and
list class - though macros can gererate these. It also prevents heterogenous
list class - though macros can generate these. It also prevents heterogeneous
lists.
**********************************************************************/
@ -98,7 +98,7 @@ class DLLSYM ELIST_LINK
next = NULL;
}
void operator= ( //dont copy links
void operator= ( //don't copy links
const ELIST_LINK &) {
next = NULL;
}
@ -158,7 +158,7 @@ class DLLSYM ELIST
// Assuming list has been sorted already, insert new_link to
// keep the list sorted according to the same comparison function.
// Comparision function is the same as used by sort, i.e. uses double
// Comparison function is the same as used by sort, i.e. uses double
// indirection. Time is O(1) to add to beginning or end.
// Time is linear to add pre-sorted items to an empty list.
// If unique is set to true and comparator() returns 0 (an entry with the
@ -274,7 +274,7 @@ class DLLSYM ELIST_ITERATOR
bool cycled_list(); //Completed a cycle?
void add_to_end( //add at end &
ELIST_LINK *new_link); //dont move
ELIST_LINK *new_link); //don't move
void exchange( //positions of 2 links
ELIST_ITERATOR *other_it); //other iterator
@ -470,7 +470,7 @@ inline void ELIST_ITERATOR::add_before_then_move( // element to add
/***********************************************************************
* ELIST_ITERATOR::add_before_stay_put
*
* Add a new element to the list before the current element but dont move the
* Add a new element to the list before the current element but don't move the
* iterator to the new element.
**********************************************************************/
@ -515,7 +515,7 @@ inline void ELIST_ITERATOR::add_before_stay_put( // element to add
/***********************************************************************
* ELIST_ITERATOR::add_list_after
*
* Insert another list to this list after the current element but dont move the
* Insert another list to this list after the current element but don't move the
* iterator.
**********************************************************************/
@ -868,7 +868,7 @@ Replace <parm> with "<parm>". <parm> may be an arbitrary number of tokens
CLASSNAME is assumed to be the name of a class which has a baseclass of
ELIST_LINK.
NOTE: Because we dont use virtual functions in the list code, the list code
NOTE: Because we don't use virtual functions in the list code, the list code
will NOT work correctly for classes derived from this.
The macros generate:
@ -999,7 +999,7 @@ ELISTIZEH_C( CLASSNAME )
* A function which can delete a CLASSNAME element. This is passed to the \
* generic clear list member function so that when a list is cleared the \
* elements on the list are properly destroyed from the base class, even \
* though we dont use a virtual destructor function. \
* though we don't use a virtual destructor function. \
**********************************************************************/ \
\
DLLSYM void CLASSNAME##_zapper(ELIST_LINK* link) { \

View File

@ -118,7 +118,7 @@ inT32 ELIST2::length() const { // count elements
* ELIST2::sort
*
* Sort elements on list
* NB If you dont like the const declarations in the comparator, coerce yours:
* NB If you don't like the const declarations in the comparator, coerce yours:
* ( int (*)(const void *, const void *)
**********************************************************************/
@ -162,7 +162,7 @@ const void *, const void *)) {
// Assuming list has been sorted already, insert new_link to
// keep the list sorted according to the same comparison function.
// Comparision function is the same as used by sort, i.e. uses double
// Comparison function is the same as used by sort, i.e. uses double
// indirection. Time is O(1) to add to beginning or end.
// Time is linear to add pre-sorted items to an empty list.
void ELIST2::add_sorted(int comparator(const void*, const void*),
@ -475,7 +475,7 @@ ELIST2_LINK *ELIST2_ITERATOR::extract_sublist( //fr
temp_it.mark_cycle_pt ();
do { //walk sublist
if (temp_it.cycled_list ()) //cant find end pt
if (temp_it.cycled_list ()) //can't find end pt
BAD_SUBLIST.error ("ELIST2_ITERATOR.extract_sublist", ABORT, NULL);
if (temp_it.at_last ()) {

View File

@ -69,11 +69,11 @@ class DLLSYM ELIST2_LINK
}
ELIST2_LINK( //copy constructor
const ELIST2_LINK &) { //dont copy link
const ELIST2_LINK &) { //don't copy link
prev = next = NULL;
}
void operator= ( //dont copy links
void operator= ( //don't copy links
const ELIST2_LINK &) {
prev = next = NULL;
}
@ -133,7 +133,7 @@ class DLLSYM ELIST2
// Assuming list has been sorted already, insert new_link to
// keep the list sorted according to the same comparison function.
// Comparision function is the same as used by sort, i.e. uses double
// Comparison function is the same as used by sort, i.e. uses double
// indirection. Time is O(1) to add to beginning or end.
// Time is linear to add pre-sorted items to an empty list.
void add_sorted(int comparator(const void*, const void*),
@ -241,7 +241,7 @@ class DLLSYM ELIST2_ITERATOR
BOOL8 cycled_list(); //Completed a cycle?
void add_to_end( //add at end &
ELIST2_LINK *new_link); //dont move
ELIST2_LINK *new_link); //don't move
void exchange( //positions of 2 links
ELIST2_ITERATOR *other_it); //other iterator
@ -450,7 +450,7 @@ inline void ELIST2_ITERATOR::add_before_then_move( // element to add
/***********************************************************************
* ELIST2_ITERATOR::add_before_stay_put
*
* Add a new element to the list before the current element but dont move the
* Add a new element to the list before the current element but don't move the
* iterator to the new element.
**********************************************************************/
@ -500,7 +500,7 @@ inline void ELIST2_ITERATOR::add_before_stay_put( // element to add
/***********************************************************************
* ELIST2_ITERATOR::add_list_after
*
* Insert another list to this list after the current element but dont move the
* Insert another list to this list after the current element but don't move the
* iterator.
**********************************************************************/
@ -883,7 +883,7 @@ Replace <parm> with "<parm>". <parm> may be an arbitrary number of tokens
CLASSNAME is assumed to be the name of a class which has a baseclass of
ELIST2_LINK.
NOTE: Because we dont use virtual functions in the list code, the list code
NOTE: Because we don't use virtual functions in the list code, the list code
will NOT work correctly for classes derived from this.
The macro generates:
@ -927,7 +927,7 @@ public: \
CLASSNAME##_LIST():ELIST2() {} \
/* constructor */ \
\
CLASSNAME##_LIST( /* dont construct */ \
CLASSNAME##_LIST( /* don't construct */ \
const CLASSNAME##_LIST&) /*by initial assign*/\
{ DONT_CONSTRUCT_LIST_BY_COPY.error( QUOTE_IT( CLASSNAME##_LIST ), \
ABORT, NULL ); } \
@ -1015,7 +1015,7 @@ ELIST2IZEH_C( CLASSNAME )
* A function which can delete a CLASSNAME element. This is passed to the \
* generic clear list member function so that when a list is cleared the \
* elements on the list are properly destroyed from the base class, even \
* though we dont use a virtual destructor function. \
* though we don't use a virtual destructor function. \
**********************************************************************/ \
\
DLLSYM void CLASSNAME##_zapper( /*delete a link*/ \

View File

@ -53,7 +53,7 @@ enum TessErrorLogCode {
#define LOC_DOC_BLK_REJ 22
#define LOC_WRITE_RESULTS 23
#define LOC_ADAPTIVE 24
/* DONT DEFINE ANY LOCATION > 31 !!! */
/* DON'T DEFINE ANY LOCATION > 31 !!! */
/* Sub location determines whether pass2 was in normal mode or fix xht mode*/
#define SUBLOC_NORM 0

View File

@ -949,7 +949,7 @@ bool GenericVector<T>::SerializeClasses(tesseract::TFile* fp) const {
// Reads a vector of classes from the given file. Assumes the existence of
// bool T::Deserialize(bool swap, FILE* fp) that returns false in case of
// error. Alse needs T::T() and T::T(constT&), as init_to_size is used in
// error. Also needs T::T() and T::T(constT&), as init_to_size is used in
// this function. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
template <typename T>

View File

@ -61,8 +61,8 @@ class TRand {
private:
// Steps the generator to the next value.
void Iterate() {
seed_ *= 6364136223846793005;
seed_ += 1442695040888963407;
seed_ *= 6364136223846793005ULL;
seed_ += 1442695040888963407ULL;
}
// The current value of the seed.

View File

@ -38,6 +38,6 @@ const ERRCODE NULL_PREV = "Previous element on the list is NULL";
const ERRCODE EMPTY_LIST = "List is empty";
const ERRCODE BAD_PARAMETER = "List parameter error";
const ERRCODE STILL_LINKED =
"Attemting to add an element with non NULL links, to a list";
"Attempting to add an element with non NULL links, to a list";
#endif
#endif

View File

@ -21,7 +21,7 @@
* the HP OCR interface.
* The code is designed to be used with either a C or C++ compiler.
* The structures are designed to allow them to be used with any
* structure alignment upto 8.
* structure alignment up to 8.
**********************************************************************/
#ifndef CCUTIL_OCRCLASS_H_

View File

@ -47,7 +47,7 @@
#define SIGNED signed
#endif
#ifdef _WIN32
#if defined(_WIN32) || defined(__CYGWIN__)
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

View File

@ -45,7 +45,7 @@ const int kMaxDoubleSize = 15;
*
* The collection of MACROS provide different implementations depending
* on whether the string keeps track of its strlen or not so that this
* feature can be added in later when consumers dont modifify the string
* feature can be added in later when consumers don't modify the string
**********************************************************************/
// Smallest string to allocate by default
@ -339,7 +339,7 @@ STRING& STRING::operator=(const STRING& str) {
const STRING_HEADER* str_header = str.GetHeader();
int str_used = str_header->used_;
GetHeader()->used_ = 0; // clear since ensure doesnt need to copy data
GetHeader()->used_ = 0; // clear since ensure doesn't need to copy data
char* this_cstr = ensure_cstr(str_used);
STRING_HEADER* this_header = GetHeader();
@ -398,7 +398,7 @@ STRING & STRING::operator=(const char* cstr) {
if (cstr) {
int len = strlen(cstr) + 1;
this_header->used_ = 0; // dont bother copying data if need to realloc
this_header->used_ = 0; // don't bother copying data if need to realloc
char* this_cstr = ensure_cstr(len);
this_header = GetHeader(); // for realloc
memcpy(this_cstr, cstr, len);
@ -416,7 +416,7 @@ STRING & STRING::operator=(const char* cstr) {
void STRING::assign(const char *cstr, int len) {
STRING_HEADER* this_header = GetHeader();
this_header->used_ = 0; // dont bother copying data if need to realloc
this_header->used_ = 0; // don't bother copying data if need to realloc
char* this_cstr = ensure_cstr(len + 1); // +1 for '\0'
this_header = GetHeader(); // for realloc

View File

@ -51,7 +51,7 @@ bool TessdataManager::Init(const char *data_file_name, int debug_level) {
sizeof(actual_tessdata_num_entries_));
}
if (actual_tessdata_num_entries_ > TESSDATA_NUM_ENTRIES) {
// For forward compatability, truncate to the number we can handle.
// For forward compatibility, truncate to the number we can handle.
actual_tessdata_num_entries_ = TESSDATA_NUM_ENTRIES;
}
fread(offset_table_, sizeof(inT64),

View File

@ -282,7 +282,7 @@ class TessdataManager {
* same or smaller than TESSDATA_NUM_ENTRIES, but can never be larger,
* since then it would be impossible to interpret the type of tessdata at
* indices same and higher than TESSDATA_NUM_ENTRIES.
* This parameter is used to allow for backward compatiblity
* This parameter is used to allow for backward compatibility
* when new tessdata types are introduced.
*/
inT32 actual_tessdata_num_entries_;

View File

@ -515,7 +515,7 @@ void Classify::EndAdaptiveClassifier() {
* load_pre_trained_templates Indicates whether the pre-trained
* templates (inttemp, normproto and pffmtable components)
* should be loaded. Should only be set to true if the
* necesary classifier components are present in the
* necessary classifier components are present in the
* [lang].traineddata file.
* Globals:
* BuiltInTemplatesFile file to get built-in temps from
@ -1720,7 +1720,7 @@ bool Classify::LooksLikeGarbage(TBLOB *blob) {
*
* Globals:
*
* @return Number of features extracted or 0 if an error occured.
* @return Number of features extracted or 0 if an error occurred.
* @note Exceptions: none
* @note History: Tue May 28 10:40:52 1991, DSJ, Created.
*/
@ -2082,7 +2082,7 @@ void Classify::PrintAdaptiveMatchResults(const ADAPT_RESULTS& results) {
/*---------------------------------------------------------------------------*/
/**
* This routine steps thru each matching class in Results
* This routine steps through each matching class in Results
* and removes it from the match list if its rating
* is worse than the BestRating plus a pad. In other words,
* all good matches get moved to the front of the classes

View File

@ -151,7 +151,7 @@ Classify::Classify()
INT_MEMBER(classify_integer_matcher_multiplier, 10,
"Integer Matcher Multiplier 0-255: ", this->params()),
EnableLearning(true),
INT_MEMBER(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word",
INT_MEMBER(il1_adaption_test, 0, "Don't adapt to i/I at beginning of word",
this->params()),
BOOL_MEMBER(classify_bln_numeric_mode, 0,
"Assume the input is numbers [0-9].", this->params()),

View File

@ -495,7 +495,7 @@ class Classify : public CCStruct {
// font combinations that the shape represents.
UnicityTable<FontSet> fontset_table_;
INT_VAR_H(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word");
INT_VAR_H(il1_adaption_test, 0, "Don't adapt to i/I at beginning of word");
BOOL_VAR_H(classify_bln_numeric_mode, 0,
"Assume the input is numbers [0-9].");
double_VAR_H(speckle_large_max_size, 0.30, "Max large speckle size");

View File

@ -182,7 +182,7 @@ struct BUCKETS {
FLOAT64 ChiSquared; // test threshold
uinT16 NumberOfBuckets; // number of cells in histogram
uinT16 Bucket[BUCKETTABLESIZE];// mapping to histogram buckets
uinT32 *Count; // frequency of occurence histogram
uinT32 *Count; // frequency of occurrence histogram
FLOAT32 *ExpectedCount; // expected histogram
};

View File

@ -24,7 +24,7 @@
#include <stdio.h>
/*-------------------------------------------------------------------------
Public Funtion Prototype
Public Function Prototype
--------------------------------------------------------------------------*/
uinT16 ReadSampleSize(FILE *File);

View File

@ -285,7 +285,7 @@ CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs,
/*---------------------------------------------------------------------------*/
/**
* Search thru all features currently defined and return
* Search through all features currently defined and return
* the feature type for the feature with the specified short
* name. Trap an error if the specified name is not found.
*

View File

@ -44,7 +44,7 @@ using tesseract::TrainingSample;
// The entries are in binary degrees where a full circle is 256 binary degrees.
static float cos_table[INT_CHAR_NORM_RANGE];
static float sin_table[INT_CHAR_NORM_RANGE];
// Guards write access to AtanTable so we dont create it more than once.
// Guards write access to AtanTable so we don't create it more than once.
tesseract::CCUtilMutex atan_table_mutex;

View File

@ -521,7 +521,7 @@ bool KDTreeSearch::BoxIntersectsSearch(FLOAT32 *lower, FLOAT32 *upper) {
* Walk a tree, calling action once on each node.
*
* Operation:
* This routine walks thru the specified sub_tree and invokes action
* This routine walks through the specified sub_tree and invokes action
* at each node as follows:
* action(context, data, level)
* data the data contents of the node being visited,

View File

@ -104,7 +104,7 @@ LIST ConvertOutlines(TESSLINE *outline,
/*---------------------------------------------------------------------------*/
/**
* This routine searches thru the specified outline, computes
* This routine searches through the specified outline, computes
* a slope for each vector in the outline, and marks each
* vector as having one of the following directions:
* N, S, E, W, NE, NW, SE, SW
@ -182,7 +182,7 @@ void FreeOutlines(LIST Outlines) {
/*---------------------------------------------------------------------------*/
/**
* This routine searches thru the specified outline and finds
* This routine searches through the specified outline and finds
* the points at which the outline changes direction. These
* points are then marked as "extremities". This routine is
* used as an alternative to FindExtremities(). It forces the

View File

@ -147,7 +147,7 @@ void ConvertSegmentToPicoFeat(FPOINT *Start,
/*---------------------------------------------------------------------------*/
/**
* This routine steps thru the specified outline and cuts it
* This routine steps through the specified outline and cuts it
* up into pieces of equal length. These pieces become the
* desired pico-features. Each segment in the outline
* is converted into an integral number of pico-features.

View File

@ -93,7 +93,7 @@ void BeamSearch::CreateChildren(SearchColumn *out_col, LangModel *lang_mod,
} // lm_edges
}
// Performs a beam seach in the specified search using the specified
// Performs a beam search in the specified search using the specified
// language model; returns an alternate list of possible words as a result.
WordAltList * BeamSearch::Search(SearchObject *srch_obj, LangModel *lang_mod) {
// verifications

View File

@ -45,7 +45,7 @@ class BeamSearch {
public:
explicit BeamSearch(CubeRecoContext *cntxt, bool word_mode = true);
~BeamSearch();
// Performs a beam seach in the specified search using the specified
// Performs a beam search in the specified search using the specified
// language model; returns an alternate list of possible words as a result.
WordAltList *Search(SearchObject *srch_obj, LangModel *lang_mod = NULL);
// Returns the best node in the last column of last performed search.

View File

@ -72,7 +72,7 @@ bool ConvNetCharClassifier::Train(CharSamp *char_samp, int ClassID) {
/**
* A secondary function needed for training. Allows the trainer to set the
* value of any train-time paramter. This function is currently not
* value of any train-time parameter. This function is currently not
* implemented. TODO(ahmadab): implement end-2-end training
*/
bool ConvNetCharClassifier::SetLearnParam(char *var_name, float val) {

View File

@ -55,7 +55,7 @@ class ConvNetCharClassifier : public CharClassifier {
// is currently not implemented. TODO(ahmadab): implement end-2-end training
virtual bool Train(CharSamp *char_samp, int ClassID);
// A secondary function needed for training. Allows the trainer to set the
// value of any train-time paramter. This function is currently not
// value of any train-time parameter. This function is currently not
// implemented. TODO(ahmadab): implement end-2-end training
virtual bool SetLearnParam(char *var_name, float val);
// Externally sets the Neural Net used by the classifier. Used for training

View File

@ -247,7 +247,7 @@ int CubeLineObject::ComputeWordBreakThreshold(int con_comp_cnt,
word_break_threshold--;
} while (!valid && word_break_threshold > 0);
// failed to find a threshold that acheives the target aspect ratio.
// failed to find a threshold that achieves the target aspect ratio.
// Just use the default threshold
return static_cast<int>(line_pix_->h *
cntxt_->Params()->MaxSpaceHeightRatio());

View File

@ -237,7 +237,7 @@ Pixa *CubeLineSegmenter::CrackLine(Pix *cracked_line_pix,
return NULL;
}
// split a line continously until valid or fail
// split a line continuously until valid or fail
Pixa *CubeLineSegmenter::SplitLine(Pix *line_mask_pix, Box *line_box) {
// clone the line mask
Pix *line_pix = pixClone(line_mask_pix);
@ -739,7 +739,7 @@ bool CubeLineSegmenter::LineSegment() {
return true;
}
// Estimate the paramters of the font(s) used in the page
// Estimate the parameters of the font(s) used in the page
bool CubeLineSegmenter::EstimateFontParams() {
int hgt_hist[kHgtBins];
int max_hgt;

View File

@ -212,7 +212,7 @@ CharSamp *CubeSearchObject::CharSample(int start_pt, int end_pt) {
samp->SetLastChar(last_char ? 255 : 0);
} else {
// for non cursive languages, these features correspond
// to whether the charsamp is at the begining or end of the word
// to whether the charsamp is at the beginning or end of the word
samp->SetFirstChar((start_pt == -1) ? 255 : 0);
samp->SetLastChar((end_pt == (segment_cnt_ - 1)) ? 255 : 0);
}

View File

@ -114,7 +114,7 @@ class CubeSearchObject : public SearchObject {
end_pt <= (start_pt + max_seg_per_char_));
}
// computes the space and no space costs at gaps between segments
// return true on sucess
// return true on success
bool ComputeSpaceCosts();
};
}

View File

@ -72,7 +72,7 @@ bool HybridNeuralNetCharClassifier::Train(CharSamp *char_samp, int ClassID) {
}
// A secondary function needed for training. Allows the trainer to set the
// value of any train-time paramter. This function is currently not
// value of any train-time parameter. This function is currently not
// implemented. TODO(ahmadab): implement end-2-end training
bool HybridNeuralNetCharClassifier::SetLearnParam(char *var_name, float val) {
// TODO(ahmadab): implementation of parameter initializing.
@ -151,7 +151,7 @@ bool HybridNeuralNetCharClassifier::RunNets(CharSamp *char_samp) {
return false;
}
// go thru all the nets
// go through all the nets
memset(net_output_, 0, class_cnt * sizeof(*net_output_));
float *inputs = net_input_;
for (int net_idx = 0; net_idx < nets_.size(); net_idx++) {

View File

@ -48,7 +48,7 @@ class HybridNeuralNetCharClassifier : public CharClassifier {
// is currently not implemented. TODO(ahmadab): implement end-2-end training
virtual bool Train(CharSamp *char_samp, int ClassID);
// A secondary function needed for training. Allows the trainer to set the
// value of any train-time paramter. This function is currently not
// value of any train-time parameter. This function is currently not
// implemented. TODO(ahmadab): implement end-2-end training
virtual bool SetLearnParam(char *var_name, float val);
// Externally sets the Neural Net used by the classifier. Used for training

View File

@ -397,7 +397,7 @@ int TessLangModel::NumberEdges(EDGE_REF edge_ref, LangModEdge **edge_array) {
return 0;
}
// go thru all valid transitions from the state
// go through all valid transitions from the state
int edge_cnt = 0;
EDGE_REF new_edge_ref;

View File

@ -37,7 +37,7 @@
#include "oldlist.h"
/*----------------------------------------------------------------------------
Public Funtion Prototypes
Public Function Prototypes
--------------------------------------------------------------------------*/
LIST read_list(const char *filename);
#endif

View File

@ -407,7 +407,7 @@ LIST s_adjoin(LIST var_list, void *variable, int_compare compare) {
*
* Search list, return NIL_LIST if not found. Return the list starting from
* the item if found. The compare routine "is_equal" is passed in as
* the third paramter to this routine. If the value NULL is supplied
* the third parameter to this routine. If the value NULL is supplied
* for is_equal, the is_key routine will be used.
**********************************************************************/
LIST search(LIST list, void *key, int_compare is_equal) {

View File

@ -234,7 +234,7 @@ first_node (list_rest (l))
first_node (list_rest (list_rest (l)))
/*----------------------------------------------------------------------
Public Funtion Prototypes
Public Function Prototypes
----------------------------------------------------------------------*/
int count(LIST var_list);

View File

@ -33,7 +33,7 @@ static const int kMinAbsoluteGarbageWordLength = 10;
static const float kMinAbsoluteGarbageAlphanumFrac = 0.5f;
const int case_state_table[6][4] = { {
/* 0. Begining of word */
/* 0. Beginning of word */
/* P U L D */
/* -1. Error on case */
0, 1, 5, 4

View File

@ -447,7 +447,7 @@ class SquishedDawg : public Dawg {
EDGE_REF edge = node;
if (!edge_occupied(edge) || edge == NO_EDGE) return;
assert(forward_edge(edge)); // we don't expect any backward edges to
do { // be present when this funciton is called
do { // be present when this function is called
if (!word_end || end_of_word_from_edge_rec(edges_[edge])) {
vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
}

View File

@ -127,7 +127,7 @@ Dict::Dict(CCUtil* ccutil)
" when there is a need to explore all segmentations",
getCCUtil()->params()),
BOOL_MEMBER(save_raw_choices, false,
"Deprecated- backward compatablity only",
"Deprecated- backward compatibility only",
getCCUtil()->params()),
INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
"Max words to keep in list",

View File

@ -614,7 +614,7 @@ class Dict {
"Make AcceptableChoice() always return false. Useful"
" when there is a need to explore all segmentations");
BOOL_VAR_H(save_raw_choices, false,
"Deprecated- backward compatability only");
"Deprecated- backward compatibility only");
INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information"
" should be printed to stdout");

View File

@ -303,7 +303,7 @@ void Dict::append_choices(
*
* The given prev_char_frag_info contains:
* - fragment: if not NULL contains information about immediately
* preceeding fragmented character choice
* preceding fragmented character choice
* - num_fragments: number of fragments that have been used so far
* to construct a character
* - certainty: certainty of the current choice or minimum

View File

@ -1657,7 +1657,7 @@ EXTRA_PACKAGES =
# following commands have a special meaning inside the header: $title,
# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
# $projectbrief, $projectlogo. Doxygen will replace $title with the empty string,
# for the replacement values of the other commands the user is refered to
# for the replacement values of the other commands the user is referred to
# HTML_HEADER.
# This tag requires that the tag GENERATE_LATEX is set to YES.

View File

@ -42,18 +42,22 @@ SCROLLVIEW_LIBS = \
CLASSPATH = $(srcdir)/piccolo2d-core-3.0.jar:$(srcdir)/piccolo2d-extras-3.0.jar
ScrollView.jar : $(SCROLLVIEW_CLASSES)
$(JAR) cf $@ com/google/scrollview/*.class \
$(JAR) cfm $@ Manifest.txt com/google/scrollview/*.class \
com/google/scrollview/events/*.class com/google/scrollview/ui/*.class
$(SCROLLVIEW_CLASSES) : $(SCROLLVIEW_FILES)
$(JAVAC) -encoding UTF8 -sourcepath $(srcdir) -classpath $(CLASSPATH) $(SCROLLVIEW_FILES) -d $(builddir)
# Download the Piccolo2D jars referenced by CLASSPATH into the current
# directory.
#   -f  makes curl exit non-zero on an HTTP error instead of saving the
#       server's error page under a .jar name;
#   -L  follows redirects;
#   -o  writes straight to the target file (no shell redirection).
# The URLs are quoted because they contain '?', which the shell would
# otherwise treat as a glob character.
.PHONY: fetch-jars
fetch-jars :
	curl -fL -o piccolo2d-core-3.0.jar "https://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-core/3.0/piccolo2d-core-3.0.jar"
	curl -fL -o piccolo2d-extras-3.0.jar "https://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-extras/3.0/piccolo2d-extras-3.0.jar"
.PHONY: install-jars
install-jars : ScrollView.jar
@if [ ! -d $(scrollview_path) ]; then mkdir -p $(scrollview_path); fi;
$(INSTALL) -m 644 $(SCROLLVIEW_LIBS) $(scrollview_path);
$(INSTALL) -m 644 ScrollView.jar $(scrollview_path);
@echo "Don't forget to set eviroment variable SCROLLVIEW_PATH to $(scrollview_path)";
@echo "Don't forget to set environment variable SCROLLVIEW_PATH to $(scrollview_path)";
uninstall:
rm -f $(scrollview_path)/*.jar

2
java/Manifest.txt Normal file
View File

@ -0,0 +1,2 @@
Main-Class: com.google.scrollview.ScrollView
Class-Path: ScrollView.jar piccolo2d-core-3.0.jar piccolo2d-extras-3.0.jar

View File

@ -50,7 +50,7 @@ public class SVMenuBar implements ActionListener {
/**
* A click on one of the items in our menubar has occured. Forward it
* A click on one of the items in our menubar has occurred. Forward it
* to the item itself to let it decide what happens.
*/
public void actionPerformed(ActionEvent e) {
@ -111,7 +111,7 @@ public class SVMenuBar implements ActionListener {
* @param name The caption of the new entry.
* @param id The Id of the new entry. If it is -1, the entry will be treated
* as a menu.
* @param b Whether the entry is initally flagged.
* @param b Whether the entry is initially flagged.
*
*/

View File

@ -123,7 +123,7 @@ public class SVPopupMenu implements ActionListener {
/**
* A click on one of the items in our menubar has occured. Forward it
* A click on one of the items in our menubar has occurred. Forward it
* to the item itself to let it decide what happens.
*/
public void actionPerformed(ActionEvent e) {

View File

@ -298,7 +298,7 @@ public class SVWindow extends JFrame {
ta.setEditable(false);
getContentPane().add(ta, BorderLayout.SOUTH);
}
// We need to make the window bigger to accomodate the message box.
// We need to make the window bigger to accommodate the message box.
winSizeY += DEF_MESSAGEBOX_HEIGHT;
setSize(winSizeX, winSizeY);
}

View File

@ -2,113 +2,177 @@
# tesseract
#
add_definitions(-DPANGO_ENABLE_ENGINE)
if (STATIC OR NOT (WIN32 OR CYGWIN))
########################################
# LIBRARY tessopt
########################################
add_library (tessopt tessopt.cpp tessopt.h)
project_group (tessopt "Training Tools")
add_library (tessopt tessopt.cpp tessopt.h)
project_group (tessopt "Training Tools")
########################################
# LIBRARY training
# LIBRARY common_training
########################################
set(training_src
boxchar.cpp commandlineflags.cpp commontraining.cpp degradeimage.cpp
fileio.cpp ligature_table.cpp normstrngs.cpp pango_font_info.cpp
stringrenderer.cpp tlog.cpp unicharset_training_utils.cpp
set(common_training_src
commandlineflags.cpp
commontraining.cpp
)
set(training_hdr
boxchar.h commandlineflags.h commontraining.h degradeimage.h
fileio.h icuerrorcode.h ligature_table.h normstrngs.h
mergenf.h pango_font_info.h stringrenderer.h
tessopt.h tlog.h unicharset_training_utils.h util.h
set(common_training_hdr
commandlineflags.h
commontraining.h
)
add_library (training ${training_src} ${training_hdr})
project_group (training "Training Tools")
add_library (common_training ${common_training_src} ${common_training_hdr})
target_link_libraries (common_training tesseract tessopt)
project_group (common_training "Training Tools")
########################################
# EXECUTABLE ambiguous_words
########################################
add_executable (ambiguous_words ambiguous_words.cpp)
target_link_libraries (ambiguous_words tesseract training tessopt)
project_group (ambiguous_words "Training Tools")
add_executable (ambiguous_words ambiguous_words.cpp)
target_link_libraries (ambiguous_words tesseract)
project_group (ambiguous_words "Training Tools")
########################################
# EXECUTABLE classifier_tester
########################################
add_executable (classifier_tester classifier_tester.cpp)
target_link_libraries (classifier_tester tesseract training tessopt)
project_group (classifier_tester "Training Tools")
add_executable (classifier_tester classifier_tester.cpp)
target_link_libraries (classifier_tester common_training)
project_group (classifier_tester "Training Tools")
########################################
# EXECUTABLE combine_tessdata
########################################
add_executable (combine_tessdata combine_tessdata.cpp)
target_link_libraries (combine_tessdata tesseract)
project_group (combine_tessdata "Training Tools")
add_executable (combine_tessdata combine_tessdata.cpp)
target_link_libraries (combine_tessdata tesseract)
project_group (combine_tessdata "Training Tools")
########################################
# EXECUTABLE cntraining
########################################
add_executable (cntraining cntraining.cpp)
target_link_libraries (cntraining tesseract training tessopt)
project_group (cntraining "Training Tools")
add_executable (cntraining cntraining.cpp)
target_link_libraries (cntraining common_training)
project_group (cntraining "Training Tools")
########################################
# EXECUTABLE dawg2wordlist
########################################
add_executable (dawg2wordlist dawg2wordlist.cpp)
target_link_libraries (dawg2wordlist tesseract training tessopt)
project_group (dawg2wordlist "Training Tools")
add_executable (dawg2wordlist dawg2wordlist.cpp)
target_link_libraries (dawg2wordlist tesseract)
project_group (dawg2wordlist "Training Tools")
########################################
# EXECUTABLE mftraining
########################################
add_executable (mftraining mftraining.cpp mergenf.cpp)
target_link_libraries (mftraining tesseract training tessopt)
project_group (mftraining "Training Tools")
########################################
# EXECUTABLE set_unicharset_properties
########################################
add_executable (set_unicharset_properties set_unicharset_properties.cpp)
target_link_libraries (set_unicharset_properties tesseract training tessopt ${ICU_LIBRARIES})
project_group (set_unicharset_properties "Training Tools")
add_executable (mftraining mftraining.cpp mergenf.cpp mergenf.h)
target_link_libraries (mftraining common_training)
project_group (mftraining "Training Tools")
########################################
# EXECUTABLE shapeclustering
########################################
add_executable (shapeclustering shapeclustering.cpp)
target_link_libraries (shapeclustering tesseract training tessopt)
project_group (shapeclustering "Training Tools")
add_executable (shapeclustering shapeclustering.cpp)
target_link_libraries (shapeclustering common_training)
project_group (shapeclustering "Training Tools")
########################################
# EXECUTABLE unicharset_extractor
########################################
add_executable (unicharset_extractor unicharset_extractor.cpp)
target_link_libraries (unicharset_extractor tesseract tessopt)
project_group (unicharset_extractor "Training Tools")
########################################
# EXECUTABLE wordlist2dawg
########################################
add_executable (wordlist2dawg wordlist2dawg.cpp)
target_link_libraries (wordlist2dawg tesseract)
project_group (wordlist2dawg "Training Tools")
########################################
# EXECUTABLE set_unicharset_properties
########################################
if (ICU_FOUND)
include_directories(${ICU_INCLUDE_DIRS})
add_executable (set_unicharset_properties
set_unicharset_properties.cpp
unicharset_training_utils.cpp
unicharset_training_utils.h
fileio.cpp
fileio.h
normstrngs.cpp
normstrngs.h
icuerrorcode.h
)
target_link_libraries (set_unicharset_properties common_training ${ICU_LIBRARIES})
project_group (set_unicharset_properties "Training Tools")
########################################
# EXECUTABLE text2image
########################################
add_executable (text2image text2image.cpp)
target_link_libraries (text2image tesseract training tessopt
if (PKG_CONFIG_FOUND)
pkg_check_modules(Pango REQUIRED pango)
pkg_check_modules(Cairo REQUIRED cairo)
pkg_check_modules(PangoFt2 REQUIRED pangoft2)
pkg_check_modules(PangoCairo REQUIRED pangocairo)
pkg_check_modules(FontConfig REQUIRED fontconfig)
set(text2image_src
text2image.cpp
boxchar.cpp
boxchar.h
degradeimage.cpp
degradeimage.h
fileio.cpp
fileio.h
ligature_table.cpp
ligature_table.h
normstrngs.cpp
normstrngs.h
pango_font_info.cpp
pango_font_info.h
stringrenderer.cpp
stringrenderer.h
tlog.cpp
tlog.h
util.h
icuerrorcode.h
)
if (CYGWIN)
set(text2image_src ${text2image_src} ../vs2010/port/strcasestr.cpp)
endif()
add_executable (text2image ${text2image_src})
target_include_directories (text2image BEFORE PRIVATE ${Cairo_INCLUDE_DIRS} ${Pango_INCLUDE_DIRS})
target_compile_definitions (text2image PRIVATE -DPANGO_ENABLE_ENGINE)
target_link_libraries (text2image tesseract common_training
${ICU_LIBRARIES}
${Pango_LIBRARIES}
${Cairo_LIBRARIES}
@ -116,25 +180,10 @@ target_link_libraries (text2image tesseract training tessopt
${PangoFt2_LIBRARIES}
${FontConfig_LIBRARIES}
)
project_group (text2image "Training Tools")
########################################
# EXECUTABLE unicharset_extractor
########################################
add_executable (unicharset_extractor unicharset_extractor.cpp)
target_link_libraries (unicharset_extractor tesseract tessopt)
project_group (unicharset_extractor "Training Tools")
########################################
# EXECUTABLE wordlist2dawg
########################################
add_executable (wordlist2dawg wordlist2dawg.cpp)
target_link_libraries (wordlist2dawg tesseract tessopt)
project_group (wordlist2dawg "Training Tools")
project_group (text2image "Training Tools")
endif(PKG_CONFIG_FOUND)
endif(ICU_FOUND)
endif(STATIC OR NOT (WIN32 OR CYGWIN))
###############################################################################

View File

@ -780,7 +780,7 @@ VERTICAL_FONTS=( \
# holds the text corpus file for the language, used in phase F
# ${FONTS[@]}
# holds a sequence of applicable fonts for the language, used in
# phase F & I
# phase F & I. Only set here if not already set (e.g. via --fontlist on the command line)
# ${TRAINING_DATA_ARGUMENTS}
# non-default arguments to the training_data program used in phase T
# ${FILTER_ARGUMENTS} -
@ -794,7 +794,6 @@ set_lang_specific_parameters() {
local lang=$1
# The default text location is now given directly from the language code.
TEXT_CORPUS="${FLAGS_webtext_prefix}/${lang}.corpus.txt"
FONTS=( "${LATIN_FONTS[@]}" )
FILTER_ARGUMENTS=""
WORDLIST2DAWG_ARGUMENTS=""
# These dawg factors represent the fraction of the corpus not covered by the
@ -816,30 +815,30 @@ set_lang_specific_parameters() {
case ${lang} in
# Latin languages.
enm ) TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported
FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
frm ) TEXT_CORPUS="${FLAGS_webtext_prefix}/fra.corpus.txt"
# Make long-s substitutions for Middle French text
FILTER_ARGUMENTS="--make_early_language_variant=fra"
TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported.
FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
frk ) TEXT_CORPUS="${FLAGS_webtext_prefix}/deu.corpus.txt"
FONTS=( "${FRAKTUR_FONTS[@]}" );;
test -z "$FONTS" && FONTS=( "${FRAKTUR_FONTS[@]}" );;
ita_old )
TEXT_CORPUS="${FLAGS_webtext_prefix}/ita.corpus.txt"
# Make long-s substitutions for Early Italian text
FILTER_ARGUMENTS="--make_early_language_variant=ita"
TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported.
FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
spa_old )
TEXT_CORPUS="${FLAGS_webtext_prefix}/spa.corpus.txt"
# Make long-s substitutions for Early Spanish text
FILTER_ARGUMENTS="--make_early_language_variant=spa"
TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported.
FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
srp_latn )
TEXT_CORPUS=${FLAGS_webtext_prefix}/srp.corpus.txt ;;
vie ) TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
FONTS=( "${VIETNAMESE_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${VIETNAMESE_FONTS[@]}" ) ;;
# Highly inflective languages get a bigger dawg size.
# TODO(rays) Add more here!
hun ) WORD_DAWG_SIZE=1000000 ;;
@ -899,14 +898,14 @@ set_lang_specific_parameters() {
# Strip unrenderable words as not all fonts will render the extended
# latin symbols found in Vietnamese text.
WORD_DAWG_SIZE=1000000
FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );;
# Cyrillic script-based languages.
rus ) FONTS=( "${RUSSIAN_FONTS[@]}" )
rus ) test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" )
NUMBER_DAWG_FACTOR=0.05
WORD_DAWG_SIZE=1000000 ;;
aze_cyrl | bel | bul | kaz | mkd | srp | tgk | ukr | uzb_cyrl )
FONTS=( "${RUSSIAN_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" ) ;;
# Special code for performing Cyrillic language-id that is trained on
# Russian, Serbian, Ukrainian, Belarusian, Macedonian, Tajik and Mongolian
@ -916,70 +915,70 @@ set_lang_specific_parameters() {
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
GENERATE_WORD_BIGRAMS=0
WORD_DAWG_SIZE=1000000
FONTS=( "${RUSSIAN_FONTS[@]}" );;
test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" );;
# South Asian scripts mostly have a lot of different graphemes, so trim
# down the MEAN_COUNT so as not to get a huge amount of text.
asm | ben )
MEAN_COUNT="15"
WORD_DAWG_FACTOR=0.15
FONTS=( "${BENGALI_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${BENGALI_FONTS[@]}" ) ;;
bih | hin | mar | nep | san )
MEAN_COUNT="15"
WORD_DAWG_FACTOR=0.15
FONTS=( "${DEVANAGARI_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${DEVANAGARI_FONTS[@]}" ) ;;
bod ) MEAN_COUNT="15"
WORD_DAWG_FACTOR=0.15
FONTS=( "${TIBETAN_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${TIBETAN_FONTS[@]}" ) ;;
dzo )
WORD_DAWG_FACTOR=0.01
FONTS=( "${TIBETAN_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${TIBETAN_FONTS[@]}" ) ;;
guj ) MEAN_COUNT="15"
WORD_DAWG_FACTOR=0.15
FONTS=( "${GUJARATI_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${GUJARATI_FONTS[@]}" ) ;;
kan ) MEAN_COUNT="15"
WORD_DAWG_FACTOR=0.15
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
FONTS=( "${KANNADA_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${KANNADA_FONTS[@]}" ) ;;
mal ) MEAN_COUNT="15"
WORD_DAWG_FACTOR=0.15
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
FONTS=( "${MALAYALAM_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${MALAYALAM_FONTS[@]}" ) ;;
ori )
WORD_DAWG_FACTOR=0.01
FONTS=( "${ORIYA_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${ORIYA_FONTS[@]}" ) ;;
pan ) MEAN_COUNT="15"
WORD_DAWG_FACTOR=0.01
FONTS=( "${PUNJABI_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${PUNJABI_FONTS[@]}" ) ;;
sin ) MEAN_COUNT="15"
WORD_DAWG_FACTOR=0.01
FONTS=( "${SINHALA_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${SINHALA_FONTS[@]}" ) ;;
tam ) MEAN_COUNT="30"
WORD_DAWG_FACTOR=0.15
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
FONTS=( "${TAMIL_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${TAMIL_FONTS[@]}" ) ;;
tel ) MEAN_COUNT="15"
WORD_DAWG_FACTOR=0.15
TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output"
TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5"
FONTS=( "${TELUGU_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${TELUGU_FONTS[@]}" ) ;;
# SouthEast Asian scripts.
khm ) MEAN_COUNT="15"
WORD_DAWG_FACTOR=0.15
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
FONTS=( "${KHMER_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${KHMER_FONTS[@]}" ) ;;
lao ) MEAN_COUNT="15"
WORD_DAWG_FACTOR=0.15
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
FONTS=( "${LAOTHIAN_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${LAOTHIAN_FONTS[@]}" ) ;;
mya ) MEAN_COUNT="12"
WORD_DAWG_FACTOR=0.15
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
FONTS=( "${BURMESE_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${BURMESE_FONTS[@]}" ) ;;
tha ) MEAN_COUNT="30"
WORD_DAWG_FACTOR=0.01
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
@ -987,7 +986,7 @@ set_lang_specific_parameters() {
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
AMBIGS_FILTER_DENOMINATOR="1000"
LEADING=48
FONTS=( "${THAI_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${THAI_FONTS[@]}" ) ;;
# CJK
chi_sim )
@ -998,7 +997,7 @@ set_lang_specific_parameters() {
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
FILTER_ARGUMENTS="--charset_filter=chi_sim --segmenter_lang=chi_sim"
FONTS=( "${CHI_SIM_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${CHI_SIM_FONTS[@]}" ) ;;
chi_tra )
MEAN_COUNT="15"
WORD_DAWG_FACTOR=0.015
@ -1006,14 +1005,14 @@ set_lang_specific_parameters() {
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
FILTER_ARGUMENTS="--charset_filter=chi_tra --segmenter_lang=chi_tra"
FONTS=( "${CHI_TRA_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${CHI_TRA_FONTS[@]}" ) ;;
jpn ) MEAN_COUNT="15"
WORD_DAWG_FACTOR=0.015
GENERATE_WORD_BIGRAMS=0
TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000"
TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams="
FILTER_ARGUMENTS="--charset_filter=jpn --segmenter_lang=jpn"
FONTS=( "${JPN_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${JPN_FONTS[@]}" ) ;;
kor ) MEAN_COUNT="20"
WORD_DAWG_FACTOR=0.015
NUMBER_DAWG_FACTOR=0.05
@ -1021,38 +1020,38 @@ set_lang_specific_parameters() {
TRAINING_DATA_ARGUMENTS+=" --desired_bigrams="
GENERATE_WORD_BIGRAMS=0
FILTER_ARGUMENTS="--charset_filter=kor --segmenter_lang=kor"
FONTS=( "${KOREAN_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${KOREAN_FONTS[@]}" ) ;;
# Middle-Eastern scripts.
ara ) FONTS=( "${ARABIC_FONTS[@]}" ) ;;
div ) FONTS=( "${THAANA_FONTS[@]}" ) ;;
ara ) test -z "$FONTS" && FONTS=( "${ARABIC_FONTS[@]}" ) ;;
div ) test -z "$FONTS" && FONTS=( "${THAANA_FONTS[@]}" ) ;;
fas | pus | snd | uig | urd )
FONTS=( "${PERSIAN_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${PERSIAN_FONTS[@]}" ) ;;
heb | yid )
NUMBER_DAWG_FACTOR=0.05
WORD_DAWG_FACTOR=0.08
FONTS=( "${HEBREW_FONTS[@]}" ) ;;
syr ) FONTS=( "${SYRIAC_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${HEBREW_FONTS[@]}" ) ;;
syr ) test -z "$FONTS" && FONTS=( "${SYRIAC_FONTS[@]}" ) ;;
# Other scripts.
amh | tir)
FONTS=( "${AMHARIC_FONTS[@]}" ) ;;
chr ) FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" \
test -z "$FONTS" && FONTS=( "${AMHARIC_FONTS[@]}" ) ;;
chr ) test -z "$FONTS" && FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" \
"Noto Sans Cherokee" \
) ;;
ell | grc )
NUMBER_DAWG_FACTOR=0.05
WORD_DAWG_FACTOR=0.08
FONTS=( "${GREEK_FONTS[@]}" ) ;;
hye ) FONTS=( "${ARMENIAN_FONTS[@]}" ) ;;
iku ) FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" ) ;;
kat) FONTS=( "${GEORGIAN_FONTS[@]}" ) ;;
test -z "$FONTS" && FONTS=( "${GREEK_FONTS[@]}" ) ;;
hye ) test -z "$FONTS" && FONTS=( "${ARMENIAN_FONTS[@]}" ) ;;
iku ) test -z "$FONTS" && FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" ) ;;
kat) test -z "$FONTS" && FONTS=( "${GEORGIAN_FONTS[@]}" ) ;;
kat_old)
TEXT_CORPUS="${FLAGS_webtext_prefix}/kat.corpus.txt"
FONTS=( "${OLD_GEORGIAN_FONTS[@]}" ) ;;
kir ) FONTS=( "${KYRGYZ_FONTS[@]}" )
test -z "$FONTS" && FONTS=( "${OLD_GEORGIAN_FONTS[@]}" ) ;;
kir ) test -z "$FONTS" && FONTS=( "${KYRGYZ_FONTS[@]}" )
TRAINING_DATA_ARGUMENTS=" --infrequent_ratio=100" ;;
kur ) FONTS=( "${KURDISH_FONTS[@]}" ) ;;
kur ) test -z "$FONTS" && FONTS=( "${KURDISH_FONTS[@]}" ) ;;
*) err "Error: ${lang} is not a valid language code"
esac
@ -1061,6 +1060,8 @@ set_lang_specific_parameters() {
elif [[ ! -z ${MEAN_COUNT} ]]; then
TRAINING_DATA_ARGUMENTS+=" --mean_count=${MEAN_COUNT}"
fi
# Default to Latin fonts if none have been set, neither by the caller
# (FONTS already populated before this function ran) nor by one of the
# language-specific case arms above.
test -z "$FONTS" && FONTS=( "${LATIN_FONTS[@]}" )
}
#=============================================================================

View File

@ -17,7 +17,6 @@
# USAGE:
#
# tesstrain.sh
# --bin_dir PATH # Location of training program.
# --fontlist FONTS_STR # A plus-separated list of fontnames to train on.
# --fonts_dir FONTS_PATH # Path to font files.
# --lang LANG_CODE # ISO 639 code.
@ -25,6 +24,7 @@
# --output_dir OUTPUTDIR # Location of output traineddata file.
# --overwrite # Safe to overwrite files in output_dir.
# --run_shape_clustering # Run shape clustering (use for Indic langs).
# --exposures EXPOSURES # A list of exposure levels to use (e.g. "-1 0 1").
#
# OPTIONAL flags for input data. If unspecified we will look for them in
# the langdata_dir directory.
@ -49,11 +49,8 @@ source `dirname $0`/tesstrain_utils.sh
ARGV=("$@")
parse_flags
tlog "\n=== Starting training for language '${LANG_CODE}'"
tlog "Cleaning workspace directory ${TRAINING_DIR}..."
mkdir -p ${TRAINING_DIR}
rm -fr ${TRAINING_DIR}/*
tlog "\n=== Starting training for language '${LANG_CODE}'"
source `dirname $0`/language-specific.sh
set_lang_specific_parameters ${LANG_CODE}

View File

@ -16,10 +16,6 @@
#
# USAGE: source tesstrain_utils.sh
FONTS=(
"Arial" \
"Times New Roman," \
)
if [ "$(uname)" == "Darwin" ];then
FONTS_DIR="/Library/Fonts/"
else
@ -29,7 +25,8 @@ OUTPUT_DIR="/tmp/tesstrain/tessdata"
OVERWRITE=0
RUN_SHAPE_CLUSTERING=0
EXTRACT_FONT_PROPERTIES=1
WORKSPACE_DIR="/tmp/tesstrain"
WORKSPACE_DIR=`mktemp -d`
EXPOSURES=0
# Logging helper functions.
tlog() {
@ -45,11 +42,11 @@ err_exit() {
# if the program file is not found.
# Usage: run_command CMD ARG1 ARG2...
run_command() {
local cmd=$1
shift
if [[ ! -x ${cmd} ]]; then
err_exit "File ${cmd} not found"
local cmd=`which $1`
if [[ -z ${cmd} ]]; then
err_exit "$1 not found"
fi
shift
tlog "[$(date)] ${cmd} $@"
${cmd} "$@" 2>&1 1>&2 | tee -a ${LOG_FILE}
# check completion status
@ -69,22 +66,6 @@ check_file_readable() {
done
}
# Set global path variables that are based on parsed flags.
set_prog_paths() {
if [[ -z ${BINDIR} ]]; then
err_exit "Need to specify location of program files"
fi
CN_TRAINING_EXE=${BINDIR}/cntraining
COMBINE_TESSDATA_EXE=${BINDIR}/combine_tessdata
MF_TRAINING_EXE=${BINDIR}/mftraining
SET_UNICHARSET_PROPERTIES_EXE=${BINDIR}/set_unicharset_properties
SHAPE_TRAINING_EXE=${BINDIR}/shapeclustering
TESSERACT_EXE=${BINDIR}/tesseract
TEXT2IMAGE_EXE=${BINDIR}/text2image
UNICHARSET_EXTRACTOR_EXE=${BINDIR}/unicharset_extractor
WORDLIST2DAWG_EXE=${BINDIR}/wordlist2dawg
}
# Sets the named variable to given value. Aborts if the value is missing or
# if it looks like a flag.
# Usage: parse_value VAR_NAME VALUE
@ -109,9 +90,6 @@ parse_flags() {
case ${ARGV[$i]} in
--)
break;;
--bin_dir)
parse_value "BINDIR" ${ARGV[$j]}
i=$j ;;
--fontlist) # Expect a plus-separated list of names
if [[ -z ${ARGV[$j]} ]] || [[ ${ARGV[$j]:0:2} == "--" ]]; then
err_exit "Invalid value passed to --fontlist"
@ -121,6 +99,16 @@ parse_flags() {
FONTS=( ${ARGV[$j]} )
IFS=$ofs
i=$j ;;
--exposures)  # Expect one or more exposure levels, e.g. --exposures -1 0 1
  exp=""
  # Consume every following argument until the next flag (an argument
  # starting with "--"), an empty argument, or the end of ARGV. Negative
  # levels such as "-1" start with a single dash and are kept.
  # The bash substring test mirrors the --fontlist check and, unlike an
  # unquoted `test`/`echo | cut`, is safe for values that contain spaces,
  # glob characters, or that echo would treat as an option ("-n", "-e").
  while [[ $j -lt ${#ARGV[@]} ]]; do
    [[ -z ${ARGV[$j]} ]] && break
    [[ ${ARGV[$j]:0:2} == "--" ]] && break
    exp="$exp ${ARGV[$j]}"
    j=$((j+1))
  done
  # parse_value receives the collected list; an empty list is passed
  # through unchanged so its own missing-value handling applies.
  parse_value "EXPOSURES" "$exp"
  # j now indexes the first unconsumed argument; step back one so that
  # i again refers to the last argument consumed by this flag.
  i=$((j-1)) ;;
--fonts_dir)
parse_value "FONTS_DIR" ${ARGV[$j]}
i=$j ;;
@ -156,9 +144,6 @@ parse_flags() {
if [[ -z ${LANG_CODE} ]]; then
err_exit "Need to specify a language --lang"
fi
if [[ -z ${BINDIR} ]]; then
err_exit "Need to specify path to built binaries --bin_dir"
fi
if [[ -z ${LANGDATA_ROOT} ]]; then
err_exit "Need to specify path to language files --langdata_dir"
fi
@ -171,8 +156,6 @@ parse_flags() {
fi
fi
set_prog_paths
# Location where intermediate files will be created.
TRAINING_DIR=${WORKSPACE_DIR}/${LANG_CODE}
# Location of log file for the whole run.
@ -200,8 +183,8 @@ initialize_fontconfig() {
export FONT_CONFIG_CACHE=$(mktemp -d --tmpdir font_tmp.XXXXXXXXXX)
local sample_path=${FONT_CONFIG_CACHE}/sample_text.txt
echo "Text" >${sample_path}
run_command ${TEXT2IMAGE_EXE} --fonts_dir=${FONTS_DIR} \
--font="Arial" --outputbase=${sample_path} --text=${sample_path} \
run_command text2image --fonts_dir=${FONTS_DIR} \
--font="${FONTS[0]}" --outputbase=${sample_path} --text=${sample_path} \
--fontconfig_tmpdir=${FONT_CONFIG_CACHE}
}
@ -228,14 +211,14 @@ generate_font_image() {
fi
done
run_command ${TEXT2IMAGE_EXE} ${common_args} --font="${font}" \
run_command text2image ${common_args} --font="${font}" \
--text=${TRAINING_TEXT} ${TEXT2IMAGE_EXTRA_ARGS}
check_file_readable ${outbase}.box ${outbase}.tif
if (( ${EXTRACT_FONT_PROPERTIES} )) &&
[[ -r ${TRAIN_NGRAMS_FILE} ]]; then
tlog "Extracting font properties of ${font}"
run_command ${TEXT2IMAGE_EXE} ${common_args} --font="${font}" \
run_command text2image ${common_args} --font="${font}" \
--ligatures=false --text=${TRAIN_NGRAMS_FILE} \
--only_extract_font_properties --ptsize=32
check_file_readable ${outbase}.fontinfo
@ -254,35 +237,36 @@ phase_I_generate_image() {
err_exit "Could not find training text file ${TRAINING_TEXT}"
fi
CHAR_SPACING="0.0"
EXPOSURE="0"
if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then
# Parse .bigram_freqs file and compose a .train_ngrams file with text
# for tesseract to recognize during training. Take only the ngrams whose
# combined weight accounts for 99% of all the bigrams in the language.
NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \
| awk '{s=s+$2}; END {print (s/100)*p}' p=99)
cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \
| awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \
x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE}
check_file_readable ${TRAIN_NGRAMS_FILE}
fi
local counter=0
for font in "${FONTS[@]}"; do
generate_font_image "${font}" &
let counter=counter+1
let rem=counter%par_factor
if [[ "${rem}" -eq 0 ]]; then
wait
for EXPOSURE in $EXPOSURES; do
if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then
# Parse .bigram_freqs file and compose a .train_ngrams file with text
# for tesseract to recognize during training. Take only the ngrams whose
# combined weight accounts for 99% of all the bigrams in the language.
NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \
| awk '{s=s+$2}; END {print (s/100)*p}' p=99)
cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \
| awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \
x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE}
check_file_readable ${TRAIN_NGRAMS_FILE}
fi
done
wait
# Check that each process was successful.
for font in "${FONTS[@]}"; do
local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
check_file_readable ${outbase}.box ${outbase}.tif
local counter=0
for font in "${FONTS[@]}"; do
generate_font_image "${font}" &
let counter=counter+1
let rem=counter%par_factor
if [[ "${rem}" -eq 0 ]]; then
wait
fi
done
wait
# Check that each process was successful.
for font in "${FONTS[@]}"; do
local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
check_file_readable ${outbase}.box ${outbase}.tif
done
done
}
@ -291,7 +275,7 @@ phase_UP_generate_unicharset() {
tlog "\n=== Phase UP: Generating unicharset and unichar properties files ==="
local box_files=$(ls ${TRAINING_DIR}/*.box)
run_command ${UNICHARSET_EXTRACTOR_EXE} -D "${TRAINING_DIR}/" ${box_files}
run_command unicharset_extractor -D "${TRAINING_DIR}/" ${box_files}
local outfile=${TRAINING_DIR}/unicharset
UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset"
check_file_readable ${outfile}
@ -299,7 +283,7 @@ phase_UP_generate_unicharset() {
XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights"
check_file_readable ${UNICHARSET_FILE}
run_command ${SET_UNICHARSET_PROPERTIES_EXE} \
run_command set_unicharset_properties \
-U ${UNICHARSET_FILE} -O ${UNICHARSET_FILE} -X ${XHEIGHTS_FILE} \
--script_dir=${LANGDATA_ROOT}
check_file_readable ${XHEIGHTS_FILE}
@ -327,7 +311,7 @@ phase_D_generate_dawg() {
if [[ -s ${WORDLIST_FILE} ]]; then
tlog "Generating word Dawg"
check_file_readable ${UNICHARSET_FILE}
run_command ${WORDLIST2DAWG_EXE} -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \
run_command wordlist2dawg -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \
${UNICHARSET_FILE}
check_file_readable ${WORD_DAWG}
@ -339,13 +323,13 @@ phase_D_generate_dawg() {
if [[ -s ${freq_wordlist_file} ]]; then
check_file_readable ${UNICHARSET_FILE}
tlog "Generating frequent-word Dawg"
run_command ${WORDLIST2DAWG_EXE} -r 1 ${freq_wordlist_file} \
run_command wordlist2dawg -r 1 ${freq_wordlist_file} \
${FREQ_DAWG} ${UNICHARSET_FILE}
check_file_readable ${FREQ_DAWG}
fi
# Punctuation DAWG
# -r arguments to WORDLIST2DAWG_EXE denote RTL reverse policy
# -r arguments to wordlist2dawg denote RTL reverse policy
# (see Trie::RTLReversePolicy enum in third_party/tesseract/dict/trie.h).
# We specify 0/RRP_DO_NO_REVERSE when generating number DAWG,
# 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS,
@ -360,20 +344,20 @@ phase_D_generate_dawg() {
PUNC_FILE="${LANGDATA_ROOT}/common.punc"
fi
check_file_readable ${PUNC_FILE}
run_command ${WORDLIST2DAWG_EXE} -r ${punc_reverse_policy} \
run_command wordlist2dawg -r ${punc_reverse_policy} \
${PUNC_FILE} ${PUNC_DAWG} ${UNICHARSET_FILE}
check_file_readable ${PUNC_DAWG}
# Numbers DAWG
if [[ -s ${NUMBERS_FILE} ]]; then
run_command ${WORDLIST2DAWG_EXE} -r 0 \
run_command wordlist2dawg -r 0 \
${NUMBERS_FILE} ${NUMBER_DAWG} ${UNICHARSET_FILE}
check_file_readable ${NUMBER_DAWG}
fi
# Bigram dawg
if [[ -s ${WORD_BIGRAMS_FILE} ]]; then
run_command ${WORDLIST2DAWG_EXE} -r 1 \
run_command wordlist2dawg -r 1 \
${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE}
check_file_readable ${BIGRAM_DAWG}
fi
@ -387,10 +371,9 @@ phase_E_extract_features() {
par_factor=1
fi
tlog "\n=== Phase E: Extracting features ==="
TRAIN_EXPOSURES='0'
local img_files=""
for exposure in ${TRAIN_EXPOSURES}; do
for exposure in ${EXPOSURES}; do
img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif)
done
@ -405,7 +388,7 @@ phase_E_extract_features() {
tlog "Using TESSDATA_PREFIX=${TESSDATA_PREFIX}"
local counter=0
for img_file in ${img_files}; do
run_command ${TESSERACT_EXE} ${img_file} ${img_file%.*} \
run_command tesseract ${img_file} ${img_file%.*} \
${box_config} ${config} &
let counter=counter+1
let rem=counter%par_factor
@ -427,7 +410,7 @@ phase_C_cluster_prototypes() {
tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ==="
local out_normproto=$1
run_command ${CN_TRAINING_EXE} -D "${TRAINING_DIR}/" \
run_command cntraining -D "${TRAINING_DIR}/" \
$(ls ${TRAINING_DIR}/*.tr)
check_file_readable ${TRAINING_DIR}/normproto
@ -447,7 +430,7 @@ phase_S_cluster_shapes() {
font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
fi
run_command ${SHAPE_TRAINING_EXE} \
run_command shapeclustering \
-D "${TRAINING_DIR}/" \
-U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
-O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
@ -468,7 +451,7 @@ phase_M_cluster_microfeatures() {
font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights"
fi
run_command ${MF_TRAINING_EXE} \
run_command mftraining \
-D "${TRAINING_DIR}/" \
-U ${TRAINING_DIR}/${LANG_CODE}.unicharset \
-O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \
@ -528,7 +511,7 @@ make__traineddata() {
fi
# Compose the traineddata file.
run_command ${COMBINE_TESSDATA_EXE} ${TRAINING_DIR}/${LANG_CODE}.
run_command combine_tessdata ${TRAINING_DIR}/${LANG_CODE}.
# Copy it to the output dir, overwriting only if allowed by the cmdline flag.
if [[ ! -d ${OUTPUT_DIR} ]]; then

View File

@ -127,7 +127,7 @@ SVSemaphore::SVSemaphore() {
semaphore_ = CreateSemaphore(0, 0, 10, 0);
#elif defined(__APPLE__)
char name[50];
snprintf(name, sizeof(name), "%d", random());
snprintf(name, sizeof(name), "%ld", random());
sem_unlink(name);
semaphore_ = sem_open(name, O_CREAT , S_IWUSR, 0);
if (semaphore_ == SEM_FAILED) {
@ -296,14 +296,11 @@ static std::string ScrollViewCommand(std::string scrollview_path) {
// this unnecessary.
// Also the path has to be separated by ; on windows and : otherwise.
#ifdef _WIN32
const char* cmd_template = "-Djava.library.path=%s -cp %s/ScrollView.jar;"
"%s/piccolo2d-core-3.0.jar:%s/piccolo2d-extras-3.0.jar"
" com.google.scrollview.ScrollView";
const char* cmd_template = "-Djava.library.path=%s -jar %s/ScrollView.jar";
#else
const char* cmd_template = "-c \"trap 'kill %%1' 0 1 2 ; java "
"-Xms1024m -Xmx2048m -Djava.library.path=%s -cp %s/ScrollView.jar:"
"%s/piccolo2d-core-3.0.jar:%s/piccolo2d-extras-3.0.jar"
" com.google.scrollview.ScrollView"
"-Xms1024m -Xmx2048m -jar %s/ScrollView.jar"
" & wait\"";
#endif
int cmdlen = strlen(cmd_template) + 4*strlen(scrollview_path.c_str()) + 1;
@ -374,7 +371,7 @@ static int GetAddrInfo(const char* hostname, int port,
struct addrinfo** address) {
#if defined(__linux__)
char port_str[40];
snprintf(port_str, 40, "%d", port);
// port is an int, so it must be formatted with %d; %ld expects a long and
// mismatches the argument type on LP64 platforms.
snprintf(port_str, sizeof(port_str), "%d", port);
return getaddrinfo(hostname, port_str, NULL, address);
#else
return GetAddrInfoNonLinux(hostname, port, address);

View File

@ -177,11 +177,11 @@ struct ViterbiStateEntry : public ELIST_LINK {
/// the smallest rating or lower/upper case letters).
LanguageModelFlagsType top_choice_flags;
/// Extra information maintained by Dawg laguage model component
/// Extra information maintained by Dawg language model component
/// (owned by ViterbiStateEntry).
LanguageModelDawgInfo *dawg_info;
/// Extra information maintained by Ngram laguage model component
/// Extra information maintained by Ngram language model component
/// (owned by ViterbiStateEntry).
LanguageModelNgramInfo *ngram_info;

View File

@ -273,7 +273,7 @@ void Wordrec::merge_and_put_fragment_lists(inT16 row, inT16 column,
*
* Recursively go through the ratings matrix to find lists of fragments
* to be merged in the function merge_and_put_fragment_lists.
* current_frag is the postion of the piece we are looking for.
* current_frag is the position of the piece we are looking for.
* current_row is the row in the rating matrix we are currently at.
* start is the row we started initially, so that we can know where
* to append the results to the matrix. num_frag_parts is the total

View File

@ -375,7 +375,7 @@ class Wordrec : public Classify {
inT16 num_blobs);
// Recursively go through the ratings matrix to find lists of fragments
// to be merged in the function merge_and_put_fragment_lists.
// current_frag is the postion of the piece we are looking for.
// current_frag is the position of the piece we are looking for.
// current_row is the row in the rating matrix we are currently at.
// start is the row we started initially, so that we can know where
// to append the results to the matrix. num_frag_parts is the total