From 0a53f8c5bfb5246ed3264ca973f7ca346df7f12b Mon Sep 17 00:00:00 2001 From: theraysmith Date: Wed, 16 May 2007 01:18:59 +0000 Subject: [PATCH] Preparations for unicodization git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@34 d0cd1f9f-072b-0410-8dd7-cf729c803f20 --- ccmain/baseapi.cpp | 41 ++++++++++----- ccmain/baseapi.h | 26 ++++++---- ccmain/tessedit.cpp | 19 +++++++ ccmain/tessedit.h | 1 + ccmain/tesseractmain.cpp | 6 ++- ccutil/unichar.h | 3 ++ classify/adaptmatch.cpp | 108 ++++++++++++++++++++++++++------------- classify/cutoffs.cpp | 4 +- classify/cutoffs.h | 2 +- cutil/globals.cpp | 4 ++ cutil/globals.h | 13 +++-- 11 files changed, 158 insertions(+), 69 deletions(-) diff --git a/ccmain/baseapi.cpp b/ccmain/baseapi.cpp index 0d7639876..86934533a 100644 --- a/ccmain/baseapi.cpp +++ b/ccmain/baseapi.cpp @@ -20,6 +20,7 @@ #include "baseapi.h" #include "tessedit.h" +#include "ocrclass.h" #include "pageres.h" #include "tessvars.h" #include "control.h" @@ -52,7 +53,19 @@ const int kMinRectSize = 10; int TessBaseAPI::Init(const char* datapath, const char* outputbase, const char* configfile, bool numeric_mode, int argc, char* argv[]) { - int result = init_tesseract(datapath, outputbase, configfile, argc, argv); + return InitWithLanguage(datapath, outputbase, NULL, configfile, + numeric_mode, argc, argv); +} + +// Start tesseract. +// Similar to Init() except that it is possible to specify the language. +// Language is the code of the language for which the data will be loaded. +// (Codes follow ISO 639-2.) If it is NULL, english (eng) will be loaded. +int TessBaseAPI::InitWithLanguage(const char* datapath, const char* outputbase, + const char* language, const char* configfile, + bool numeric_mode, int argc, char* argv[]) { + int result = init_tesseract(datapath, outputbase, language, + configfile, argc, argv); bln_numericmode.set_value(numeric_mode); return result; } @@ -68,7 +81,7 @@ int TessBaseAPI::Init(const char* datapath, const char* outputbase, // one pixel is WHITE. For binary images set bytes_per_pixel=0. // The recognized text is returned as a char* which (in future will be coded // as UTF8 and) must be freed with the delete [] operator. -char* TessBaseAPI::TesseractRect(const UINT8* imagedata, +char* TessBaseAPI::TesseractRect(const unsigned char* imagedata, int bytes_per_pixel, int bytes_per_line, int left, int top, @@ -114,7 +127,7 @@ void TessBaseAPI::DumpPGM(const char* filename) { // Copy the given image rectangle to Tesseract, with adaptive thresholding // if the image is not already binary. -void TessBaseAPI::CopyImageToTesseract(const UINT8* imagedata, +void TessBaseAPI::CopyImageToTesseract(const unsigned char* imagedata, int bytes_per_pixel, int bytes_per_line, int left, int top, @@ -147,7 +160,7 @@ void TessBaseAPI::CopyImageToTesseract(const UINT8* imagedata, // hi_values[channel] is 0 or background if 1. A hi_value of -1 indicates // that there is no apparent foreground. At least one hi_value will not be -1. // thresholds and hi_values are assumed to be of bytes_per_pixel size. -void TessBaseAPI::OtsuThreshold(const UINT8* imagedata, +void TessBaseAPI::OtsuThreshold(const unsigned char* imagedata, int bytes_per_pixel, int bytes_per_line, int left, int top, int right, int bottom, @@ -206,16 +219,16 @@ void TessBaseAPI::OtsuThreshold(const UINT8* imagedata, // counted with this call in a multi-channel (pixel-major) image. // Histogram is always a 256 element array to count occurrences of // each pixel value. -void TessBaseAPI::HistogramRect(const UINT8* imagedata, +void TessBaseAPI::HistogramRect(const unsigned char* imagedata, int bytes_per_pixel, int bytes_per_line, int left, int top, int right, int bottom, int* histogram) { int width = right - left; memset(histogram, 0, sizeof(*histogram) * 256); - const UINT8* pix = imagedata + - top*bytes_per_line + - left*bytes_per_pixel; + const unsigned char* pix = imagedata + + top*bytes_per_line + + left*bytes_per_pixel; for (int y = top; y < bottom; ++y) { for (int x = 0; x < width; ++x) { ++histogram[pix[x * bytes_per_pixel]]; @@ -270,7 +283,7 @@ int TessBaseAPI::OtsuStats(const int* histogram, // Threshold the given grey or color image into the tesseract global // image ready for recognition. Requires thresholds and hi_value // produced by OtsuThreshold above. -void TessBaseAPI::ThresholdRect(const UINT8* imagedata, +void TessBaseAPI::ThresholdRect(const unsigned char* imagedata, int bytes_per_pixel, int bytes_per_line, int left, int top, @@ -283,9 +296,10 @@ void TessBaseAPI::ThresholdRect(const UINT8* imagedata, // For each line in the image, fill the IMAGELINE class and put it into the // Tesseract global page_image. Note that Tesseract stores images with the // bottom at y=0 and 0 is black, so we need 2 kinds of inversion. - const UINT8* data = imagedata + top*bytes_per_line + left*bytes_per_pixel; + const unsigned char* data = imagedata + top*bytes_per_line + + left*bytes_per_pixel; for (int y = height - 1 ; y >= 0; --y) { - const UINT8* pix = data; + const unsigned char* pix = data; for (int x = 0; x < width; ++x, pix += bytes_per_pixel) { line.pixels[x] = 1; for (int ch = 0; ch < bytes_per_pixel; ++ch) { @@ -303,13 +317,13 @@ void TessBaseAPI::ThresholdRect(const UINT8* imagedata, // Cut out the requested rectangle of the binary image to the // tesseract global image ready for recognition. -void TessBaseAPI::CopyBinaryRect(const UINT8* imagedata, +void TessBaseAPI::CopyBinaryRect(const unsigned char* imagedata, int bytes_per_line, int left, int top, int width, int height) { // Copy binary image, cutting out the required rectangle. IMAGE image; - image.capture(const_cast(imagedata), + image.capture(const_cast(imagedata), bytes_per_line*8, top + height, 1); page_image.create(width, height, 1); copy_sub_image(&image, left, top, width, height, &page_image, 0, 0, false); @@ -392,4 +406,3 @@ char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) { } return NULL; } - diff --git a/ccmain/baseapi.h b/ccmain/baseapi.h index 591d415d8..cdb8b251b 100644 --- a/ccmain/baseapi.h +++ b/ccmain/baseapi.h @@ -22,9 +22,6 @@ #include -#include "host.h" -#include "ocrclass.h" - class PAGE_RES; class BLOCK_LIST; @@ -51,6 +48,14 @@ class TessBaseAPI { const char* configfile, bool numeric_mode, int argc, char* argv[]); + // Start tesseract. + // Similar to Init() except that it is possible to specify the language. + // Language is the code of the language for which the data will be loaded. + // (Codes follow ISO 639-2.) If it is NULL, english (eng) will be loaded. + static int InitWithLanguage(const char* datapath, const char* outputbase, + const char* language, const char* configfile, + bool numeric_mode, int argc, char* argv[]); + // Recognize a rectangle from an image and return the result as a string. // May be called many times for a single Init. // Currently has no error checking. @@ -62,7 +67,7 @@ class TessBaseAPI { // 1 represents WHITE. For binary images set bytes_per_pixel=0. // The recognized text is returned as a char* which (in future will be coded // as UTF8 and) must be freed with the delete [] operator. - static char* TesseractRect(const UINT8* imagedata, + static char* TesseractRect(const unsigned char* imagedata, int bytes_per_pixel, int bytes_per_line, int left, int top, int width, int height); @@ -80,7 +85,7 @@ class TessBaseAPI { protected: // Copy the given image rectangle to Tesseract, with adaptive thresholding // if the image is not already binary. - static void CopyImageToTesseract(const UINT8* imagedata, + static void CopyImageToTesseract(const unsigned char* imagedata, int bytes_per_pixel, int bytes_per_line, int left, int top, int width, int height); @@ -92,7 +97,7 @@ class TessBaseAPI { // hi_values[channel] is 0 or background if 1. A hi_value of -1 indicates // that there is no apparent foreground. At least one hi_value will not be -1. // thresholds and hi_values are assumed to be of bytes_per_pixel size. - static void OtsuThreshold(const UINT8* imagedata, + static void OtsuThreshold(const unsigned char* imagedata, int bytes_per_pixel, int bytes_per_line, int left, int top, int right, int bottom, @@ -106,7 +111,7 @@ class TessBaseAPI { // counted with this call in a multi-channel (pixel-major) image. // Histogram is always a 256 element array to count occurrences of // each pixel value. - static void HistogramRect(const UINT8* imagedata, + static void HistogramRect(const unsigned char* imagedata, int bytes_per_pixel, int bytes_per_line, int left, int top, int right, int bottom, @@ -122,7 +127,7 @@ class TessBaseAPI { // Threshold the given grey or color image into the tesseract global // image ready for recognition. Requires thresholds and hi_value // produced by OtsuThreshold above. - static void ThresholdRect(const UINT8* imagedata, + static void ThresholdRect(const unsigned char* imagedata, int bytes_per_pixel, int bytes_per_line, int left, int top, @@ -132,7 +137,7 @@ class TessBaseAPI { // Cut out the requested rectangle of the binary image to the // tesseract global image ready for recognition. - static void CopyBinaryRect(const UINT8* imagedata, + static void CopyBinaryRect(const unsigned char* imagedata, int bytes_per_line, int left, int top, int width, int height); @@ -145,7 +150,8 @@ class TessBaseAPI { // Recognize the tesseract global image and return the result as Tesseract // internal structures. - static PAGE_RES* Recognize(BLOCK_LIST* block_list, ETEXT_DESC* monitor); + static PAGE_RES* Recognize(BLOCK_LIST* block_list, + struct ETEXT_STRUCT* monitor); // Convert (and free) the internal data structures into a text string. static char* TesseractToText(PAGE_RES* page_res); diff --git a/ccmain/tessedit.cpp b/ccmain/tessedit.cpp index 4d6f2d062..6c8f11088 100644 --- a/ccmain/tessedit.cpp +++ b/ccmain/tessedit.cpp @@ -50,6 +50,8 @@ #include "adaptmatch.h" #include "intmatcher.h" #include "chop.h" +#include "efio.h" +#include "danerror.h" #include "globals.h" //extern "C" { @@ -96,6 +98,7 @@ ETEXT_DESC *global_monitor = NULL; int init_tesseract(const char *arg0, const char *textbase, + const char *language, const char *configfile, int configc, const char *const *configv) { @@ -116,6 +119,22 @@ int init_tesseract(const char *arg0, strcpy (c_path, datadir.string ()); c_path[strlen (c_path) - strlen (m_data_sub_dir.string ())] = '\0'; demodir = c_path; + + // Set the language data path prefix + language_data_path_prefix = datadir; + if (language != NULL) { + language_data_path_prefix += language; + language_data_path_prefix += "."; + } + else + language_data_path_prefix += "eng."; + + // Load the unichar set + STRING unicharpath = language_data_path_prefix; + unicharpath += "unicharset"; + if (!unicharset.load_from_file(unicharpath.string())) { + DoError(FOPENERROR, "Unable to open unicharset"); + } start_recog(configfile, textbase); ReliableConfigThreshold = tweak_ReliableConfigThreshold; diff --git a/ccmain/tessedit.h b/ccmain/tessedit.h index e1f6299ed..ae98f7887 100644 --- a/ccmain/tessedit.h +++ b/ccmain/tessedit.h @@ -39,6 +39,7 @@ extern ETEXT_DESC *global_monitor; int init_tesseract(const char *arg0, const char *textbase, + const char *language, const char *configfile, int configc, const char *const *configv); diff --git a/ccmain/tesseractmain.cpp b/ccmain/tesseractmain.cpp index 8d8b22395..865f2df7e 100644 --- a/ccmain/tesseractmain.cpp +++ b/ccmain/tesseractmain.cpp @@ -67,9 +67,11 @@ int main(int argc, char **argv) { } if (argc == 3) - TessBaseAPI::Init(argv[0], argv[1], NULL, false, 0, argv + 2); + TessBaseAPI::InitWithLanguage(argv[0], argv[1], NULL, + NULL, false, 0, argv + 2); else - TessBaseAPI::Init(argv[0], argv[1], argv[3], false, argc - 4, argv + 4); + TessBaseAPI::InitWithLanguage(argv[0], argv[1], NULL, + argv[3], false, argc - 4, argv + 4); tprintf ("Tesseract Open Source OCR Engine\n"); diff --git a/ccutil/unichar.h b/ccutil/unichar.h index b73d40161..0804b70cf 100644 --- a/ccutil/unichar.h +++ b/ccutil/unichar.h @@ -26,6 +26,9 @@ // at least 4. Must not exceed 31 without changing the coding of length. #define UNICHAR_LEN 4 +// A UNICHAR_ID is the unique id of a unichar. +typedef int UNICHAR_ID; + // The UNICHAR class holds a single classification result. This may be // a single Unicode character (stored as between 1 and 4 utf8 bytes) or // multple Unicode characters representing the NFKC expansion of a ligature diff --git a/classify/adaptmatch.cpp b/classify/adaptmatch.cpp index 360ae9377..7a4cbf76c 100644 --- a/classify/adaptmatch.cpp +++ b/classify/adaptmatch.cpp @@ -53,8 +53,8 @@ #endif #define ADAPT_TEMPLATE_SUFFIX ".a" -#define BUILT_IN_TEMPLATES_FILE "tessdata/inttemp" -#define BUILT_IN_CUTOFFS_FILE "tessdata/pffmtable" +#define BUILT_IN_TEMPLATES_FILE "inttemp" +#define BUILT_IN_CUTOFFS_FILE "pffmtable" #define MAX_MATCHES 10 #define UNLIKELY_NUM_FEAT 200 @@ -98,7 +98,7 @@ PROTO_KEY; ((Rating) > GreatAdaptiveMatch) #define TempConfigReliable(Config) \ -((Config)->NumTimesSeen > ReliableConfigThreshold) +((Config)->NumTimesSeen >= ReliableConfigThreshold) #define InitIntFX() (FeaturesHaveBeenExtracted = FALSE) @@ -197,11 +197,11 @@ int GetIntCharNormFeatures(TBLOB *Blob, void InitMatcherRatings(register FLOAT32 *Rating); -void MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates, - CLASS_ID ClassId, - int NumFeatures, - INT_FEATURE_ARRAY Features, - FEATURE_SET FloatFeatures); +int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates, + CLASS_ID ClassId, + int NumFeatures, + INT_FEATURE_ARRAY Features, + FEATURE_SET FloatFeatures); PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, @@ -429,7 +429,6 @@ void MakeNewAdaptedClass ----------------------------------------------------------------------------**/ /* name of current image file being processed */ extern char imagefile[]; -//extern char *demodir; INT_VAR (tessedit_single_match, FALSE, "Top choice only from CP"); //extern "C" int il1_adaption_test; //? @@ -449,6 +448,7 @@ static int NumBaselineClassesTried = 0; static int NumCharNormClassesTried = 0; static int NumAmbigClassesTried = 0; static int NumClassesOutput = 0; +static int NumAdaptationsFailed = 0; /* define globals used to hold onto extracted features. This is used to map from the old scheme in which baseline features and char norm @@ -529,11 +529,11 @@ make_float_var (RatingMargin, 0.1, MakeRatingMargin, make_float_var (NoiseBlobLength, 0.6, MakeNoiseBlobLength, 18, 11, SetNoiseBlobLength, "Avg. noise blob length: "); -make_int_var (MinNumPermClasses, 3, MakeMinNumPermClasses, +make_int_var (MinNumPermClasses, 1, MakeMinNumPermClasses, 18, 12, SetMinNumPermClasses, "Min # of permanent classes: "); /* PREV DEFAULT 200 */ -make_int_var (ReliableConfigThreshold, 2, MakeReliableConfigThreshold, +make_int_var (ReliableConfigThreshold, 1, MakeReliableConfigThreshold, 18, 13, SetReliableConfigThreshold, "Reliable Config Threshold: "); @@ -556,6 +556,10 @@ make_float_var (RatingScale, 30.0, MakeRatingScale, make_float_var (CertaintyScale, 20.0, MakeCertaintyScale, 18, 18, SetCertaintyScale, "CertaintyScale: "); +make_int_var (FailedAdaptionsBeforeReset, 150, MakeFailedAdaptionsBeforeReset, +18, 19, SetFailedAdaptionsBeforeReset, +"Number of failed adaptions before adapted templates reset: "); + int tess_cn_matching = 0; int tess_bn_matching = 0; @@ -589,6 +593,11 @@ LIST AdaptiveClassifier(TBLOB *Blob, TBLOB *DotBlob, TEXTROW *Row) { ADAPT_RESULTS Results; LINE_STATS LineStats; + if (FailedAdaptionsBeforeReset >= 0 && + NumAdaptationsFailed >= FailedAdaptionsBeforeReset) { + NumAdaptationsFailed = 0; + ResetAdaptiveClassifier(); + } if (AdaptedTemplates == NULL) AdaptedTemplates = NewAdaptedTemplates (); EnterClassifyMode; @@ -672,6 +681,9 @@ void AdaptToWord(TWERD *Word, const char *map = rejmap; char map_char = '1'; + if (strlen(BestChoice) > MAX_ADAPTABLE_WERD_SIZE) + return; + if (EnableLearning) { NumWordsAdaptedTo++; @@ -826,13 +838,13 @@ void InitAdaptiveClassifier() { */ int i; FILE *File; - char Filename[1024]; + STRING Filename; if (!EnableAdaptiveMatcher) return; - strcpy(Filename, demodir); - strcat(Filename, BuiltInTemplatesFile); + Filename = language_data_path_prefix; + Filename += BuiltInTemplatesFile; #ifndef SECURE_NAMES // cprintf( "\nReading built-in templates from %s ...", // Filename); @@ -840,21 +852,22 @@ void InitAdaptiveClassifier() { #endif #ifdef __UNIX__ - File = Efopen (Filename, "r"); + File = Efopen (Filename.string(), "r"); #else - File = Efopen (Filename, "rb"); + File = Efopen (Filename.string(), "rb"); #endif PreTrainedTemplates = ReadIntTemplates (File, TRUE); fclose(File); - strcpy(Filename, demodir); - strcat(Filename, BuiltInCutoffsFile); + Filename = language_data_path_prefix; + Filename += BuiltInCutoffsFile; #ifndef SECURE_NAMES // cprintf( "\nReading built-in pico-feature cutoffs from %s ...", // Filename); fflush(stdout); #endif - ReadNewCutoffs (Filename, PreTrainedTemplates->IndexFor, CharNormCutoffs); + ReadNewCutoffs (Filename.string(), PreTrainedTemplates->IndexFor, + CharNormCutoffs); GetNormProtos(); @@ -874,14 +887,14 @@ void InitAdaptiveClassifier() { zero_all_bits (AllConfigsOff, WordsInVectorOfSize (MAX_NUM_CONFIGS)); if (UsePreAdaptedTemplates) { - strcpy(Filename, imagefile); - strcat(Filename, ADAPT_TEMPLATE_SUFFIX); - File = fopen (Filename, "rb"); + Filename = imagefile; + Filename += ADAPT_TEMPLATE_SUFFIX; + File = fopen (Filename.string(), "rb"); if (File == NULL) AdaptedTemplates = NewAdaptedTemplates (); else { #ifndef SECURE_NAMES - cprintf ("\nReading pre-adapted templates from %s ...", Filename); + cprintf ("\nReading pre-adapted templates from %s ...", Filename.string()); fflush(stdout); #endif AdaptedTemplates = ReadAdaptedTemplates (File); @@ -950,6 +963,7 @@ void InitAdaptiveClassifierVars() { MakeEnableNewAdaptRules(); MakeRatingScale(); MakeCertaintyScale(); + MakeFailedAdaptionsBeforeReset(); InitPicoFXVars(); InitOutlineFXVars(); //? @@ -1098,8 +1112,9 @@ void MakeNewAdaptedClass(TBLOB *Blob, TempConfigFor (Class, 0) = Config; /* this is a kludge to construct cutoffs for adapted templates */ - BaselineCutoffs[ClassIndex] = - CharNormCutoffs[IndexForClassId (PreTrainedTemplates, ClassId)]; + if (Templates == AdaptedTemplates) + BaselineCutoffs[ClassIndex] = + CharNormCutoffs[IndexForClassId (PreTrainedTemplates, ClassId)]; IClass = ClassForClassId (Templates->Templates, ClassId); @@ -1275,6 +1290,7 @@ void AdaptToChar(TBLOB *Blob, ADAPT_CLASS Class; TEMP_CONFIG TempConfig; FEATURE_SET FloatFeatures; + int NewTempConfigId; NumCharsAdaptedTo++; if (!LegalClassId (ClassId)) @@ -1323,11 +1339,17 @@ void AdaptToChar(TBLOB *Blob, if (LearningDebugLevel >= 1) cprintf ("Found poor match to temp config %d = %4.1f%%.\n", IntResult.Config, (1.0 - IntResult.Rating) * 100.0); - MakeNewTemporaryConfig(AdaptedTemplates, - ClassId, - NumFeatures, - IntFeatures, - FloatFeatures); + NewTempConfigId = MakeNewTemporaryConfig(AdaptedTemplates, + ClassId, + NumFeatures, + IntFeatures, + FloatFeatures); + + if (NewTempConfigId >= 0 && + TempConfigReliable (TempConfigFor (Class, NewTempConfigId))) + MakePermanent (AdaptedTemplates, ClassId, NewTempConfigId, + Blob, LineStats); + if (LearningDebugLevel >= 1) { IntegerMatcher (IClass, AllProtosOn, AllConfigsOn, NumFeatures, NumFeatures, IntFeatures, 0, 0, @@ -1630,6 +1652,8 @@ char *BaselineClassifier(TBLOB *Blob, } AddNewResult (Results, ClassId, IntResult.Rating, IntResult.Config); + if (IntResult.Rating < best_rating) + best_rating = IntResult.Rating; } while (i < NumClasses) { ClassId = ClassPrunerResults[i].Class; @@ -2185,7 +2209,8 @@ void DoAdaptiveMatch(TBLOB *Blob, */ TBLOB *Blob; - if (EnableNewAdaptRules) { /* new rules */ + if (EnableNewAdaptRules && /* new rules */ + CurrentBestChoiceIs (BestChoice)) { FindClassifierErrors(PerfectRating, GoodAdaptiveMatch, RatingMargin, @@ -2608,7 +2633,7 @@ void DoAdaptiveMatch(TBLOB *Blob, } /* InitMatcherRatings */ /*---------------------------------------------------------------------------*/ - void MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates, + int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int NumFeatures, INT_FEATURE_ARRAY Features, @@ -2633,7 +2658,8 @@ void DoAdaptiveMatch(TBLOB *Blob, ** TempProtoMask defines old protos matched in new config ** Operation: - ** Return: none + ** Return: The id of the new config created, a negative integer in + ** case of error. ** Exceptions: none ** History: Fri Mar 15 08:49:46 1991, DSJ, Created. */ @@ -2661,14 +2687,18 @@ void DoAdaptiveMatch(TBLOB *Blob, Class = Templates->Class[ClassIndex]; if (NumIntConfigsIn (IClass) >= MAX_NUM_CONFIGS) - return; + { + ++NumAdaptationsFailed; + if (LearningDebugLevel >= 1) + cprintf ("Cannot make new temporary config: maximum number exceeded.\n"); + return -1; + } OldMaxProtoId = NumIntProtosIn (IClass) - 1; NumOldProtos = FindGoodProtos (IClass, AllProtosOn, AllConfigsOff, BlobLength, NumFeatures, Features, OldProtos, debug_level); - NumOldProtos = 0; MaskSize = WordsInVectorOfSize (MAX_NUM_PROTOS); zero_all_bits(TempProtoMask, MaskSize); @@ -2682,7 +2712,12 @@ void DoAdaptiveMatch(TBLOB *Blob, MaxProtoId = MakeNewTempProtos (FloatFeatures, NumBadFeatures, BadFeatures, IClass, Class, TempProtoMask); if (MaxProtoId == NO_PROTO) - return; + { + ++NumAdaptationsFailed; + if (LearningDebugLevel >= 1) + cprintf ("Cannot make new temp protos: maximum number exceeded.\n"); + return -1; + } ConfigId = AddIntConfig (IClass); ConvertConfig(TempProtoMask, ConfigId, IClass); @@ -2694,6 +2729,7 @@ void DoAdaptiveMatch(TBLOB *Blob, cprintf ("Making new temp config %d using %d old and %d new protos.\n", ConfigId, NumOldProtos, MaxProtoId - OldMaxProtoId); + return ConfigId; } /* MakeNewTemporaryConfig */ /*---------------------------------------------------------------------------*/ diff --git a/classify/cutoffs.cpp b/classify/cutoffs.cpp index b76d55de1..6e15da366 100644 --- a/classify/cutoffs.cpp +++ b/classify/cutoffs.cpp @@ -29,7 +29,7 @@ Public Code ----------------------------------------------------------------------------**/ /*---------------------------------------------------------------------------*/ -void ReadNewCutoffs(char *Filename, +void ReadNewCutoffs(const char *Filename, CLASS_TO_INDEX ClassMapper, CLASS_CUTOFF_ARRAY Cutoffs) { /* @@ -62,6 +62,6 @@ void ReadNewCutoffs(char *Filename, ClassId = Class[0]; Cutoffs[ClassMapper[ClassId]] = Cutoff; } - fclose(CutoffFile); + fclose(CutoffFile); } /* ReadNewCutoffs */ diff --git a/classify/cutoffs.h b/classify/cutoffs.h index 49b46015e..96f41c6b0 100644 --- a/classify/cutoffs.h +++ b/classify/cutoffs.h @@ -28,7 +28,7 @@ typedef UINT16 CLASS_CUTOFF_ARRAY[MAX_NUM_CLASSES]; /**---------------------------------------------------------------------------- Public Function Prototypes ----------------------------------------------------------------------------**/ -void ReadNewCutoffs(char *Filename, +void ReadNewCutoffs(const char *Filename, CLASS_TO_INDEX ClassMapper, CLASS_CUTOFF_ARRAY Cutoffs); diff --git a/cutil/globals.cpp b/cutil/globals.cpp index 22f8d96a4..9d554ee97 100644 --- a/cutil/globals.cpp +++ b/cutil/globals.cpp @@ -63,3 +63,7 @@ char *demodir; /*demo home directory */ int edgefd; /*edges window */ int debugfd; /*debug window fd */ FILE *debugfp; /*debug log file */ + +UNICHARSET unicharset; + +STRING language_data_path_prefix; diff --git a/cutil/globals.h b/cutil/globals.h index 18bd0637e..f737a5dff 100644 --- a/cutil/globals.h +++ b/cutil/globals.h @@ -27,6 +27,8 @@ #include "tessclas.h" #include "const.h" +#include "unicharset.h" +#include "strngs.h" #include @@ -43,10 +45,8 @@ extern int acts[MAXPROC]; /*action flags */ extern int debugs[MAXPROC]; /*debug flags */ extern int plots[MAXPROC]; /*plot flags */ extern int corners[4]; /*corners of scan window */ -extern "C" { - extern int optind; /*option index */ - extern char *optarg; /*option argument */ -} +extern int optind; /*option index */ +extern char *optarg; /*option argument */ /*image file name */ extern char imagefile[FILENAMESIZE]; /* main directory */ @@ -64,4 +64,9 @@ extern int acts_ocr; extern char *demodir; extern FILE *debugfp; /*debug log file */ + +extern UNICHARSET unicharset; /* The UNICHARSET variable that Tesseract uses internally */ + +extern STRING language_data_path_prefix; + #endif