mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 14:41:36 +08:00
Preparations for unicodization
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@34 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
4dffd5442c
commit
0a53f8c5bf
@ -20,6 +20,7 @@
|
||||
#include "baseapi.h"
|
||||
|
||||
#include "tessedit.h"
|
||||
#include "ocrclass.h"
|
||||
#include "pageres.h"
|
||||
#include "tessvars.h"
|
||||
#include "control.h"
|
||||
@ -52,7 +53,19 @@ const int kMinRectSize = 10;
|
||||
int TessBaseAPI::Init(const char* datapath, const char* outputbase,
|
||||
const char* configfile, bool numeric_mode,
|
||||
int argc, char* argv[]) {
|
||||
int result = init_tesseract(datapath, outputbase, configfile, argc, argv);
|
||||
return InitWithLanguage(datapath, outputbase, NULL, configfile,
|
||||
numeric_mode, argc, argv);
|
||||
}
|
||||
|
||||
// Start tesseract.
|
||||
// Similar to Init() except that it is possible to specify the language.
|
||||
// Language is the code of the language for which the data will be loaded.
|
||||
// (Codes follow ISO 639-2.) If it is NULL, English (eng) will be loaded.
|
||||
int TessBaseAPI::InitWithLanguage(const char* datapath, const char* outputbase,
|
||||
const char* language, const char* configfile,
|
||||
bool numeric_mode, int argc, char* argv[]) {
|
||||
int result = init_tesseract(datapath, outputbase, language,
|
||||
configfile, argc, argv);
|
||||
bln_numericmode.set_value(numeric_mode);
|
||||
return result;
|
||||
}
|
||||
@ -68,7 +81,7 @@ int TessBaseAPI::Init(const char* datapath, const char* outputbase,
|
||||
// one pixel is WHITE. For binary images set bytes_per_pixel=0.
|
||||
// The recognized text is returned as a char* which (in future will be coded
|
||||
// as UTF8 and) must be freed with the delete [] operator.
|
||||
char* TessBaseAPI::TesseractRect(const UINT8* imagedata,
|
||||
char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
|
||||
int bytes_per_pixel,
|
||||
int bytes_per_line,
|
||||
int left, int top,
|
||||
@ -114,7 +127,7 @@ void TessBaseAPI::DumpPGM(const char* filename) {
|
||||
|
||||
// Copy the given image rectangle to Tesseract, with adaptive thresholding
|
||||
// if the image is not already binary.
|
||||
void TessBaseAPI::CopyImageToTesseract(const UINT8* imagedata,
|
||||
void TessBaseAPI::CopyImageToTesseract(const unsigned char* imagedata,
|
||||
int bytes_per_pixel,
|
||||
int bytes_per_line,
|
||||
int left, int top,
|
||||
@ -147,7 +160,7 @@ void TessBaseAPI::CopyImageToTesseract(const UINT8* imagedata,
|
||||
// hi_values[channel] is 0 or background if 1. A hi_value of -1 indicates
|
||||
// that there is no apparent foreground. At least one hi_value will not be -1.
|
||||
// thresholds and hi_values are assumed to be of bytes_per_pixel size.
|
||||
void TessBaseAPI::OtsuThreshold(const UINT8* imagedata,
|
||||
void TessBaseAPI::OtsuThreshold(const unsigned char* imagedata,
|
||||
int bytes_per_pixel,
|
||||
int bytes_per_line,
|
||||
int left, int top, int right, int bottom,
|
||||
@ -206,16 +219,16 @@ void TessBaseAPI::OtsuThreshold(const UINT8* imagedata,
|
||||
// counted with this call in a multi-channel (pixel-major) image.
|
||||
// Histogram is always a 256 element array to count occurrences of
|
||||
// each pixel value.
|
||||
void TessBaseAPI::HistogramRect(const UINT8* imagedata,
|
||||
void TessBaseAPI::HistogramRect(const unsigned char* imagedata,
|
||||
int bytes_per_pixel,
|
||||
int bytes_per_line,
|
||||
int left, int top, int right, int bottom,
|
||||
int* histogram) {
|
||||
int width = right - left;
|
||||
memset(histogram, 0, sizeof(*histogram) * 256);
|
||||
const UINT8* pix = imagedata +
|
||||
top*bytes_per_line +
|
||||
left*bytes_per_pixel;
|
||||
const unsigned char* pix = imagedata +
|
||||
top*bytes_per_line +
|
||||
left*bytes_per_pixel;
|
||||
for (int y = top; y < bottom; ++y) {
|
||||
for (int x = 0; x < width; ++x) {
|
||||
++histogram[pix[x * bytes_per_pixel]];
|
||||
@ -270,7 +283,7 @@ int TessBaseAPI::OtsuStats(const int* histogram,
|
||||
// Threshold the given grey or color image into the tesseract global
|
||||
// image ready for recognition. Requires thresholds and hi_value
|
||||
// produced by OtsuThreshold above.
|
||||
void TessBaseAPI::ThresholdRect(const UINT8* imagedata,
|
||||
void TessBaseAPI::ThresholdRect(const unsigned char* imagedata,
|
||||
int bytes_per_pixel,
|
||||
int bytes_per_line,
|
||||
int left, int top,
|
||||
@ -283,9 +296,10 @@ void TessBaseAPI::ThresholdRect(const UINT8* imagedata,
|
||||
// For each line in the image, fill the IMAGELINE class and put it into the
|
||||
// Tesseract global page_image. Note that Tesseract stores images with the
|
||||
// bottom at y=0 and 0 is black, so we need 2 kinds of inversion.
|
||||
const UINT8* data = imagedata + top*bytes_per_line + left*bytes_per_pixel;
|
||||
const unsigned char* data = imagedata + top*bytes_per_line +
|
||||
left*bytes_per_pixel;
|
||||
for (int y = height - 1 ; y >= 0; --y) {
|
||||
const UINT8* pix = data;
|
||||
const unsigned char* pix = data;
|
||||
for (int x = 0; x < width; ++x, pix += bytes_per_pixel) {
|
||||
line.pixels[x] = 1;
|
||||
for (int ch = 0; ch < bytes_per_pixel; ++ch) {
|
||||
@ -303,13 +317,13 @@ void TessBaseAPI::ThresholdRect(const UINT8* imagedata,
|
||||
|
||||
// Cut out the requested rectangle of the binary image to the
|
||||
// tesseract global image ready for recognition.
|
||||
void TessBaseAPI::CopyBinaryRect(const UINT8* imagedata,
|
||||
void TessBaseAPI::CopyBinaryRect(const unsigned char* imagedata,
|
||||
int bytes_per_line,
|
||||
int left, int top,
|
||||
int width, int height) {
|
||||
// Copy binary image, cutting out the required rectangle.
|
||||
IMAGE image;
|
||||
image.capture(const_cast<UINT8*>(imagedata),
|
||||
image.capture(const_cast<unsigned char*>(imagedata),
|
||||
bytes_per_line*8, top + height, 1);
|
||||
page_image.create(width, height, 1);
|
||||
copy_sub_image(&image, left, top, width, height, &page_image, 0, 0, false);
|
||||
@ -392,4 +406,3 @@ char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) {
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -22,9 +22,6 @@
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "host.h"
|
||||
#include "ocrclass.h"
|
||||
|
||||
class PAGE_RES;
|
||||
class BLOCK_LIST;
|
||||
|
||||
@ -51,6 +48,14 @@ class TessBaseAPI {
|
||||
const char* configfile, bool numeric_mode,
|
||||
int argc, char* argv[]);
|
||||
|
||||
// Start tesseract.
|
||||
// Similar to Init() except that it is possible to specify the language.
|
||||
// Language is the code of the language for which the data will be loaded.
|
||||
// (Codes follow ISO 639-2.) If it is NULL, English (eng) will be loaded.
|
||||
static int InitWithLanguage(const char* datapath, const char* outputbase,
|
||||
const char* language, const char* configfile,
|
||||
bool numeric_mode, int argc, char* argv[]);
|
||||
|
||||
// Recognize a rectangle from an image and return the result as a string.
|
||||
// May be called many times for a single Init.
|
||||
// Currently has no error checking.
|
||||
@ -62,7 +67,7 @@ class TessBaseAPI {
|
||||
// 1 represents WHITE. For binary images set bytes_per_pixel=0.
|
||||
// The recognized text is returned as a char* which (in future will be coded
|
||||
// as UTF8 and) must be freed with the delete [] operator.
|
||||
static char* TesseractRect(const UINT8* imagedata,
|
||||
static char* TesseractRect(const unsigned char* imagedata,
|
||||
int bytes_per_pixel,
|
||||
int bytes_per_line,
|
||||
int left, int top, int width, int height);
|
||||
@ -80,7 +85,7 @@ class TessBaseAPI {
|
||||
protected:
|
||||
// Copy the given image rectangle to Tesseract, with adaptive thresholding
|
||||
// if the image is not already binary.
|
||||
static void CopyImageToTesseract(const UINT8* imagedata,
|
||||
static void CopyImageToTesseract(const unsigned char* imagedata,
|
||||
int bytes_per_pixel,
|
||||
int bytes_per_line,
|
||||
int left, int top, int width, int height);
|
||||
@ -92,7 +97,7 @@ class TessBaseAPI {
|
||||
// hi_values[channel] is 0 or background if 1. A hi_value of -1 indicates
|
||||
// that there is no apparent foreground. At least one hi_value will not be -1.
|
||||
// thresholds and hi_values are assumed to be of bytes_per_pixel size.
|
||||
static void OtsuThreshold(const UINT8* imagedata,
|
||||
static void OtsuThreshold(const unsigned char* imagedata,
|
||||
int bytes_per_pixel,
|
||||
int bytes_per_line,
|
||||
int left, int top, int right, int bottom,
|
||||
@ -106,7 +111,7 @@ class TessBaseAPI {
|
||||
// counted with this call in a multi-channel (pixel-major) image.
|
||||
// Histogram is always a 256 element array to count occurrences of
|
||||
// each pixel value.
|
||||
static void HistogramRect(const UINT8* imagedata,
|
||||
static void HistogramRect(const unsigned char* imagedata,
|
||||
int bytes_per_pixel,
|
||||
int bytes_per_line,
|
||||
int left, int top, int right, int bottom,
|
||||
@ -122,7 +127,7 @@ class TessBaseAPI {
|
||||
// Threshold the given grey or color image into the tesseract global
|
||||
// image ready for recognition. Requires thresholds and hi_value
|
||||
// produced by OtsuThreshold above.
|
||||
static void ThresholdRect(const UINT8* imagedata,
|
||||
static void ThresholdRect(const unsigned char* imagedata,
|
||||
int bytes_per_pixel,
|
||||
int bytes_per_line,
|
||||
int left, int top,
|
||||
@ -132,7 +137,7 @@ class TessBaseAPI {
|
||||
|
||||
// Cut out the requested rectangle of the binary image to the
|
||||
// tesseract global image ready for recognition.
|
||||
static void CopyBinaryRect(const UINT8* imagedata,
|
||||
static void CopyBinaryRect(const unsigned char* imagedata,
|
||||
int bytes_per_line,
|
||||
int left, int top,
|
||||
int width, int height);
|
||||
@ -145,7 +150,8 @@ class TessBaseAPI {
|
||||
|
||||
// Recognize the tesseract global image and return the result as Tesseract
|
||||
// internal structures.
|
||||
static PAGE_RES* Recognize(BLOCK_LIST* block_list, ETEXT_DESC* monitor);
|
||||
static PAGE_RES* Recognize(BLOCK_LIST* block_list,
|
||||
struct ETEXT_STRUCT* monitor);
|
||||
|
||||
// Convert (and free) the internal data structures into a text string.
|
||||
static char* TesseractToText(PAGE_RES* page_res);
|
||||
|
@ -50,6 +50,8 @@
|
||||
#include "adaptmatch.h"
|
||||
#include "intmatcher.h"
|
||||
#include "chop.h"
|
||||
#include "efio.h"
|
||||
#include "danerror.h"
|
||||
#include "globals.h"
|
||||
|
||||
//extern "C" {
|
||||
@ -96,6 +98,7 @@ ETEXT_DESC *global_monitor = NULL;
|
||||
|
||||
int init_tesseract(const char *arg0,
|
||||
const char *textbase,
|
||||
const char *language,
|
||||
const char *configfile,
|
||||
int configc,
|
||||
const char *const *configv) {
|
||||
@ -116,6 +119,22 @@ int init_tesseract(const char *arg0,
|
||||
strcpy (c_path, datadir.string ());
|
||||
c_path[strlen (c_path) - strlen (m_data_sub_dir.string ())] = '\0';
|
||||
demodir = c_path;
|
||||
|
||||
// Set the language data path prefix
|
||||
language_data_path_prefix = datadir;
|
||||
if (language != NULL) {
|
||||
language_data_path_prefix += language;
|
||||
language_data_path_prefix += ".";
|
||||
}
|
||||
else
|
||||
language_data_path_prefix += "eng.";
|
||||
|
||||
// Load the unichar set
|
||||
STRING unicharpath = language_data_path_prefix;
|
||||
unicharpath += "unicharset";
|
||||
if (!unicharset.load_from_file(unicharpath.string())) {
|
||||
DoError(FOPENERROR, "Unable to open unicharset");
|
||||
}
|
||||
start_recog(configfile, textbase);
|
||||
|
||||
ReliableConfigThreshold = tweak_ReliableConfigThreshold;
|
||||
|
@ -39,6 +39,7 @@ extern ETEXT_DESC *global_monitor;
|
||||
|
||||
int init_tesseract(const char *arg0,
|
||||
const char *textbase,
|
||||
const char *language,
|
||||
const char *configfile,
|
||||
int configc,
|
||||
const char *const *configv);
|
||||
|
@ -67,9 +67,11 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
|
||||
if (argc == 3)
|
||||
TessBaseAPI::Init(argv[0], argv[1], NULL, false, 0, argv + 2);
|
||||
TessBaseAPI::InitWithLanguage(argv[0], argv[1], NULL,
|
||||
NULL, false, 0, argv + 2);
|
||||
else
|
||||
TessBaseAPI::Init(argv[0], argv[1], argv[3], false, argc - 4, argv + 4);
|
||||
TessBaseAPI::InitWithLanguage(argv[0], argv[1], NULL,
|
||||
argv[3], false, argc - 4, argv + 4);
|
||||
|
||||
tprintf ("Tesseract Open Source OCR Engine\n");
|
||||
|
||||
|
@ -26,6 +26,9 @@
|
||||
// at least 4. Must not exceed 31 without changing the coding of length.
|
||||
#define UNICHAR_LEN 4
|
||||
|
||||
// A UNICHAR_ID is the unique id of a unichar.
|
||||
typedef int UNICHAR_ID;
|
||||
|
||||
// The UNICHAR class holds a single classification result. This may be
|
||||
// a single Unicode character (stored as between 1 and 4 utf8 bytes) or
|
||||
// multiple Unicode characters representing the NFKC expansion of a ligature
|
||||
|
@ -53,8 +53,8 @@
|
||||
#endif
|
||||
|
||||
#define ADAPT_TEMPLATE_SUFFIX ".a"
|
||||
#define BUILT_IN_TEMPLATES_FILE "tessdata/inttemp"
|
||||
#define BUILT_IN_CUTOFFS_FILE "tessdata/pffmtable"
|
||||
#define BUILT_IN_TEMPLATES_FILE "inttemp"
|
||||
#define BUILT_IN_CUTOFFS_FILE "pffmtable"
|
||||
|
||||
#define MAX_MATCHES 10
|
||||
#define UNLIKELY_NUM_FEAT 200
|
||||
@ -98,7 +98,7 @@ PROTO_KEY;
|
||||
((Rating) > GreatAdaptiveMatch)
|
||||
|
||||
#define TempConfigReliable(Config) \
|
||||
((Config)->NumTimesSeen > ReliableConfigThreshold)
|
||||
((Config)->NumTimesSeen >= ReliableConfigThreshold)
|
||||
|
||||
#define InitIntFX() (FeaturesHaveBeenExtracted = FALSE)
|
||||
|
||||
@ -197,11 +197,11 @@ int GetIntCharNormFeatures(TBLOB *Blob,
|
||||
|
||||
void InitMatcherRatings(register FLOAT32 *Rating);
|
||||
|
||||
void MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates,
|
||||
CLASS_ID ClassId,
|
||||
int NumFeatures,
|
||||
INT_FEATURE_ARRAY Features,
|
||||
FEATURE_SET FloatFeatures);
|
||||
int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates,
|
||||
CLASS_ID ClassId,
|
||||
int NumFeatures,
|
||||
INT_FEATURE_ARRAY Features,
|
||||
FEATURE_SET FloatFeatures);
|
||||
|
||||
PROTO_ID MakeNewTempProtos (FEATURE_SET Features,
|
||||
int NumBadFeat,
|
||||
@ -429,7 +429,6 @@ void MakeNewAdaptedClass
|
||||
----------------------------------------------------------------------------**/
|
||||
/* name of current image file being processed */
|
||||
extern char imagefile[];
|
||||
//extern char *demodir;
|
||||
INT_VAR (tessedit_single_match, FALSE, "Top choice only from CP");
|
||||
|
||||
//extern "C" int il1_adaption_test; //?
|
||||
@ -449,6 +448,7 @@ static int NumBaselineClassesTried = 0;
|
||||
static int NumCharNormClassesTried = 0;
|
||||
static int NumAmbigClassesTried = 0;
|
||||
static int NumClassesOutput = 0;
|
||||
static int NumAdaptationsFailed = 0;
|
||||
|
||||
/* define globals used to hold onto extracted features. This is used
|
||||
to map from the old scheme in which baseline features and char norm
|
||||
@ -529,11 +529,11 @@ make_float_var (RatingMargin, 0.1, MakeRatingMargin,
|
||||
make_float_var (NoiseBlobLength, 0.6, MakeNoiseBlobLength,
|
||||
18, 11, SetNoiseBlobLength, "Avg. noise blob length: ");
|
||||
|
||||
make_int_var (MinNumPermClasses, 3, MakeMinNumPermClasses,
|
||||
make_int_var (MinNumPermClasses, 1, MakeMinNumPermClasses,
|
||||
18, 12, SetMinNumPermClasses, "Min # of permanent classes: ");
|
||||
/* PREV DEFAULT 200 */
|
||||
|
||||
make_int_var (ReliableConfigThreshold, 2, MakeReliableConfigThreshold,
|
||||
make_int_var (ReliableConfigThreshold, 1, MakeReliableConfigThreshold,
|
||||
18, 13, SetReliableConfigThreshold,
|
||||
"Reliable Config Threshold: ");
|
||||
|
||||
@ -556,6 +556,10 @@ make_float_var (RatingScale, 30.0, MakeRatingScale,
|
||||
make_float_var (CertaintyScale, 20.0, MakeCertaintyScale,
|
||||
18, 18, SetCertaintyScale, "CertaintyScale: ");
|
||||
|
||||
make_int_var (FailedAdaptionsBeforeReset, 150, MakeFailedAdaptionsBeforeReset,
|
||||
18, 19, SetFailedAdaptionsBeforeReset,
|
||||
"Number of failed adaptions before adapted templates reset: ");
|
||||
|
||||
int tess_cn_matching = 0;
|
||||
int tess_bn_matching = 0;
|
||||
|
||||
@ -589,6 +593,11 @@ LIST AdaptiveClassifier(TBLOB *Blob, TBLOB *DotBlob, TEXTROW *Row) {
|
||||
ADAPT_RESULTS Results;
|
||||
LINE_STATS LineStats;
|
||||
|
||||
if (FailedAdaptionsBeforeReset >= 0 &&
|
||||
NumAdaptationsFailed >= FailedAdaptionsBeforeReset) {
|
||||
NumAdaptationsFailed = 0;
|
||||
ResetAdaptiveClassifier();
|
||||
}
|
||||
if (AdaptedTemplates == NULL)
|
||||
AdaptedTemplates = NewAdaptedTemplates ();
|
||||
EnterClassifyMode;
|
||||
@ -672,6 +681,9 @@ void AdaptToWord(TWERD *Word,
|
||||
const char *map = rejmap;
|
||||
char map_char = '1';
|
||||
|
||||
if (strlen(BestChoice) > MAX_ADAPTABLE_WERD_SIZE)
|
||||
return;
|
||||
|
||||
if (EnableLearning) {
|
||||
NumWordsAdaptedTo++;
|
||||
|
||||
@ -826,13 +838,13 @@ void InitAdaptiveClassifier() {
|
||||
*/
|
||||
int i;
|
||||
FILE *File;
|
||||
char Filename[1024];
|
||||
STRING Filename;
|
||||
|
||||
if (!EnableAdaptiveMatcher)
|
||||
return;
|
||||
|
||||
strcpy(Filename, demodir);
|
||||
strcat(Filename, BuiltInTemplatesFile);
|
||||
Filename = language_data_path_prefix;
|
||||
Filename += BuiltInTemplatesFile;
|
||||
#ifndef SECURE_NAMES
|
||||
// cprintf( "\nReading built-in templates from %s ...",
|
||||
// Filename);
|
||||
@ -840,21 +852,22 @@ void InitAdaptiveClassifier() {
|
||||
#endif
|
||||
|
||||
#ifdef __UNIX__
|
||||
File = Efopen (Filename, "r");
|
||||
File = Efopen (Filename.string(), "r");
|
||||
#else
|
||||
File = Efopen (Filename, "rb");
|
||||
File = Efopen (Filename.string(), "rb");
|
||||
#endif
|
||||
PreTrainedTemplates = ReadIntTemplates (File, TRUE);
|
||||
fclose(File);
|
||||
|
||||
strcpy(Filename, demodir);
|
||||
strcat(Filename, BuiltInCutoffsFile);
|
||||
Filename = language_data_path_prefix;
|
||||
Filename += BuiltInCutoffsFile;
|
||||
#ifndef SECURE_NAMES
|
||||
// cprintf( "\nReading built-in pico-feature cutoffs from %s ...",
|
||||
// Filename);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
ReadNewCutoffs (Filename, PreTrainedTemplates->IndexFor, CharNormCutoffs);
|
||||
ReadNewCutoffs (Filename.string(), PreTrainedTemplates->IndexFor,
|
||||
CharNormCutoffs);
|
||||
|
||||
GetNormProtos();
|
||||
|
||||
@ -874,14 +887,14 @@ void InitAdaptiveClassifier() {
|
||||
zero_all_bits (AllConfigsOff, WordsInVectorOfSize (MAX_NUM_CONFIGS));
|
||||
|
||||
if (UsePreAdaptedTemplates) {
|
||||
strcpy(Filename, imagefile);
|
||||
strcat(Filename, ADAPT_TEMPLATE_SUFFIX);
|
||||
File = fopen (Filename, "rb");
|
||||
Filename = imagefile;
|
||||
Filename += ADAPT_TEMPLATE_SUFFIX;
|
||||
File = fopen (Filename.string(), "rb");
|
||||
if (File == NULL)
|
||||
AdaptedTemplates = NewAdaptedTemplates ();
|
||||
else {
|
||||
#ifndef SECURE_NAMES
|
||||
cprintf ("\nReading pre-adapted templates from %s ...", Filename);
|
||||
cprintf ("\nReading pre-adapted templates from %s ...", Filename.string());
|
||||
fflush(stdout);
|
||||
#endif
|
||||
AdaptedTemplates = ReadAdaptedTemplates (File);
|
||||
@ -950,6 +963,7 @@ void InitAdaptiveClassifierVars() {
|
||||
MakeEnableNewAdaptRules();
|
||||
MakeRatingScale();
|
||||
MakeCertaintyScale();
|
||||
MakeFailedAdaptionsBeforeReset();
|
||||
|
||||
InitPicoFXVars();
|
||||
InitOutlineFXVars(); //?
|
||||
@ -1098,8 +1112,9 @@ void MakeNewAdaptedClass(TBLOB *Blob,
|
||||
TempConfigFor (Class, 0) = Config;
|
||||
|
||||
/* this is a kludge to construct cutoffs for adapted templates */
|
||||
BaselineCutoffs[ClassIndex] =
|
||||
CharNormCutoffs[IndexForClassId (PreTrainedTemplates, ClassId)];
|
||||
if (Templates == AdaptedTemplates)
|
||||
BaselineCutoffs[ClassIndex] =
|
||||
CharNormCutoffs[IndexForClassId (PreTrainedTemplates, ClassId)];
|
||||
|
||||
IClass = ClassForClassId (Templates->Templates, ClassId);
|
||||
|
||||
@ -1275,6 +1290,7 @@ void AdaptToChar(TBLOB *Blob,
|
||||
ADAPT_CLASS Class;
|
||||
TEMP_CONFIG TempConfig;
|
||||
FEATURE_SET FloatFeatures;
|
||||
int NewTempConfigId;
|
||||
|
||||
NumCharsAdaptedTo++;
|
||||
if (!LegalClassId (ClassId))
|
||||
@ -1323,11 +1339,17 @@ void AdaptToChar(TBLOB *Blob,
|
||||
if (LearningDebugLevel >= 1)
|
||||
cprintf ("Found poor match to temp config %d = %4.1f%%.\n",
|
||||
IntResult.Config, (1.0 - IntResult.Rating) * 100.0);
|
||||
MakeNewTemporaryConfig(AdaptedTemplates,
|
||||
ClassId,
|
||||
NumFeatures,
|
||||
IntFeatures,
|
||||
FloatFeatures);
|
||||
NewTempConfigId = MakeNewTemporaryConfig(AdaptedTemplates,
|
||||
ClassId,
|
||||
NumFeatures,
|
||||
IntFeatures,
|
||||
FloatFeatures);
|
||||
|
||||
if (NewTempConfigId >= 0 &&
|
||||
TempConfigReliable (TempConfigFor (Class, NewTempConfigId)))
|
||||
MakePermanent (AdaptedTemplates, ClassId, NewTempConfigId,
|
||||
Blob, LineStats);
|
||||
|
||||
if (LearningDebugLevel >= 1) {
|
||||
IntegerMatcher (IClass, AllProtosOn, AllConfigsOn,
|
||||
NumFeatures, NumFeatures, IntFeatures, 0, 0,
|
||||
@ -1630,6 +1652,8 @@ char *BaselineClassifier(TBLOB *Blob,
|
||||
}
|
||||
|
||||
AddNewResult (Results, ClassId, IntResult.Rating, IntResult.Config);
|
||||
if (IntResult.Rating < best_rating)
|
||||
best_rating = IntResult.Rating;
|
||||
}
|
||||
while (i < NumClasses) {
|
||||
ClassId = ClassPrunerResults[i].Class;
|
||||
@ -2185,7 +2209,8 @@ void DoAdaptiveMatch(TBLOB *Blob,
|
||||
*/
|
||||
TBLOB *Blob;
|
||||
|
||||
if (EnableNewAdaptRules) { /* new rules */
|
||||
if (EnableNewAdaptRules && /* new rules */
|
||||
CurrentBestChoiceIs (BestChoice)) {
|
||||
FindClassifierErrors(PerfectRating,
|
||||
GoodAdaptiveMatch,
|
||||
RatingMargin,
|
||||
@ -2608,7 +2633,7 @@ void DoAdaptiveMatch(TBLOB *Blob,
|
||||
} /* InitMatcherRatings */
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
void MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates,
|
||||
int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates,
|
||||
CLASS_ID ClassId,
|
||||
int NumFeatures,
|
||||
INT_FEATURE_ARRAY Features,
|
||||
@ -2633,7 +2658,8 @@ void DoAdaptiveMatch(TBLOB *Blob,
|
||||
** TempProtoMask
|
||||
defines old protos matched in new config
|
||||
** Operation:
|
||||
** Return: none
|
||||
** Return: The id of the new config created, a negative integer in
|
||||
** case of error.
|
||||
** Exceptions: none
|
||||
** History: Fri Mar 15 08:49:46 1991, DSJ, Created.
|
||||
*/
|
||||
@ -2661,14 +2687,18 @@ void DoAdaptiveMatch(TBLOB *Blob,
|
||||
Class = Templates->Class[ClassIndex];
|
||||
|
||||
if (NumIntConfigsIn (IClass) >= MAX_NUM_CONFIGS)
|
||||
return;
|
||||
{
|
||||
++NumAdaptationsFailed;
|
||||
if (LearningDebugLevel >= 1)
|
||||
cprintf ("Cannot make new temporary config: maximum number exceeded.\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
OldMaxProtoId = NumIntProtosIn (IClass) - 1;
|
||||
|
||||
NumOldProtos = FindGoodProtos (IClass, AllProtosOn, AllConfigsOff,
|
||||
BlobLength, NumFeatures, Features,
|
||||
OldProtos, debug_level);
|
||||
NumOldProtos = 0;
|
||||
|
||||
MaskSize = WordsInVectorOfSize (MAX_NUM_PROTOS);
|
||||
zero_all_bits(TempProtoMask, MaskSize);
|
||||
@ -2682,7 +2712,12 @@ void DoAdaptiveMatch(TBLOB *Blob,
|
||||
MaxProtoId = MakeNewTempProtos (FloatFeatures, NumBadFeatures, BadFeatures,
|
||||
IClass, Class, TempProtoMask);
|
||||
if (MaxProtoId == NO_PROTO)
|
||||
return;
|
||||
{
|
||||
++NumAdaptationsFailed;
|
||||
if (LearningDebugLevel >= 1)
|
||||
cprintf ("Cannot make new temp protos: maximum number exceeded.\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
ConfigId = AddIntConfig (IClass);
|
||||
ConvertConfig(TempProtoMask, ConfigId, IClass);
|
||||
@ -2694,6 +2729,7 @@ void DoAdaptiveMatch(TBLOB *Blob,
|
||||
cprintf ("Making new temp config %d using %d old and %d new protos.\n",
|
||||
ConfigId, NumOldProtos, MaxProtoId - OldMaxProtoId);
|
||||
|
||||
return ConfigId;
|
||||
} /* MakeNewTemporaryConfig */
|
||||
|
||||
/*---------------------------------------------------------------------------*/
|
||||
|
@ -29,7 +29,7 @@
|
||||
Public Code
|
||||
----------------------------------------------------------------------------**/
|
||||
/*---------------------------------------------------------------------------*/
|
||||
void ReadNewCutoffs(char *Filename,
|
||||
void ReadNewCutoffs(const char *Filename,
|
||||
CLASS_TO_INDEX ClassMapper,
|
||||
CLASS_CUTOFF_ARRAY Cutoffs) {
|
||||
/*
|
||||
@ -62,6 +62,6 @@ void ReadNewCutoffs(char *Filename,
|
||||
ClassId = Class[0];
|
||||
Cutoffs[ClassMapper[ClassId]] = Cutoff;
|
||||
}
|
||||
fclose(CutoffFile);
|
||||
fclose(CutoffFile);
|
||||
|
||||
} /* ReadNewCutoffs */
|
||||
|
@ -28,7 +28,7 @@ typedef UINT16 CLASS_CUTOFF_ARRAY[MAX_NUM_CLASSES];
|
||||
/**----------------------------------------------------------------------------
|
||||
Public Function Prototypes
|
||||
----------------------------------------------------------------------------**/
|
||||
void ReadNewCutoffs(char *Filename,
|
||||
void ReadNewCutoffs(const char *Filename,
|
||||
CLASS_TO_INDEX ClassMapper,
|
||||
CLASS_CUTOFF_ARRAY Cutoffs);
|
||||
|
||||
|
@ -63,3 +63,7 @@ char *demodir; /*demo home directory */
|
||||
int edgefd; /*edges window */
|
||||
int debugfd; /*debug window fd */
|
||||
FILE *debugfp; /*debug log file */
|
||||
|
||||
UNICHARSET unicharset;
|
||||
|
||||
STRING language_data_path_prefix;
|
||||
|
@ -27,6 +27,8 @@
|
||||
|
||||
#include "tessclas.h"
|
||||
#include "const.h"
|
||||
#include "unicharset.h"
|
||||
#include "strngs.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
@ -43,10 +45,8 @@ extern int acts[MAXPROC]; /*action flags */
|
||||
extern int debugs[MAXPROC]; /*debug flags */
|
||||
extern int plots[MAXPROC]; /*plot flags */
|
||||
extern int corners[4]; /*corners of scan window */
|
||||
extern "C" {
|
||||
extern int optind; /*option index */
|
||||
extern char *optarg; /*option argument */
|
||||
}
|
||||
extern int optind; /*option index */
|
||||
extern char *optarg; /*option argument */
|
||||
/*image file name */
|
||||
extern char imagefile[FILENAMESIZE];
|
||||
/* main directory */
|
||||
@ -64,4 +64,9 @@ extern int acts_ocr;
|
||||
|
||||
extern char *demodir;
|
||||
extern FILE *debugfp; /*debug log file */
|
||||
|
||||
extern UNICHARSET unicharset; /* The UNICHARSET variable that Tesseract uses internally */
|
||||
|
||||
extern STRING language_data_path_prefix;
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user