Changes to ccutil for 3.00

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@305 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
theraysmith 2009-07-11 02:50:24 +00:00
parent b47efd2cc4
commit d8b1456dd5
32 changed files with 4234 additions and 261 deletions

View File

@ -1,19 +1,30 @@
SUBDIRS =
AM_CXXFLAGS = -DTESSDATA_PREFIX=@datadir@/
EXTRA_DIST = ccutil.vcproj mfcpch.cpp scanutils.cpp scanutils.h
include_HEADERS = \
basedir.h bits16.h clst.h debugwin.h elst2.h elst.h errcode.h \
fileerr.h tessopt.h globaloc.h hashfn.h host.h hosthplb.h lsterr.h \
mainblk.h memblk.h memryerr.h memry.h mfcpch.h ndminx.h notdll.h \
nwmain.h ocrclass.h ocrshell.h platform.h secname.h serialis.h \
stderr.h strngs.h tessclas.h tprintf.h varable.h \
mfcpch.cpp scanutils.cpp scanutils.h unichar.h \
unicharmap.h unicharset.h boxread.h
ambigs.h basedir.h bits16.h boxread.h \
callback.h ccutil.h clst.h \
debugwin.h elst2.h elst.h errcode.h \
fileerr.h genericvector.h globaloc.h \
hashfn.h helpers.h host.h hosthplb.h lsterr.h \
mainblk.h memblk.h memry.h memryerr.h mfcpch.h \
ndminx.h notdll.h nwmain.h \
ocrclass.h ocrshell.h platform.h qrsequence.h \
secname.h serialis.h stderr.h strngs.h \
tessclas.h tessdatamanager.h tessopt.h tordvars.h tprintf.h \
unichar.h unicharmap.h unicharset.h unicity_table.h \
varable.h
lib_LIBRARIES = libtesseract_ccutil.a
libtesseract_ccutil_a_SOURCES = \
basedir.cpp bits16.cpp clst.cpp debugwin.cpp elst.cpp \
elst2.cpp errcode.cpp globaloc.cpp hashfn.cpp mainblk.cpp \
memblk.cpp memry.cpp ocrshell.cpp serialis.cpp strngs.cpp \
tprintf.cpp varable.cpp unichar.cpp tessopt.cpp \
unicharmap.cpp unicharset.cpp boxread.cpp
ambigs.cpp basedir.cpp bits16.cpp boxread.cpp \
ccutil.cpp clst.cpp debugwin.cpp \
elst2.cpp elst.cpp errcode.cpp \
globaloc.cpp hashfn.cpp \
mainblk.cpp memblk.cpp memry.cpp ocrshell.cpp \
serialis.cpp strngs.cpp \
tessdatamanager.cpp tessopt.cpp tordvars.cpp tprintf.cpp \
unichar.cpp unicharmap.cpp unicharset.cpp \
varable.cpp

View File

@ -57,14 +57,15 @@ AR = ar
ARFLAGS = cru
libtesseract_ccutil_a_AR = $(AR) $(ARFLAGS)
libtesseract_ccutil_a_LIBADD =
am_libtesseract_ccutil_a_OBJECTS = basedir.$(OBJEXT) bits16.$(OBJEXT) \
clst.$(OBJEXT) debugwin.$(OBJEXT) elst.$(OBJEXT) \
elst2.$(OBJEXT) errcode.$(OBJEXT) globaloc.$(OBJEXT) \
am_libtesseract_ccutil_a_OBJECTS = ambigs.$(OBJEXT) basedir.$(OBJEXT) \
bits16.$(OBJEXT) boxread.$(OBJEXT) ccutil.$(OBJEXT) \
clst.$(OBJEXT) debugwin.$(OBJEXT) elst2.$(OBJEXT) \
elst.$(OBJEXT) errcode.$(OBJEXT) globaloc.$(OBJEXT) \
hashfn.$(OBJEXT) mainblk.$(OBJEXT) memblk.$(OBJEXT) \
memry.$(OBJEXT) ocrshell.$(OBJEXT) serialis.$(OBJEXT) \
strngs.$(OBJEXT) tprintf.$(OBJEXT) varable.$(OBJEXT) \
unichar.$(OBJEXT) tessopt.$(OBJEXT) unicharmap.$(OBJEXT) \
unicharset.$(OBJEXT) boxread.$(OBJEXT)
strngs.$(OBJEXT) tessdatamanager.$(OBJEXT) tessopt.$(OBJEXT) \
tordvars.$(OBJEXT) tprintf.$(OBJEXT) unichar.$(OBJEXT) \
unicharmap.$(OBJEXT) unicharset.$(OBJEXT) varable.$(OBJEXT)
libtesseract_ccutil_a_OBJECTS = $(am_libtesseract_ccutil_a_OBJECTS)
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
depcomp = $(SHELL) $(top_srcdir)/config/depcomp
@ -196,22 +197,32 @@ top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
SUBDIRS =
AM_CXXFLAGS = -DTESSDATA_PREFIX=@datadir@/
EXTRA_DIST = ccutil.vcproj mfcpch.cpp scanutils.cpp scanutils.h
include_HEADERS = \
basedir.h bits16.h clst.h debugwin.h elst2.h elst.h errcode.h \
fileerr.h tessopt.h globaloc.h hashfn.h host.h hosthplb.h lsterr.h \
mainblk.h memblk.h memryerr.h memry.h mfcpch.h ndminx.h notdll.h \
nwmain.h ocrclass.h ocrshell.h platform.h secname.h serialis.h \
stderr.h strngs.h tessclas.h tprintf.h varable.h \
mfcpch.cpp scanutils.cpp scanutils.h unichar.h \
unicharmap.h unicharset.h boxread.h
ambigs.h basedir.h bits16.h boxread.h \
callback.h ccutil.h clst.h \
debugwin.h elst2.h elst.h errcode.h \
fileerr.h genericvector.h globaloc.h \
hashfn.h helpers.h host.h hosthplb.h lsterr.h \
mainblk.h memblk.h memry.h memryerr.h mfcpch.h \
ndminx.h notdll.h nwmain.h \
ocrclass.h ocrshell.h platform.h qrsequence.h \
secname.h serialis.h stderr.h strngs.h \
tessclas.h tessdatamanager.h tessopt.h tordvars.h tprintf.h \
unichar.h unicharmap.h unicharset.h unicity_table.h \
varable.h
lib_LIBRARIES = libtesseract_ccutil.a
libtesseract_ccutil_a_SOURCES = \
basedir.cpp bits16.cpp clst.cpp debugwin.cpp elst.cpp \
elst2.cpp errcode.cpp globaloc.cpp hashfn.cpp mainblk.cpp \
memblk.cpp memry.cpp ocrshell.cpp serialis.cpp strngs.cpp \
tprintf.cpp varable.cpp unichar.cpp tessopt.cpp \
unicharmap.cpp unicharset.cpp boxread.cpp
ambigs.cpp basedir.cpp bits16.cpp boxread.cpp \
ccutil.cpp clst.cpp debugwin.cpp \
elst2.cpp elst.cpp errcode.cpp \
globaloc.cpp hashfn.cpp \
mainblk.cpp memblk.cpp memry.cpp ocrshell.cpp \
serialis.cpp strngs.cpp \
tessdatamanager.cpp tessopt.cpp tordvars.cpp tprintf.cpp \
unichar.cpp unicharmap.cpp unicharset.cpp \
varable.cpp
all: all-recursive
@ -286,9 +297,11 @@ mostlyclean-compile:
distclean-compile:
-rm -f *.tab.c
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ambigs.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/basedir.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bits16.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/boxread.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccutil.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/clst.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/debugwin.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/elst.Po@am__quote@
@ -302,7 +315,9 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ocrshell.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/serialis.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/strngs.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tessdatamanager.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tessopt.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tordvars.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tprintf.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/unichar.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/unicharmap.Po@am__quote@

254
ccutil/ambigs.cpp Normal file
View File

@ -0,0 +1,254 @@
///////////////////////////////////////////////////////////////////////
// File: ambigs.cpp
// Description: Functions for dealing with ambiguities
// (training and recognition).
// Author: Daria Antonova
// Created: Mon Feb 5 11:26:43 PDT 2009
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "ambigs.h"
#include "helpers.h"
INT_VAR(global_ambigs_debug_level, 0, "Debug level for unichar ambiguities");
BOOL_VAR(use_definite_ambigs_for_classifier, 0,
"Use definite ambiguities when running character classifier");
namespace tesseract {
// Builds an empty specification: both id arrays hold only the
// INVALID_UNICHAR_ID terminator, no correct ngram is set, the type is
// NOT_AMBIG and the wrong ngram has length zero.
AmbigSpec::AmbigSpec()
    : correct_ngram_id(INVALID_UNICHAR_ID),
      type(NOT_AMBIG),
      wrong_ngram_size(0) {
  // The arrays cannot be set in the initializer list, so terminate them here.
  wrong_ngram[0] = INVALID_UNICHAR_ID;
  correct_fragments[0] = INVALID_UNICHAR_ID;
}
ELISTIZE(AmbigSpec);
// Reads ambiguity specifications from AmbigFile and fills in
// replace_ambigs_, dang_ambigs_ and one_to_one_definite_ambigs_.
//
//   AmbigFile   open stream positioned at the first line of the ambigs data.
//   end_offset  if non-negative, reading stops once ftell(AmbigFile) reaches
//               this offset; if negative, reading continues until fgets()
//               fails.
//   unicharset  used to look up the unichars named on each line. It is also
//               modified: InsertIntoTable() inserts the replacement ngrams
//               and their fragments into it.
//
// If the first line starts with 'v' the rest of that line is parsed as the
// format version number. Otherwise version 0 is assumed and the first line
// is re-read as an ordinary ambiguity line.
// Lines that ParseAmbiguityLine() rejects are skipped.
void UnicharAmbigs::LoadUnicharAmbigs(FILE *AmbigFile, inT64 end_offset,
                                      UNICHARSET *unicharset) {
  int i;
  // Start every table with one NULL slot per unichar id currently in the
  // unicharset; lists/vectors are allocated lazily when an entry is added.
  for (i = 0; i < unicharset->size(); ++i) {
    replace_ambigs_.push_back(NULL);
    dang_ambigs_.push_back(NULL);
    one_to_one_definite_ambigs_.push_back(NULL);
  }
  if (global_ambigs_debug_level) tprintf("Reading ambiguities\n");

  int TestAmbigPartSize;
  int ReplacementAmbigPartSize;
  // Maximum line size:
  //   10 for sizes of ambigs, tabs, ambig type and newline
  //   UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig
  // The space for buffer is allocated on the heap to avoid
  // GCC frame size warning.
  const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1);
  const int kBufferSize = 10 + 2 * kMaxAmbigStringSize;
  char *buffer = new char[kBufferSize];
  char ReplacementString[kMaxAmbigStringSize];
  UNICHAR_ID TestUnicharIds[MAX_AMBIG_SIZE + 1];
  int line_num = 0;
  int type = NOT_AMBIG;

  // Determine the version of the ambigs file.
  int version = 0;
  // Remember where the ambigs data starts. The stream is not necessarily
  // positioned at byte 0 (end_offset implies the data may be a section of a
  // larger file), so rewinding to the very beginning would re-read the
  // wrong bytes when there is no version line.
  const long start_pos = ftell(AmbigFile);
  ASSERT_HOST(fgets(buffer, kBufferSize, AmbigFile) != NULL &&
              strlen(buffer) > 0);
  if (*buffer == 'v') {
    version = static_cast<int>(strtol(buffer+1, NULL, 10));
    ++line_num;
  } else {
    // No version line: go back so the first line is parsed as data.
    // Fall back to rewind() only if the start position is unavailable.
    if (start_pos < 0 || fseek(AmbigFile, start_pos, SEEK_SET) != 0) {
      rewind(AmbigFile);
    }
  }

  while ((end_offset < 0 || ftell(AmbigFile) < end_offset) &&
         fgets(buffer, kBufferSize, AmbigFile) != NULL) {
    chomp_string(buffer);
    if (global_ambigs_debug_level > 2) tprintf("read line %s\n", buffer);
    ++line_num;
    if (!ParseAmbiguityLine(line_num, version, *unicharset, buffer,
                            &TestAmbigPartSize, TestUnicharIds,
                            &ReplacementAmbigPartSize,
                            ReplacementString, &type)) continue;
    // Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST.
    // REPLACE_AMBIG entries go to replace_ambigs_, everything else to
    // dang_ambigs_.
    AmbigSpec *ambig_spec = new AmbigSpec();
    InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_ : dang_ambigs_,
                    TestAmbigPartSize, TestUnicharIds,
                    ReplacementAmbigPartSize, ReplacementString, type,
                    ambig_spec, unicharset);
    // Update one_to_one_definite_ambigs_.
    if (use_definite_ambigs_for_classifier && TestAmbigPartSize == 1 &&
        ReplacementAmbigPartSize == 1 && type == DEFINITE_AMBIG) {
      if (one_to_one_definite_ambigs_[TestUnicharIds[0]] == NULL) {
        one_to_one_definite_ambigs_[TestUnicharIds[0]] = new UnicharIdVector();
      }
      one_to_one_definite_ambigs_[TestUnicharIds[0]]->push_back(
          ambig_spec->correct_ngram_id);
    }
  }
  delete[] buffer;

  // Print what was read from the input file.
  if (global_ambigs_debug_level > 2) {
    for (int tbl = 0; tbl < 2; ++tbl) {
      const UnicharAmbigsVector &print_table =
          (tbl == 0) ? replace_ambigs_ : dang_ambigs_;
      for (i = 0; i < print_table.size(); ++i) {
        AmbigSpec_LIST *lst = print_table[i];
        if (lst == NULL) continue;
        if (!lst->empty()) {
          tprintf("%s Ambiguities for %s:\n",
                  (tbl == 0) ? "Replaceable" : "Dangerous",
                  unicharset->debug_str(i).string());
        }
        AmbigSpec_IT lst_it(lst);
        for (lst_it.mark_cycle_pt(); !lst_it.cycled_list(); lst_it.forward()) {
          AmbigSpec *ambig_spec = lst_it.data();
          tprintf("wrong_ngram:");
          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset);
          tprintf("correct_fragments:");
          UnicharIdArrayUtils::print(ambig_spec->correct_fragments, *unicharset);
        }
      }
    }
  }
}
// Parses one line of the ambigs file. The fields are separated by the
// characters in kAmbigDelimiters and appear in this order:
//   <n> <n wrong unichars> <m> <m replacement unichars> [<type>]
// The trailing <type> field is only read when version > 0.
//
//   line_num                  line number, used only in diagnostics.
//   version                   format version of the file being read.
//   unicharset                every unichar on the line must be in it.
//   buffer                    the line; it is modified by strtok_r().
//   TestAmbigPartSize         out: n.
//   TestUnicharIds            out: ids of the n wrong unichars, followed by
//                             INVALID_UNICHAR_ID (needs n + 1 elements).
//   ReplacementAmbigPartSize  out: m.
//   ReplacementString         out: the m replacement unichars concatenated.
//   type                      out: written only when version > 0.
//
// Returns true if the line was parsed successfully. On false the output
// arguments may be partially written and must not be used.
bool UnicharAmbigs::ParseAmbiguityLine(
    int line_num, int version, const UNICHARSET &unicharset,
    char *buffer, int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
    int *ReplacementAmbigPartSize, char *ReplacementString, int *type) {
  int i;
  char *token;
  char *next_token;
  // sscanf() must convert exactly one item: comparing against 1 also
  // rejects an EOF (-1) return, which a plain !sscanf() would accept.
  // The size check is on the parsed value, not on the pointer.
  if (!(token = strtok_r(buffer, kAmbigDelimiters, &next_token)) ||
      sscanf(token, "%d", TestAmbigPartSize) != 1 ||
      *TestAmbigPartSize <= 0) {
    if (global_ambigs_debug_level) tprintf(kIllegalMsg, line_num);
    return false;
  }
  if (*TestAmbigPartSize > MAX_AMBIG_SIZE) {
    tprintf("Too many unichars in ambiguity on line %d\n", line_num);
    return false;
  }
  for (i = 0; i < *TestAmbigPartSize; ++i) {
    if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break;
    if (!unicharset.contains_unichar(token)) {
      if (global_ambigs_debug_level) tprintf(kIllegalUnicharMsg, token);
      break;
    }
    TestUnicharIds[i] = unicharset.unichar_to_id(token);
  }
  TestUnicharIds[i] = INVALID_UNICHAR_ID;

  if (i != *TestAmbigPartSize ||
      !(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) ||
      sscanf(token, "%d", ReplacementAmbigPartSize) != 1 ||
      *ReplacementAmbigPartSize <= 0) {
    if (global_ambigs_debug_level) tprintf(kIllegalMsg, line_num);
    return false;
  }
  if (*ReplacementAmbigPartSize > MAX_AMBIG_SIZE) {
    tprintf("Too many unichars in ambiguity on line %d\n", line_num);
    return false;
  }
  ReplacementString[0] = '\0';
  for (i = 0; i < *ReplacementAmbigPartSize; ++i) {
    if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break;
    // Validate before appending, so that an arbitrary unknown token is
    // never copied into ReplacementString.
    // NOTE(review): this assumes every unichar known to the unicharset is
    // short enough for the caller's buffer to hold MAX_AMBIG_SIZE of them -
    // confirm against UNICHARSET.
    if (!unicharset.contains_unichar(token)) {
      if (global_ambigs_debug_level) tprintf(kIllegalUnicharMsg, token);
      break;
    }
    strcat(ReplacementString, token);
  }
  if (i != *ReplacementAmbigPartSize) {
    if (global_ambigs_debug_level) tprintf(kIllegalMsg, line_num);
    return false;
  }
  if (version > 0) {
    // The next field being true indicates that the ambiguity should
    // always be substituted (e.g. '' should always be changed to ").
    // For such "certain" n -> m ambigs tesseract will insert character
    // fragments for the n pieces in the unicharset. AmbigsFound()
    // will then replace the incorrect ngram with the character
    // fragments of the correct character (or ngram if m > 1).
    // Note that if m > 1, an ngram will be inserted into the
    // modified word, not the individual unigrams. Tesseract
    // has limited support for ngram unichar (e.g. dawg permuter).
    if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) ||
        sscanf(token, "%d", type) != 1) {
      if (global_ambigs_debug_level) tprintf(kIllegalMsg, line_num);
      return false;
    }
  }
  return true;
}
// Fills in ambig_spec from the parsed pieces of one ambiguity line and adds
// it to the list stored in table at the index of the first wrong unichar.
// The replacement ngram (and, for n > 1, its n fragments) are inserted into
// unicharset as a side effect.
void UnicharAmbigs::InsertIntoTable(
    UnicharAmbigsVector &table, int TestAmbigPartSize,
    UNICHAR_ID *TestUnicharIds, int ReplacementAmbigPartSize,
    const char *ReplacementString, int type,
    AmbigSpec *ambig_spec, UNICHARSET *unicharset) {
  // Start from the type given by the caller; a 1-1 pair whose two sides
  // map to the same to_lower() id is reclassified as a case ambiguity.
  const bool one_to_one =
      (TestAmbigPartSize == 1 && ReplacementAmbigPartSize == 1);
  ambig_spec->type = static_cast<AmbigType>(type);
  if (one_to_one &&
      unicharset->to_lower(TestUnicharIds[0]) ==
      unicharset->to_lower(unicharset->unichar_to_id(ReplacementString))) {
    ambig_spec->type = CASE_AMBIG;
  }

  ambig_spec->wrong_ngram_size =
      UnicharIdArrayUtils::copy(TestUnicharIds, ambig_spec->wrong_ngram);

  // NoDangerousAmbig() builds its ambig_blob_choices vector with a fixed
  // number of unichar positions, so an n->m ambiguity is represented by n
  // character fragments of the correct ngram, one per position. For
  // example, with "vvvvw" and vvvv->ww, position 0 gets v and |ww|0|4,
  // position 1 gets v and |ww|1|4, and so on; dawg_permute_and_select()
  // reassembles the correct ngram from the fragments.
  //
  // The "base" ngram has to be in the unicharset before any of its
  // fragments, so insert it first.
  unicharset->unichar_insert(ReplacementString);
  ambig_spec->correct_ngram_id =
      unicharset->unichar_to_id(ReplacementString);
  if (ReplacementAmbigPartSize > 1) {
    unicharset->set_isngram(ambig_spec->correct_ngram_id, true);
  }

  // Now the fragments, one per wrong-ngram position. A single-position
  // ambiguity needs no fragment: the correct ngram id is used directly.
  int pos = 0;
  for (; pos < TestAmbigPartSize; ++pos) {
    if (TestAmbigPartSize == 1) {
      ambig_spec->correct_fragments[pos] = ambig_spec->correct_ngram_id;
      continue;
    }
    STRING fragment = CHAR_FRAGMENT::to_string(
        ReplacementString, pos, TestAmbigPartSize);
    unicharset->unichar_insert(fragment.string());
    ambig_spec->correct_fragments[pos] =
        unicharset->unichar_to_id(fragment.string());
  }
  ambig_spec->correct_fragments[pos] = INVALID_UNICHAR_ID;

  // File the spec under the first wrong unichar, creating the list on
  // first use. Lists are kept ordered by AmbigSpec.wrong_ngram.
  const UNICHAR_ID first_id = TestUnicharIds[0];
  if (table[first_id] == NULL) {
    table[first_id] = new AmbigSpec_LIST();
  }
  table[first_id]->add_sorted(AmbigSpec::compare_ambig_specs, ambig_spec);
}
} // namespace tesseract

186
ccutil/ambigs.h Normal file
View File

@ -0,0 +1,186 @@
///////////////////////////////////////////////////////////////////////
// File: ambigs.h
// Description: Constants, flags, functions for dealing with
// ambiguities (training and recognition).
// Author: Daria Antonova
// Created: Mon Aug 23 11:26:43 PDT 2008
//
// (C) Copyright 2008, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCUTIL_AMBIGS_H_
#define TESSERACT_CCUTIL_AMBIGS_H_
#include "elst.h"
#include "tprintf.h"
#include "unichar.h"
#include "unicharset.h"
#include "genericvector.h"
#define MAX_AMBIG_SIZE 10
extern INT_VAR_H(global_ambigs_debug_level, 0,
"Debug level for unichar ambiguities");
extern BOOL_VAR_H(use_definite_ambigs_for_classifier, 0,
"Use definite ambiguities when running character classifier");
namespace tesseract {
// NOTE(review): kUnigramAmbigsBufferSize and kAmbigNgramSeparator are not
// referenced by ambigs.cpp; presumably they are used by other files that
// include this header - confirm.
static const int kUnigramAmbigsBufferSize = 1000;
static const char kAmbigNgramSeparator[] = { ' ', '\0' };
// Field separators (tab and space) used when tokenizing a line of the
// ambigs file.
static const char kAmbigDelimiters[] = "\t ";
// Diagnostic format for a malformed line; takes the line number.
static const char kIllegalMsg[] =
"Illegal ambiguity specification on line %d\n";
// Diagnostic format for a token that is not in the unicharset; takes the
// offending token.
static const char kIllegalUnicharMsg[] =
"Illegal unichar %s in ambiguity specification\n";
// Kind of ambiguity described by an AmbigSpec. For ambigs files with a
// version line the numeric value is read from the last field of each line.
// CASE_AMBIG is additionally assigned by UnicharAmbigs::InsertIntoTable()
// when both sides of a 1-1 ambiguity have the same to_lower() id.
// REPLACE_AMBIG specs are stored in the replaceable table; all other
// types are stored in the dangerous table.
enum AmbigType {
NOT_AMBIG, // the ngram pair is not ambiguous
REPLACE_AMBIG, // OCRed ngram should always be substituted with correct
DEFINITE_AMBIG, // add correct ngram to the classifier results (1-1)
SIMILAR_AMBIG, // use pairwise classifier for OCRed/correct pair (1-1)
CASE_AMBIG, // this is a case ambiguity (1-1)
AMBIG_TYPE_COUNT // number of enum entries
};
// A collection of utility functions for arrays of UNICHAR_IDs that are
// terminated by INVALID_UNICHAR_ID.
class UnicharIdArrayUtils {
public:
// Compares two arrays of unichar ids. Returns -1 if the length of array1 is
// less than length of array2, if any array1[i] is less than array2[i].
// Returns 0 if the arrays are equal, 1 otherwise.
// The function assumes that the arrays are terminated by INVALID_UNICHAR_ID.
static inline int compare(const UNICHAR_ID array1[],
const UNICHAR_ID array2[]) {
const UNICHAR_ID *ptr1 = array1;
const UNICHAR_ID *ptr2 = array2;
while (*ptr1 != INVALID_UNICHAR_ID && *ptr2 != INVALID_UNICHAR_ID) {
if (*ptr1 != *ptr2) return *ptr1 < *ptr2 ? -1 : 1;
++ptr1;
++ptr2;
}
if (*ptr1 == INVALID_UNICHAR_ID && *ptr2 == INVALID_UNICHAR_ID) return 0;
return *ptr1 == INVALID_UNICHAR_ID ? -1 : 1;
}
// Copies UNICHAR_IDs from dst to src. Returns the number of ids copied.
// The function assumes that the arrays are terminated by INVALID_UNICHAR_ID
// and that dst has enough space for all the elements from src.
static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) {
int i = 0;
do {
dst[i] = src[i];
} while (dst[i++] != INVALID_UNICHAR_ID);
return i - 1;
}
// Prints unichars corresponding to the unichar_ids in the given array.
// The function assumes that array is terminated by INVALID_UNICHAR_ID.
static inline void print(const UNICHAR_ID array[],
const UNICHARSET &unicharset) {
const UNICHAR_ID *ptr = array;
if (*ptr == INVALID_UNICHAR_ID) tprintf("[Empty]");
while (*ptr != INVALID_UNICHAR_ID) {
tprintf("%s ", unicharset.id_to_unichar(*ptr++));
}
tprintf("( ");
ptr = array;
while (*ptr != INVALID_UNICHAR_ID) tprintf("%d ", *ptr++);
tprintf(")\n");
}
};
// One ambiguity: a wrong ngram together with the ngram that should replace
// it. An AmbigSpec_LIST holds the specs whose wrong ngrams start with the
// same unichar (e.g. r->t rn->m rr1->m).
class AmbigSpec : public ELIST_LINK {
 public:
  AmbigSpec();
  ~AmbigSpec() {}

  // Ordering callback for AmbigSpec_LISTs. Each argument points to an
  // AmbigSpec pointer; the specs are ordered by their wrong_ngram arrays
  // using UnicharIdArrayUtils::compare(). Example of wrong_ngram arrays in
  // a sorted list: [9 1 3], [9 3 4], [9 8], [9 8 1].
  static int compare_ambig_specs(const void *spec1, const void *spec2) {
    const AmbigSpec *lhs = *static_cast<const AmbigSpec * const *>(spec1);
    const AmbigSpec *rhs = *static_cast<const AmbigSpec * const *>(spec2);
    return UnicharIdArrayUtils::compare(lhs->wrong_ngram, rhs->wrong_ngram);
  }

  // Ids of the wrong ngram, terminated by INVALID_UNICHAR_ID.
  UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
  // One id per position of the wrong ngram, terminated by
  // INVALID_UNICHAR_ID.
  UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1];
  // Id of the complete replacement ngram.
  UNICHAR_ID correct_ngram_id;
  AmbigType type;
  // Number of ids in wrong_ngram, not counting the terminator.
  int wrong_ngram_size;
};
ELISTIZEH(AmbigSpec);
// AMBIG_TABLE[i] stores a set of ambiguities whose
// wrong ngram starts with unichar id i.
typedef GenericVector<AmbigSpec_LIST *> UnicharAmbigsVector;
typedef GenericVector<UNICHAR_ID> UnicharIdVector;
// Holds the ambiguity tables loaded from an ambigs file.
class UnicharAmbigs {
 public:
  UnicharAmbigs() {}

  // Deletes every list/vector the tables point to.
  ~UnicharAmbigs() {
    replace_ambigs_.delete_data_pointers();
    dang_ambigs_.delete_data_pointers();
    one_to_one_definite_ambigs_.delete_data_pointers();
  }

  // Read-only access to the two ambiguity tables.
  const UnicharAmbigsVector &dang_ambigs() const {
    return dang_ambigs_;
  }
  const UnicharAmbigsVector &replace_ambigs() const {
    return replace_ambigs_;
  }

  // Fills in the replaceable and dangerous ambiguity tables from the ambigs
  // file. Each table is an array of lists indexed by class id; the list at
  // an index holds the ambiguities whose wrong ngram starts with that
  // character. For example the ambiguity "rn -> m" is stored at index
  // unicharset.unichar_to_id('r').
  // 1-1 ambiguities (e.g. s -> S, 1 -> I) are also recorded in
  // one_to_one_definite_ambigs_, which is indexed by the class id of the
  // wrong unichar; each entry is a vector of the unichar ids that are
  // ambiguous to it.
  void LoadUnicharAmbigs(FILE *ambigs_file, inT64 end_offset,
                         UNICHARSET *unicharset);

  // Returns the definite 1-1 ambigs recorded for unichar_id, or NULL if the
  // table is empty. The stored entry itself may also be NULL.
  const UnicharIdVector *OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const {
    if (!one_to_one_definite_ambigs_.empty()) {
      return one_to_one_definite_ambigs_[unichar_id];
    }
    return NULL;
  }

 private:
  // Parses one line of the ambigs file into its parts; returns false if
  // the line is not a valid ambiguity specification.
  bool ParseAmbiguityLine(int line_num, int version,
                          const UNICHARSET &unicharset, char *buffer,
                          int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
                          int *ReplacementAmbigPartSize,
                          char *ReplacementString, int *type);

  // Completes ambig_spec from the parsed parts and adds it to table.
  void InsertIntoTable(UnicharAmbigsVector &table,
                       int TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
                       int ReplacementAmbigPartSize,
                       const char *ReplacementString, int type,
                       AmbigSpec *ambig_spec, UNICHARSET *unicharset);

  UnicharAmbigsVector dang_ambigs_;
  UnicharAmbigsVector replace_ambigs_;
  GenericVector<UnicharIdVector *> one_to_one_definite_ambigs_;
};
} // namespace tesseract
#endif // TESSERACT_CCUTIL_AMBIGS_H_

View File

@ -22,8 +22,6 @@
#ifdef __UNIX__
#include <unistd.h>
#include <fcntl.h>
#else
#include <io.h>
#endif
#include <stdlib.h>
#include "basedir.h"
@ -103,7 +101,7 @@ DLLSYM inT8 getpath( //get dir name of code
return -1;
}
} else {
strcpy(directory, code);
strncpy(directory, code, MAX_PATH - 1);
}
while ((path_end = strchr (directory, '\\')) != NULL)
*path_end = '/';

1006
ccutil/callback.h Normal file

File diff suppressed because it is too large Load Diff

48
ccutil/ccutil.cpp Normal file
View File

@ -0,0 +1,48 @@
// Copyright 2008 Google Inc. All Rights Reserved.
// Author: scharron@google.com (Samuel Charron)
#include "ccutil.h"
namespace tesseract {
// Default constructor. The only work done is in the initializer list, where
// the two configurable members m_print_variables and m_data_sub_dir are set
// up through the BOOL_MEMBER / STRING_MEMBER macros (defined outside this
// file) with their default value and description text.
CCUtil::CCUtil()
: //// mainblk.* /////////////////////////////////////////////////////
BOOL_MEMBER(m_print_variables, FALSE,
"Print initial values of all variables"),
STRING_MEMBER(m_data_sub_dir,
"tessdata/", "Directory for data files")
////////////////////////////////////////////////////////////////////
{
}
// Destructor; has no explicit work to do, members clean up after themselves.
CCUtil::~CCUtil() {
}
// Creates the underlying platform mutex: a pthread mutex with default
// attributes, or an unnamed, initially unowned Win32 mutex object.
CCUtilMutex::CCUtilMutex() {
#if !defined(WIN32)
  pthread_mutex_init(&mutex_, NULL);
#else
  mutex_ = CreateMutex(0, FALSE, 0);
#endif
}
// Acquires the mutex, blocking without a timeout until it is available.
void CCUtilMutex::Lock() {
#if !defined(WIN32)
  pthread_mutex_lock(&mutex_);
#else
  WaitForSingleObject(mutex_, INFINITE);
#endif
}
// Releases the mutex previously acquired with Lock().
void CCUtilMutex::Unlock() {
#if !defined(WIN32)
  pthread_mutex_unlock(&mutex_);
#else
  ReleaseMutex(mutex_);
#endif
}
CCUtilMutex tprintfMutex;
} // namespace tesseract

83
ccutil/ccutil.h Normal file
View File

@ -0,0 +1,83 @@
///////////////////////////////////////////////////////////////////////
// File: ccutil.h
// Description: ccutil class.
// Author: Samuel Charron
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCUTIL_CCUTIL_H__
#define TESSERACT_CCUTIL_CCUTIL_H__
#include "ambigs.h"
#include "errcode.h"
#include "strngs.h"
#include "tessdatamanager.h"
#include "varable.h"
#include "unicharset.h"
#ifdef WIN32
#include <windows.h>
#else
#include <pthread.h>
#include <semaphore.h>
#endif
namespace tesseract {
// Minimal portable mutex: wraps a Win32 mutex HANDLE when WIN32 is defined
// and a pthread_mutex_t otherwise.
// NOTE(review): no destructor is declared, so the underlying mutex is never
// explicitly destroyed/closed by this class - confirm that is intended.
class CCUtilMutex {
public:
// Creates the underlying mutex.
CCUtilMutex();
// Acquires the mutex.
void Lock();
// Releases the mutex.
void Unlock();
private:
#ifdef WIN32
HANDLE mutex_;
#else
pthread_mutex_t mutex_;
#endif
};
// Bundles the ccutil-level state: data/image paths, language information,
// the tessdata manager, the unicharset and the unichar ambiguity tables,
// plus two configurable variables.
class CCUtil {
public:
CCUtil();
~CCUtil();
public:
// Declared here only; the definition is not in ccutil.cpp.
// NOTE(review): presumably implemented in mainblk.cpp - confirm.
void main_setup(
const char *argv0, // program name
const char *basename // name of image
);
public:
STRING datadir; // dir for data files
STRING imagebasename; // name of image
// Configurable variables; their defaults and descriptions are repeated in
// the constructor's initializer list.
BOOL_VAR_H (m_print_variables, FALSE,
"Print initial values of all variables");
STRING_VAR_H (m_data_sub_dir, "tessdata/", "Directory for data files");
STRING lang;
STRING language_data_path_prefix;
TessdataManager tessdata_manager;
UNICHARSET unicharset;
UnicharAmbigs unichar_ambigs;
STRING imagefile; // image file name
STRING directory; // main directory
};
extern CCUtilMutex tprintfMutex;
} // namespace tesseract
#endif // TESSERACT_CCUTIL_CCUTIL_H__

819
ccutil/ccutil.vcproj Executable file
View File

@ -0,0 +1,819 @@
<?xml version="1.0" encoding="Windows-1252"?>
<VisualStudioProject
ProjectType="Visual C++"
Version="9.00"
Name="ccutil"
ProjectGUID="{DF2FA86F-A663-4805-AED7-2F81D9EAC796}"
RootNamespace="ccutil"
Keyword="Win32Proj"
TargetFrameworkVersion="196613"
>
<Platforms>
<Platform
Name="Win32"
/>
</Platforms>
<ToolFiles>
</ToolFiles>
<Configurations>
<Configuration
Name="Debug|Win32"
OutputDirectory="$(SolutionDir)$(ConfigurationName)"
IntermediateDirectory="$(ConfigurationName)"
ConfigurationType="4"
CharacterSet="2"
>
<Tool
Name="VCPreBuildEventTool"
/>
<Tool
Name="VCCustomBuildTool"
/>
<Tool
Name="VCXMLDataGeneratorTool"
/>
<Tool
Name="VCWebServiceProxyGeneratorTool"
/>
<Tool
Name="VCMIDLTool"
/>
<Tool
Name="VCCLCompilerTool"
Optimization="0"
PreprocessorDefinitions="WIN32;_DEBUG;_LIB;__MSW32__;_CRT_SECURE_NO_WARNINGS"
MinimalRebuild="true"
BasicRuntimeChecks="3"
RuntimeLibrary="1"
UsePrecompiledHeader="0"
WarningLevel="3"
DebugInformationFormat="4"
/>
<Tool
Name="VCManagedResourceCompilerTool"
/>
<Tool
Name="VCResourceCompilerTool"
/>
<Tool
Name="VCPreLinkEventTool"
/>
<Tool
Name="VCLibrarianTool"
/>
<Tool
Name="VCALinkTool"
/>
<Tool
Name="VCXDCMakeTool"
/>
<Tool
Name="VCBscMakeTool"
/>
<Tool
Name="VCFxCopTool"
/>
<Tool
Name="VCPostBuildEventTool"
/>
</Configuration>
<Configuration
Name="Release|Win32"
OutputDirectory="$(SolutionDir)$(ConfigurationName)"
IntermediateDirectory="$(ConfigurationName)"
ConfigurationType="4"
CharacterSet="2"
WholeProgramOptimization="1"
>
<Tool
Name="VCPreBuildEventTool"
/>
<Tool
Name="VCCustomBuildTool"
/>
<Tool
Name="VCXMLDataGeneratorTool"
/>
<Tool
Name="VCWebServiceProxyGeneratorTool"
/>
<Tool
Name="VCMIDLTool"
/>
<Tool
Name="VCCLCompilerTool"
Optimization="2"
EnableIntrinsicFunctions="true"
PreprocessorDefinitions="WIN32;NDEBUG;_LIB;__MSW32__;_CRT_SECURE_NO_WARNINGS"
RuntimeLibrary="0"
EnableFunctionLevelLinking="true"
UsePrecompiledHeader="0"
WarningLevel="3"
DebugInformationFormat="3"
/>
<Tool
Name="VCManagedResourceCompilerTool"
/>
<Tool
Name="VCResourceCompilerTool"
/>
<Tool
Name="VCPreLinkEventTool"
/>
<Tool
Name="VCLibrarianTool"
/>
<Tool
Name="VCALinkTool"
/>
<Tool
Name="VCXDCMakeTool"
/>
<Tool
Name="VCBscMakeTool"
/>
<Tool
Name="VCFxCopTool"
/>
<Tool
Name="VCPostBuildEventTool"
/>
</Configuration>
</Configurations>
<References>
</References>
<Files>
<Filter
Name="Source Files"
Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
>
<File
RelativePath=".\ambigs.cpp"
>
</File>
<File
RelativePath=".\basedir.cpp"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\bits16.cpp"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\boxread.cpp"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\ccutil.cpp"
>
</File>
<File
RelativePath=".\clst.cpp"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\debugwin.cpp"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\elst.cpp"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\elst2.cpp"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\errcode.cpp"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\globaloc.cpp"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\hashfn.cpp"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\mainblk.cpp"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\memblk.cpp"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\memry.cpp"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\mfcpch.cpp"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="1"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="1"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\ocrshell.cpp"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\serialis.cpp"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\strngs.cpp"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\tessdatamanager.cpp"
>
</File>
<File
RelativePath=".\tessopt.cpp"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\tordvars.cpp"
>
</File>
<File
RelativePath=".\tprintf.cpp"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\unichar.cpp"
>
</File>
<File
RelativePath=".\unicharmap.cpp"
>
</File>
<File
RelativePath=".\unicharset.cpp"
>
</File>
<File
RelativePath=".\varable.cpp"
>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
UsePrecompiledHeader="2"
PrecompiledHeaderThrough="mfcpch.h"
/>
</FileConfiguration>
</File>
</Filter>
<Filter
Name="Header Files"
Filter="h;hpp;hxx;hm;inl;inc;xsd"
UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
>
<File
RelativePath=".\ambigs.h"
>
</File>
<File
RelativePath=".\basedir.h"
>
</File>
<File
RelativePath=".\bits16.h"
>
</File>
<File
RelativePath=".\boxread.h"
>
</File>
<File
RelativePath=".\callback.h"
>
</File>
<File
RelativePath=".\ccutil.h"
>
</File>
<File
RelativePath=".\clst.h"
>
</File>
<File
RelativePath=".\debugwin.h"
>
</File>
<File
RelativePath=".\elst.h"
>
</File>
<File
RelativePath=".\elst2.h"
>
</File>
<File
RelativePath=".\errcode.h"
>
</File>
<File
RelativePath=".\fileerr.h"
>
</File>
<File
RelativePath=".\genericvector.h"
>
</File>
<File
RelativePath=".\globaloc.h"
>
</File>
<File
RelativePath=".\hashfn.h"
>
</File>
<File
RelativePath=".\helpers.h"
>
</File>
<File
RelativePath=".\host.h"
>
</File>
<File
RelativePath=".\hosthplb.h"
>
</File>
<File
RelativePath=".\lsterr.h"
>
</File>
<File
RelativePath=".\mainblk.h"
>
</File>
<File
RelativePath=".\memblk.h"
>
</File>
<File
RelativePath=".\memry.h"
>
</File>
<File
RelativePath=".\memryerr.h"
>
</File>
<File
RelativePath=".\mfcpch.h"
>
</File>
<File
RelativePath=".\ndminx.h"
>
</File>
<File
RelativePath=".\notdll.h"
>
</File>
<File
RelativePath=".\nwmain.h"
>
</File>
<File
RelativePath=".\ocrclass.h"
>
</File>
<File
RelativePath=".\ocrshell.h"
>
</File>
<File
RelativePath=".\platform.h"
>
</File>
<File
RelativePath=".\qrsequence.h"
>
</File>
<File
RelativePath=".\scanutils.h"
>
</File>
<File
RelativePath=".\secname.h"
>
</File>
<File
RelativePath=".\serialis.h"
>
</File>
<File
RelativePath=".\stderr.h"
>
</File>
<File
RelativePath=".\strngs.h"
>
</File>
<File
RelativePath=".\tessclas.h"
>
</File>
<File
RelativePath=".\tessdatamanager.h"
>
</File>
<File
RelativePath=".\tessopt.h"
>
</File>
<File
RelativePath=".\tordvars.h"
>
</File>
<File
RelativePath=".\tprintf.h"
>
</File>
<File
RelativePath=".\unichar.h"
>
</File>
<File
RelativePath=".\unicharmap.h"
>
</File>
<File
RelativePath=".\unicharset.h"
>
</File>
<File
RelativePath=".\unicity_table.h"
>
</File>
<File
RelativePath=".\varable.h"
>
</File>
</Filter>
<Filter
Name="Resource Files"
Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
>
</Filter>
</Files>
<Globals>
</Globals>
</VisualStudioProject>

View File

@ -96,11 +96,11 @@ class DLLSYM CLIST
void shallow_clear(); //clear list but dont
//delete data elements
BOOL8 empty() { //is list empty?
bool empty() { //is list empty?
return !last;
}
BOOL8 singleton() {
bool singleton() {
return last != NULL ? (last == last->next) : FALSE;
}
@ -165,13 +165,13 @@ class DLLSYM CLIST_ITERATOR
CLIST_LINK *prev; //prev element
CLIST_LINK *current; //current element
CLIST_LINK *next; //next element
BOOL8 ex_current_was_last; //current extracted
bool ex_current_was_last; //current extracted
//was end of list
BOOL8 ex_current_was_cycle_pt; //current extracted
bool ex_current_was_cycle_pt; //current extracted
//was cycle point
CLIST_LINK *cycle_pt; //point we are cycling
//the list to.
BOOL8 started_cycling; //Have we moved off
bool started_cycling; //Have we moved off
//the start?
CLIST_LINK *extract_sublist( //from this current...
@ -229,7 +229,7 @@ class DLLSYM CLIST_ITERATOR
void mark_cycle_pt(); //remember current
BOOL8 empty() { //is list empty?
bool empty() { //is list empty?
#ifndef NDEBUG
if (!list)
NO_LIST.error ("CLIST_ITERATOR::empty", ABORT, NULL);
@ -237,15 +237,15 @@ class DLLSYM CLIST_ITERATOR
return list->empty ();
}
BOOL8 current_extracted() { //current extracted?
bool current_extracted() { //current extracted?
return !current;
}
BOOL8 at_first(); //Current is first?
bool at_first(); //Current is first?
BOOL8 at_last(); //Current is last?
bool at_last(); //Current is last?
BOOL8 cycled_list(); //Completed a cycle?
bool cycled_list(); //Completed a cycle?
void add_to_end( //add at end &
void *new_data); //dont move
@ -695,7 +695,7 @@ inline void CLIST_ITERATOR::mark_cycle_pt() {
*
**********************************************************************/
inline BOOL8 CLIST_ITERATOR::at_first() {
inline bool CLIST_ITERATOR::at_first() {
#ifndef NDEBUG
if (!this)
NULL_OBJECT.error ("CLIST_ITERATOR::at_first", ABORT, NULL);
@ -717,7 +717,7 @@ inline BOOL8 CLIST_ITERATOR::at_first() {
*
**********************************************************************/
inline BOOL8 CLIST_ITERATOR::at_last() {
inline bool CLIST_ITERATOR::at_last() {
#ifndef NDEBUG
if (!this)
NULL_OBJECT.error ("CLIST_ITERATOR::at_last", ABORT, NULL);
@ -739,7 +739,7 @@ inline BOOL8 CLIST_ITERATOR::at_last() {
*
**********************************************************************/
inline BOOL8 CLIST_ITERATOR::cycled_list() {
inline bool CLIST_ITERATOR::cycled_list() {
#ifndef NDEBUG
if (!this)
NULL_OBJECT.error ("CLIST_ITERATOR::cycled_list", ABORT, NULL);

View File

@ -39,7 +39,6 @@ static LCommander *pCommander = NULL;
//NT implementation
#if defined(__MSW32__) && !defined(_CONSOLE)
#include <io.h>
#define ID_DEBUG_MSG 32779
/**********************************************************************

View File

@ -141,11 +141,11 @@ class DLLSYM ELIST
//ptr to zapper functn
void (*zapper) (ELIST_LINK *));
BOOL8 empty() { //is list empty?
bool empty() { //is list empty?
return !last;
}
BOOL8 singleton() {
bool singleton() {
return last ? (last == last->next) : FALSE;
}
@ -210,13 +210,13 @@ class DLLSYM ELIST_ITERATOR
ELIST_LINK *prev; //prev element
ELIST_LINK *current; //current element
ELIST_LINK *next; //next element
BOOL8 ex_current_was_last; //current extracted
bool ex_current_was_last; //current extracted
//was end of list
BOOL8 ex_current_was_cycle_pt; //current extracted
bool ex_current_was_cycle_pt; //current extracted
//was cycle point
ELIST_LINK *cycle_pt; //point we are cycling
//the list to.
BOOL8 started_cycling; //Have we moved off
bool started_cycling; //Have we moved off
//the start?
ELIST_LINK *extract_sublist( //from this current...
@ -274,7 +274,7 @@ class DLLSYM ELIST_ITERATOR
void mark_cycle_pt(); //remember current
BOOL8 empty() { //is list empty?
bool empty() { //is list empty?
#ifndef NDEBUG
if (!list)
NO_LIST.error ("ELIST_ITERATOR::empty", ABORT, NULL);
@ -282,15 +282,15 @@ class DLLSYM ELIST_ITERATOR
return list->empty ();
}
BOOL8 current_extracted() { //current extracted?
bool current_extracted() { //current extracted?
return !current;
}
BOOL8 at_first(); //Current is first?
bool at_first(); //Current is first?
BOOL8 at_last(); //Current is last?
bool at_last(); //Current is last?
BOOL8 cycled_list(); //Completed a cycle?
bool cycled_list(); //Completed a cycle?
void add_to_end( //add at end &
ELIST_LINK *new_link); //dont move
@ -728,7 +728,7 @@ inline void ELIST_ITERATOR::mark_cycle_pt() {
*
**********************************************************************/
inline BOOL8 ELIST_ITERATOR::at_first() {
inline bool ELIST_ITERATOR::at_first() {
#ifndef NDEBUG
if (!this)
NULL_OBJECT.error ("ELIST_ITERATOR::at_first", ABORT, NULL);
@ -750,7 +750,7 @@ inline BOOL8 ELIST_ITERATOR::at_first() {
*
**********************************************************************/
inline BOOL8 ELIST_ITERATOR::at_last() {
inline bool ELIST_ITERATOR::at_last() {
#ifndef NDEBUG
if (!this)
NULL_OBJECT.error ("ELIST_ITERATOR::at_last", ABORT, NULL);
@ -772,7 +772,7 @@ inline BOOL8 ELIST_ITERATOR::at_last() {
*
**********************************************************************/
inline BOOL8 ELIST_ITERATOR::cycled_list() {
inline bool ELIST_ITERATOR::cycled_list() {
#ifndef NDEBUG
if (!this)
NULL_OBJECT.error ("ELIST_ITERATOR::cycled_list", ABORT, NULL);

View File

@ -110,11 +110,11 @@ class DLLSYM ELIST2
void (*zapper) (ELIST2_LINK *));
//ptr to zapper functn
BOOL8 empty() { //is list empty?
bool empty() { //is list empty?
return !last;
}
BOOL8 singleton() {
bool singleton() {
return last ? (last == last->next) : FALSE;
}
@ -179,13 +179,13 @@ class DLLSYM ELIST2_ITERATOR
ELIST2_LINK *prev; //prev element
ELIST2_LINK *current; //current element
ELIST2_LINK *next; //next element
BOOL8 ex_current_was_last; //current extracted
bool ex_current_was_last; //current extracted
//was end of list
BOOL8 ex_current_was_cycle_pt; //current extracted
bool ex_current_was_cycle_pt; //current extracted
//was cycle point
ELIST2_LINK *cycle_pt; //point we are cycling
//the list to.
BOOL8 started_cycling; //Have we moved off
bool started_cycling; //Have we moved off
//the start?
ELIST2_LINK *extract_sublist( //from this current...
@ -246,7 +246,7 @@ class DLLSYM ELIST2_ITERATOR
void mark_cycle_pt(); //remember current
BOOL8 empty() { //is list empty?
bool empty() { //is list empty?
#ifndef NDEBUG
if (!list)
NO_LIST.error ("ELIST2_ITERATOR::empty", ABORT, NULL);
@ -254,15 +254,15 @@ class DLLSYM ELIST2_ITERATOR
return list->empty ();
}
BOOL8 current_extracted() { //current extracted?
bool current_extracted() { //current extracted?
return !current;
}
BOOL8 at_first(); //Current is first?
bool at_first(); //Current is first?
BOOL8 at_last(); //Current is last?
bool at_last(); //Current is last?
BOOL8 cycled_list(); //Completed a cycle?
bool cycled_list(); //Completed a cycle?
void add_to_end( //add at end &
ELIST2_LINK *new_link); //dont move
@ -750,7 +750,7 @@ inline void ELIST2_ITERATOR::mark_cycle_pt() {
*
**********************************************************************/
inline BOOL8 ELIST2_ITERATOR::at_first() {
inline bool ELIST2_ITERATOR::at_first() {
#ifndef NDEBUG
if (!this)
NULL_OBJECT.error ("ELIST2_ITERATOR::at_first", ABORT, NULL);
@ -772,7 +772,7 @@ inline BOOL8 ELIST2_ITERATOR::at_first() {
*
**********************************************************************/
inline BOOL8 ELIST2_ITERATOR::at_last() {
inline bool ELIST2_ITERATOR::at_last() {
#ifndef NDEBUG
if (!this)
NULL_OBJECT.error ("ELIST2_ITERATOR::at_last", ABORT, NULL);
@ -794,7 +794,7 @@ inline BOOL8 ELIST2_ITERATOR::at_last() {
*
**********************************************************************/
inline BOOL8 ELIST2_ITERATOR::cycled_list() {
inline bool ELIST2_ITERATOR::cycled_list() {
#ifndef NDEBUG
if (!this)
NULL_OBJECT.error ("ELIST2_ITERATOR::cycled_list", ABORT, NULL);

398
ccutil/genericvector.h Normal file
View File

@ -0,0 +1,398 @@
///////////////////////////////////////////////////////////////////////
// File: genericvector.h
// Description: Generic vector class
// Author: Daria Antonova
// Created: Mon Jun 23 11:26:43 PDT 2008
//
// (C) Copyright 2007, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
//
#ifndef TESSERACT_CCUTIL_GENERICVECTOR_H_
#define TESSERACT_CCUTIL_GENERICVECTOR_H_
#include <stdio.h>
#include "callback.h"
#include "errcode.h"
// Growable array of T with two optional, vector-owned callbacks:
//  - a clear callback, run on every stored element by clear();
//  - a compare callback, used by get_index() and contains().
// Both callbacks are deleted by clear() (and therefore by the destructor).
template <typename T>
class GenericVector {
 public:
  // Creates an empty vector with room for kDefaultVectorSize elements.
  GenericVector() { init(kDefaultVectorSize); }
  // Creates an empty vector with room for size elements.
  GenericVector(int size) { init(size); }

  // Copy constructor: copies the elements of other. The callbacks of other
  // are not copied; the new vector starts with none.
  GenericVector(const GenericVector& other) {
    init(other.size_used_);
    *this += other;
  }
  // Appends every element of other.
  GenericVector<T> &operator+=(const GenericVector& other);
  // Clears this vector (callbacks included), then appends the elements of
  // other.
  GenericVector<T> &operator=(const GenericVector& other);

  virtual ~GenericVector();

  // Reserve some memory.
  void reserve(int size);
  // Double the size of the internal array.
  void double_the_size();

  // Init the object, allocating size memory.
  void init(int size);

  // Number of elements currently stored.
  int size() const { return size_used_; }
  // Same value as size().
  int length() const { return size_used_; }
  // True when no element is stored.
  bool empty() const { return 0 == size_used_; }

  // Return the object from an index. get() asserts that the index is in
  // range; operator[] performs no check.
  T &get(int index) const;
  T &operator[](int index) const;

  // Return the index of the T object, or -1 if it is not found.
  // This method NEEDS a compare_callback to be passed to
  // set_compare_callback.
  int get_index(T object) const;

  // Return true if T is in the array (same requirement as get_index).
  bool contains(T object) const;

  // Return true if the index is valid.
  // NOTE(review): declared as returning T rather than bool.
  T contains_index(int index) const;

  // Push an element at the end of the array and return its index.
  int push_back(T object);
  void operator+=(T t);

  // Set the value at the given index.
  void set(T t, int index);

  // Insert t at the given index, push other elements to the right.
  void insert(T t, int index);

  // Removes an element at the given index and
  // shifts the remaining elements to the left.
  void remove(int index);

  // Add a callback to be called to delete the elements when the array took
  // their ownership.
  void set_clear_callback(Callback1<T>* cb);

  // Add a callback to be called to compare the elements when needed (contains,
  // get_index, ...)
  void set_compare_callback(ResultCallback2<bool, T const &, T const &>* cb);

  // Clear the array, calling the clear callback function if any.
  // All the owned Callbacks are also deleted.
  // If you don't want the Callbacks to be deleted, before calling clear, set
  // the callback to NULL.
  virtual void clear();

  // Delete objects pointed to by data_[i].
  void delete_data_pointers();

  // This method clears the current object, then does a shallow copy of
  // its argument, and finally invalidates its argument.
  // Callbacks are moved to the current object.
  void move(GenericVector<T>* from);

  // Read/Write the array to a file. This does _NOT_ read/write the callbacks.
  // The Callback given must be permanent since it will be called more than
  // once. The given callback will be deleted at the end.
  void write(FILE* f, Callback2<FILE*, T const &>* cb);
  void read(FILE* f, Callback3<FILE*, T*, bool>* cb, bool swap);

  // Allocates a new array of double the current_size, copies over the
  // information from data to the new location, deletes data and returns
  // the pointer to the new larger array.
  // This function uses memcpy to copy the data, instead of invoking
  // operator=() for each element like double_the_size() does.
  static T *double_the_size_memcpy(int current_size, T *data) {
    T *grown = new T[current_size * 2];
    memcpy(grown, data, sizeof(T) * current_size);
    delete[] data;
    return grown;
  }

 protected:
  // We are assuming that the objects generally placed in this
  // vector are small enough that for efficiency it makes sense
  // to start with a larger initial size.
  static const int kDefaultVectorSize = 4;
  int size_used_;      // number of elements stored
  int size_reserved_;  // number of elements data_ has room for
  T* data_;            // element storage
  Callback1<T>* clear_cb_;
  // Mutable because Run method is not const
  mutable ResultCallback2<bool, T const &, T const &>* compare_cb_;
};
namespace tesseract {
// Equality predicate built on T's operator==; returns true when t1 == t2.
template <typename T>
bool cmp_eq(T const & t1, T const & t2) {
  if (t1 == t2) {
    return true;
  }
  return false;
}
}  // namespace tesseract
// GenericVector variant whose compare callback is pre-set to
// tesseract::cmp_eq<T>, so get_index() and contains() compare elements with
// operator==.
template <typename T>
class GenericVectorEqEq : public GenericVector<T> {
 public:
  // Default-sized vector with the operator== comparator installed.
  GenericVectorEqEq() {
    this->set_compare_callback(NewPermanentCallback(tesseract::cmp_eq<T>));
  }
  // Vector with room for size elements and the operator== comparator
  // installed.
  GenericVectorEqEq(int size) : GenericVector<T>(size) {
    this->set_compare_callback(NewPermanentCallback(tesseract::cmp_eq<T>));
  }
};
// Puts every member into the empty state (no storage, no callbacks), then
// reserves room for size elements. Nothing previously held is released here.
template <typename T>
void GenericVector<T>::init(int size) {
  data_ = NULL;
  size_reserved_ = 0;
  size_used_ = 0;
  compare_cb_ = NULL;
  clear_cb_ = NULL;
  reserve(size);
}
// Releases the element storage and the owned callbacks by calling clear().
template <typename T>
GenericVector<T>::~GenericVector() {
  this->clear();
}
// Ensures there is room for at least size elements. If the internal array
// contains elements, they are copied (with operator=) into the new storage.
// Does nothing when size is not positive or when the current capacity already
// covers the request, so asking for the current capacity no longer triggers a
// needless reallocation and copy.
template <typename T>
void GenericVector<T>::reserve(int size) {
  if (size <= 0 || size_reserved_ >= size)
    return;
  T* new_array = new T[size];
  for (int i = 0; i < size_used_; ++i)
    new_array[i] = data_[i];
  delete[] data_;  // deleting a null pointer is a no-op
  data_ = new_array;
  size_reserved_ = size;
}
// Grows the capacity: to kDefaultVectorSize when nothing is reserved yet,
// otherwise to twice the currently reserved size.
template <typename T>
void GenericVector<T>::double_the_size() {
  if (size_reserved_ != 0) {
    reserve(size_reserved_ * 2);
    return;
  }
  reserve(kDefaultVectorSize);
}
// Returns a reference to the element at index. The index is asserted to lie
// in [0, size_used_).
template <typename T>
T &GenericVector<T>::get(int index) const {
  ASSERT_HOST(index >= 0 && index < size_used_);
  T *slot = data_ + index;
  return *slot;
}
// Returns a reference to the element at index. Unlike get(), no range check
// is performed.
template <typename T>
T &GenericVector<T>::operator[](int index) const {
  return *(data_ + index);
}
// Overwrites the element stored at index with t. The index is asserted to
// lie in [0, size_used_).
template <typename T>
void GenericVector<T>::set(T t, int index) {
  ASSERT_HOST(index >= 0 && index < size_used_);
  *(data_ + index) = t;
}
// Shifts the elements from index onwards one place to the right and stores t
// at index. index may be anywhere in [0, size_used_]: index == size_used_
// appends at the end, which also makes insertion into an empty vector
// possible (the previous bound of index < size_used_ rejected both cases).
template <typename T>
void GenericVector<T>::insert(T t, int index) {
  ASSERT_HOST(index >= 0 && index <= size_used_);
  // Make room for one more element before shifting.
  if (size_reserved_ == size_used_)
    double_the_size();
  // Walk backwards so no element is overwritten before it has been moved.
  for (int i = size_used_; i > index; --i) {
    data_[i] = data_[i - 1];
  }
  data_[index] = t;
  size_used_++;
}
// Drops the element at index by moving every later element one place to the
// left; the used size shrinks by one. The index is asserted to lie in
// [0, size_used_).
template <typename T>
void GenericVector<T>::remove(int index) {
  ASSERT_HOST(index >= 0 && index < size_used_);
  const int last = size_used_ - 1;
  int i = index;
  while (i < last) {
    data_[i] = data_[i + 1];
    ++i;
  }
  size_used_ = last;
}
// Return true if the index is valid, i.e. lies in [0, size_used_).
// The boolean result is converted to the declared return type T.
template <typename T>
T GenericVector<T>::contains_index(int index) const {
  if (index < 0) {
    return false;
  }
  return index < size_used_;
}
// Returns the index of the first stored element for which the compare
// callback reports a match with object, or -1 when there is none.
// The callback is asserted to be set each time an element is examined, so an
// empty vector returns -1 even without a callback.
template <typename T>
int GenericVector<T>::get_index(T object) const {
  int i = 0;
  while (i < size_used_) {
    ASSERT_HOST(compare_cb_ != NULL);
    if (compare_cb_->Run(object, data_[i])) {
      return i;
    }
    ++i;
  }
  return -1;
}
// Returns true when get_index() finds object in the array.
template <typename T>
bool GenericVector<T>::contains(T object) const {
  const int position = get_index(object);
  return position != -1;
}
// Appends object after the last used element, growing the storage first when
// it is full, and returns the index at which it was stored.
template <typename T>
int GenericVector<T>::push_back(T object) {
  if (size_reserved_ == size_used_) {
    double_the_size();
  }
  const int slot = size_used_++;
  data_[slot] = object;
  return slot;
}
// Appends a single element; same effect as push_back() with the returned
// index discarded.
template <typename T>
void GenericVector<T>::operator+=(T t) {
  this->push_back(t);
}
// Appends a copy of every element of other to this vector and returns *this.
// The number of elements to append is captured before the loop so that
// appending a vector to itself (v += v) duplicates its contents once instead
// of looping forever on a size that grows with every push.
template <typename T>
GenericVector<T> &GenericVector<T>::operator+=(const GenericVector& other) {
  const int count = other.size_used_;
  for (int i = 0; i < count; ++i) {
    // other.data_ is re-read on every pass: when other is *this, push_back
    // may have reallocated the storage.
    this->operator+=(other.data_[i]);
  }
  return *this;
}
// Replaces the contents of this vector with copies of the elements of other
// and returns *this. clear() is used to drop the old contents, so the
// callbacks of this vector are deleted as well; the callbacks of other are
// not copied.
// Self-assignment is a no-op: clearing first would destroy the very elements
// (and callbacks) that are about to be copied.
template <typename T>
GenericVector<T> &GenericVector<T>::operator=(const GenericVector& other) {
  if (this != &other) {
    this->clear();
    this->operator+=(other);
  }
  return *this;
}
// Installs the callback that clear() runs on every stored element. The
// vector keeps the pointer and deletes it in clear(); a previously installed
// callback is simply replaced here, not deleted.
template <typename T>
void GenericVector<T>::set_clear_callback(Callback1<T>* cb) {
  this->clear_cb_ = cb;
}
// Installs the callback used by get_index() (and therefore contains()) to
// compare a searched object with the stored elements. The vector keeps the
// pointer and deletes it in clear(); a previously installed callback is
// simply replaced here, not deleted.
template <typename T>
void GenericVector<T>::set_compare_callback(ResultCallback2<bool, T const &, T const &>* cb) {
  this->compare_cb_ = cb;
}
// Clear the array: runs the clear callback (if any) on every stored element,
// frees the element storage, and deletes both owned callbacks.
// data_ is reset to NULL after it is freed. Leaving it dangling made the next
// reserve() (reached through push_back, operator= or operator+=) call
// delete[] on the same pointer a second time.
template <typename T>
void GenericVector<T>::clear() {
  if (size_reserved_ > 0) {
    if (clear_cb_ != NULL)
      for (int i = 0; i < size_used_; ++i)
        clear_cb_->Run(data_[i]);
    delete[] data_;
    data_ = NULL;
    size_used_ = 0;
    size_reserved_ = 0;
  }
  if (clear_cb_ != NULL) {
    delete clear_cb_;
    clear_cb_ = NULL;
  }
  if (compare_cb_ != NULL) {
    delete compare_cb_;
    compare_cb_ = NULL;
  }
}
// Calls delete on every non-null stored element. The elements themselves are
// left in the array unchanged and the used size is not modified.
template <typename T>
void GenericVector<T>::delete_data_pointers() {
  for (int i = 0; i < size_used_; ++i) {
    if (!data_[i]) {
      continue;
    }
    delete data_[i];
  }
}
// Writes the reserved size and the used size as raw ints, then hands every
// stored element to cb for serialization. cb is deleted before returning.
// The results of fwrite are not checked.
template <typename T>
void GenericVector<T>::write(FILE* f, Callback2<FILE*, T const &>* cb) {
  fwrite(&size_reserved_, sizeof(int), 1, f);
  fwrite(&size_used_, sizeof(int), 1, f);
  int i = 0;
  while (i < size_used_) {
    cb->Run(f, data_[i]);
    ++i;
  }
  delete cb;
}
// Reads a vector in the layout produced by write(): the reserved size, the
// used size, then one element per cb->Run() call. When swap is true both
// header values are byte-reversed with reverse32 and swap is forwarded to cb.
// cb is deleted before returning, whether or not the read succeeded.
//
// The header is validated before it is used:
//  - if either header value cannot be read, the vector is left untouched
//    (previously uninitialized values were used);
//  - a negative element count is treated as zero;
//  - the storage is grown to hold the element count even when the stored
//    reserved size is smaller, so cb never writes past the end of data_.
template <typename T>
void GenericVector<T>::read(FILE* f, Callback3<FILE*, T*, bool>* cb, bool swap) {
  uinT32 reserved = 0;
  int used = 0;
  const bool header_ok = fread(&reserved, sizeof(reserved), 1, f) == 1 &&
                         fread(&used, sizeof(used), 1, f) == 1;
  if (header_ok) {
    if (swap) {
      reserved = reverse32(reserved);
      used = reverse32(used);
    }
    if (used < 0)
      used = 0;
    reserve(reserved);
    reserve(used);  // no-op when the capacity already covers the count
    size_used_ = used;
    for (int i = 0; i < size_used_; ++i) {
      cb->Run(f, data_ + i, swap);
    }
  }
  delete cb;
}
// Clears the current object, takes over the storage, sizes and callbacks of
// from (a shallow copy; no element is copied), and finally invalidates from
// by resetting it to an empty vector without callbacks.
template <typename T>
void GenericVector<T>::move(GenericVector<T>* from) {
  clear();

  // Take over everything from owns.
  data_ = from->data_;
  size_used_ = from->size_used_;
  size_reserved_ = from->size_reserved_;
  clear_cb_ = from->clear_cb_;
  compare_cb_ = from->compare_cb_;

  // Leave from empty so it no longer refers to the transferred storage.
  from->data_ = NULL;
  from->size_used_ = 0;
  from->size_reserved_ = 0;
  from->clear_cb_ = NULL;
  from->compare_cb_ = NULL;
}
#endif // TESSERACT_CCUTIL_GENERICVECTOR_H_

41
ccutil/helpers.h Normal file
View File

@ -0,0 +1,41 @@
/* -*-C-*-
********************************************************************************
*
* File: helpers.h
* Description: General utility functions
* Author: Daria Antonova
* Created: Wed Apr 8 14:37:00 2009
* Language: C
* Package: N/A
* Status: Reusable Software Component
*
* (c) Copyright 2009, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
********************************************************************************/
#ifndef TESSERACT_CCUTIL_HELPERS_H_
#define TESSERACT_CCUTIL_HELPERS_H_
// Remove newline (if any) at the end of the string, in place.
// At most one trailing '\n' is removed. A NULL pointer or an empty string is
// left alone; the empty case previously read the byte before the buffer
// because strlen(string) - 1 was used as an index without checking.
inline void chomp_string(char *string) {
  if (string == NULL) return;
  size_t length = strlen(string);
  if (length > 0 && string[length - 1] == '\n') {
    string[length - 1] = '\0';
  }
}
// Advance the current pointer of the file if it points to a newline character.
// Any other character is put back by seeking one byte backwards. At end of
// file nothing was consumed, so the position is left alone; seeking back
// there would un-read the last character of the file and clear the EOF
// indicator.
inline void SkipNewline(FILE *file) {
  int next = fgetc(file);
  if (next != EOF && next != '\n') fseek(file, -1, SEEK_CUR);
}
#endif

View File

@ -22,23 +22,22 @@
#ifdef __UNIX__
#include <unistd.h>
#include <signal.h>
#else
#include <io.h>
#endif
#include <stdlib.h>
#include "basedir.h"
#include "mainblk.h"
#include "ccutil.h"
#define VARDIR "configs/" /*variables files */
#define EXTERN
/*
EXTERN DLLSYM STRING datadir; //dir for data files
//name of image
EXTERN DLLSYM STRING imagebasename;
EXTERN BOOL_VAR (m_print_variables, FALSE,
"Print initial values of all variables");
EXTERN STRING_VAR (m_data_sub_dir, "tessdata/", "Directory for data files");
/*
EXTERN INT_VAR (memgrab_size, 0, "Preallocation size for batch use");*/
@ -46,24 +45,17 @@ const ERRCODE NO_PATH =
"Warning:explicit path for executable will not be used for configs";
static const ERRCODE USAGE = "Usage";
namespace tesseract {
/**********************************************************************
* main_setup
*
* Main for mithras demo program. Read the arguments and set up globals.
**********************************************************************/
void main_setup( /*main demo program */
void CCUtil::main_setup( /*main demo program */
const char *argv0, //program name
const char *basename, //name of image
int argc, /*argument count */
const char *const *argv /*arguments */
const char *basename //name of image
) {
inT32 arg; /*argument */
inT32 offset; //for flag
FILE *fp; /*variables file */
char flag[2]; //+/-
STRING varfile; /*name of file */
imagebasename = basename; /*name of image */
// TESSDATA_PREFIX Environment variable overrules everything.
@ -93,34 +85,6 @@ void main_setup( /*main demo program */
datadir = getenv("TESSDATA_PREFIX");
}
for (arg = 0; arg < argc; arg++) {
if (argv[arg][0] == '+' || argv[arg][0] == '-') {
offset = 1;
flag[0] = argv[arg][0];
}
else {
offset = 0;
}
flag[offset] = '\0';
varfile = flag;
/*attempt open */
fp = fopen (argv[arg] + offset, "r");
if (fp != NULL) {
fclose(fp); /*was only to test */
}
else {
varfile += datadir;
varfile += m_data_sub_dir; /*data directory */
varfile += VARDIR; /*variables dir */
}
/*actual name */
varfile += argv[arg] + offset;
read_variables_file (varfile.string ());
}
if (m_print_variables)
print_variables(stdout); /*print them all */
datadir += m_data_sub_dir; /*data directory */
}
} // namespace tesseract

View File

@ -26,14 +26,15 @@
extern DLLSYM STRING datadir; //dir for data files
//name of image
extern DLLSYM STRING imagebasename;
extern BOOL_VAR_H (m_print_variables, FALSE,
"Print initial values of all variables");
extern STRING_VAR_H (m_data_sub_dir, "data/", "Directory for data files");
extern INT_VAR_H (memgrab_size, 13000000, "Preallocation size for batch use");
void main_setup( /*main demo program */
const char *argv0, //program name
const char *basename, //name of image
int argc, /*argument count */
const char *const *argv /*arguments */
);
extern BOOL_VAR_H(m_print_variables, FALSE,
"Print initial values of all variables");
extern STRING_VAR_H(m_data_sub_dir, "data/", "Directory for data files");
extern INT_VAR_H(memgrab_size, 13000000, "Preallocation size for batch use");
// > ccutil.h
//void main_setup( /*main demo program */
// const char *argv0, //program name
// const char *basename, //name of image
// int argc, /*argument count */
// const char *const *argv /*arguments */
// );
#endif

View File

@ -3,6 +3,14 @@
#ifdef __MSW32__
#define SIGNED
#define snprintf _snprintf
#define read _read
#define write _write
#define close _close
#define lseek _lseek
#define open _open
#define ultoa _ultoa
#define ltoa _ltoa
#define strtok_r(s, d, p) strtok(s, d)
#if (_MSC_VER <= 1400)
#define vsnprintf _vsnprintf
#endif

80
ccutil/qrsequence.h Normal file
View File

@ -0,0 +1,80 @@
///////////////////////////////////////////////////////////////////////
// File: qrsequence.h
// Description: Quasi-random sequence generator class.
// Author: Ranjith Unnikrishnan
// Created: Wed May 20 2009
//
// Class to generate a (deterministic) quasi-random Van der Corput sequence that
// covers the interval [0,N) without repetition.
//
// The sequence is generated by reversing the base-2 representation of the
// sequence of natural numbers {0, 1,... M-1}, where M is 2^{num_bits_} and
// num_bits_ is the minimum number of bits required to represent every value
// in [0, N). If a reversed number is >= N it is rejected and the next natural
// number is considered until a valid output number is found.
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
// of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
// by applicable law or agreed to in writing, software distributed under the
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, either express or implied. See the License for the specific
// language governing permissions and limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCUTIL_QRSEQUENCE_H_
#define TESSERACT_CCUTIL_QRSEQUENCE_H_
#include <math.h>
// Generates a deterministic sequence that visits every integer in [0, N)
// exactly once, by bit-reversing the counters 0, 1, ..., 2^num_bits_ - 1 and
// skipping reversed values that fall outside the range.
class QRSequenceGenerator {
 public:
  // Object is initialized with the size of the output range.
  // num_bits_ is the smallest bit count b with 2^b >= N, computed with
  // integer arithmetic: the floating-point form ceil(log(N) / log(2)) can
  // round an exact power of two up to the next bit count. The count is capped
  // at 30 bits so that 1 << num_bits_ always fits in an int.
  explicit QRSequenceGenerator(int N) : N_(N), next_num_(0), num_bits_(0) {
    const int kMaxBits = 30;
    while (num_bits_ < kMaxBits && (1 << num_bits_) < N) {
      ++num_bits_;
    }
  }

  // Main worker method that retrieves the next number in the sequence.
  // Returns kInvalidVal (-1) once every value in [0, N) has been handed out.
  // A value is returned only after it passed the range check; previously the
  // last rejected candidate (>= N) could be returned when the counters ran
  // out, e.g. the fourth call for N == 3 returned 3.
  int GetVal() {
    const int kInvalidVal = -1;
    const int kMaxNaturalNumberValue = 1 << num_bits_;
    while (next_num_ < kMaxNaturalNumberValue) {
      const int n = GetBinaryReversedInteger(next_num_++);
      if (n < N_) return n;
    }
    return kInvalidVal;
  }

 protected:
  // Outputs the integer formed by reversing the bits of the input integer. Only
  // the lowest num_bits_ bits of the input integer are reversed.
  int GetBinaryReversedInteger(int in_val) const {
    int out_val = 0;
    for (int bit = 0; bit < num_bits_; ++bit) {
      // Append the current lowest input bit below the bits gathered so far.
      out_val = (out_val << 1) | (in_val & 0x1);
      // Right-shift input value to prepare for retrieving the next bit.
      in_val >>= 1;
    }
    return out_val;
  }

  // Size of the output range [0, N_).
  int N_;
  // Next number to be considered for reversal and output.
  int next_num_;
  // number of bits required to represent the numbers of the sequence
  int num_bits_;
};
#endif // TESSERACT_CCUTIL_QRSEQUENCE_H_

View File

@ -31,6 +31,7 @@
#include <fcntl.h>
#include "scanutils.h"
#include "tprintf.h"
enum Flags {
FL_SPLAT = 0x01, // Drop the value, do not assign
@ -45,6 +46,7 @@ enum Ranks {
RANK_INT = 0,
RANK_LONG = 1,
RANK_LONGLONG = 2,
RANK_PTR = INT_MAX // Special value used for pointers
RANK_PTR = 3 // Special value used for pointers
};
@ -183,7 +185,7 @@ double strtofloat(const char* s)
{
int minus = 0;
int v = 0;
int d, c;
int d;
int k = 1;
int w = 0;
@ -243,7 +245,7 @@ int vfscanf(FILE* stream, const char *format, va_list ap)
ST_MATCH, // Main state of %[ sequence
ST_MATCH_RANGE, // After - in a %[ sequence
} state = ST_NORMAL;
char *oarg, *sarg = NULL; // %s %c or %[ string argument
char *sarg = NULL; // %s %c or %[ string argument
enum Bail bail = BAIL_NONE;
int sign;
int converted = 0; // Successful conversions

View File

@ -25,6 +25,7 @@
#include <stddef.h>
#include <stdio.h>
#include <klibc/extern.h>
#include <sys/stat.h>
// Attempts to parse the given file stream s as an integer of the base
// 'base'. Returns the first successfully parsed integer as a uintmax_t, or

203
ccutil/tessdatamanager.cpp Normal file
View File

@ -0,0 +1,203 @@
///////////////////////////////////////////////////////////////////////
// File: tessdatamanager.cpp
// Description: Functions to handle loading/combining tesseract data files.
// Author: Daria Antonova
// Created: Wed Jun 03 11:26:43 PST 2009
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include "tessdatamanager.h"
#include <stdio.h>
#include "serialis.h"
#include "strngs.h"
#include "tprintf.h"
#include "varable.h"
// Variables defined through the BOOL_VAR/INT_VAR macros; each invocation gives
// the variable name, its default value and a description string. The matching
// extern declarations are in tessdatamanager.h.
BOOL_VAR(global_load_system_dawg, true, "Load system word dawg.");
BOOL_VAR(global_load_freq_dawg, true, "Load frequent word dawg.");
BOOL_VAR(global_load_punc_dawg, true, "Load dawg with punctuation patterns.");
BOOL_VAR(global_load_number_dawg, true, "Load dawg with number patterns.");
// A non-zero value makes TessdataManager print its offsets via tprintf.
INT_VAR(global_tessdata_manager_debug_level, 0,
"Debug level for TessdataManager functions.");
namespace tesseract {
void TessdataManager::Init(const char *data_file_name) {
int i;
data_file_ = fopen(data_file_name, "rb");
if (data_file_ == NULL) {
tprintf("Error openning data file %s\n", data_file_name);
exit(1);
}
fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_);
bool swap = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
if (swap) {
actual_tessdata_num_entries_ = reverse32(actual_tessdata_num_entries_);
}
ASSERT_HOST(actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES);
fread(offset_table_, sizeof(inT64),
actual_tessdata_num_entries_, data_file_);
if (swap) {
for (i = 0 ; i < actual_tessdata_num_entries_; ++i) {
offset_table_[i] = reverse64(offset_table_[i]);
}
}
if (global_tessdata_manager_debug_level) {
tprintf("TessdataManager loaded %d types of tesseract data files.\n",
actual_tessdata_num_entries_);
for (i = 0; i < actual_tessdata_num_entries_; ++i) {
tprintf("Offset for type %d is %lld\n", i, offset_table_[i]);
}
}
}
FILE *TessdataManager::GetFilePtr(const char *language_data_path_prefix,
const char *file_suffix, bool required_file,
bool text_file) {
STRING file_name = language_data_path_prefix;
file_name += file_suffix;
FILE *file_ptr = fopen(file_name.string(), text_file ? "r" : "rb");
if (required_file && (file_ptr == NULL)) {
tprintf("Error openning required file %s\n", file_name.string());
exit(1);
}
return file_ptr;
}
// Copies everything that can be read from input_file (starting at its current
// position) to output_file, in chunks of up to kBufferSize bytes. A short
// write trips ASSERT_HOST instead of silently producing a truncated output.
// If newline_end is true, asserts that the last byte copied was '\n' (this
// also fails when nothing was copied, since last_char then stays 0).
void TessdataManager::CopyFile(FILE *input_file, FILE *output_file,
                               bool newline_end) {
  const int kBufferSize = 1024;
  char chunk[kBufferSize];
  char last_char = 0x0;
  size_t bytes_read;
  while ((bytes_read = fread(chunk, sizeof(char),
                             kBufferSize, input_file)) > 0) {
    size_t bytes_written = fwrite(chunk, sizeof(char), bytes_read,
                                  output_file);
    ASSERT_HOST(bytes_written == bytes_read);
    last_char = chunk[bytes_read - 1];
  }
  if (newline_end) {
    ASSERT_HOST(last_char == '\n');
  }
}
void TessdataManager::CombineDataFiles(
const char *language_data_path_prefix,
const char *output_filename) {
FILE *file_ptr;
STRING file_name;
int i;
inT64 offset_table[TESSDATA_NUM_ENTRIES];
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1;
FILE *output_file = fopen(output_filename, "wb");
// Leave some space for recording the offset_table.
fseek(output_file,
sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);
// Record language-specific tesseract config file.
file_ptr = GetFilePtr(language_data_path_prefix,
kLangConfigFileSuffix, false, true);
if (file_ptr != NULL) {
offset_table[TESSDATA_LANG_CONFIG] = ftell(output_file);
CopyFile(file_ptr, output_file, true);
fclose(file_ptr);
}
// Record unicharset.
file_ptr = GetFilePtr(language_data_path_prefix,
kUnicharsetFileSuffix, true, true);
offset_table[TESSDATA_UNICHARSET] = ftell(output_file);
CopyFile(file_ptr, output_file, true);
fclose(file_ptr);
// Record ambiguities.
file_ptr = GetFilePtr(language_data_path_prefix,
kAmbigsFileSuffix, false, true);
if (file_ptr != NULL) {
offset_table[TESSDATA_AMBIGS] = ftell(output_file);
CopyFile(file_ptr, output_file, true);
fclose(file_ptr);
}
// Record inttemp.
file_ptr =
GetFilePtr(language_data_path_prefix,
kBuiltInTemplatesFileSuffix, false, false);
if (file_ptr != NULL) {
offset_table[TESSDATA_INTTEMP] = ftell(output_file);
CopyFile(file_ptr, output_file, false);
fclose(file_ptr);
// Record pffmtable.
file_ptr = GetFilePtr(language_data_path_prefix,
kBuiltInCutoffsFileSuffix, true, true);
offset_table[TESSDATA_PFFMTABLE] = ftell(output_file);
CopyFile(file_ptr, output_file, true);
fclose(file_ptr);
// Record normproto.
file_ptr = GetFilePtr(language_data_path_prefix,
kNormProtoFileSuffix, true, true);
offset_table[TESSDATA_NORMPROTO] = ftell(output_file);
CopyFile(file_ptr, output_file, true);
fclose(file_ptr);
}
// Record dawgs.
file_ptr = GetFilePtr(language_data_path_prefix,
kPuncDawgFileSuffix, false, false);
if (file_ptr != NULL) {
offset_table[TESSDATA_PUNC_DAWG] = ftell(output_file);
CopyFile(file_ptr, output_file, false);
fclose(file_ptr);
}
file_ptr = GetFilePtr(language_data_path_prefix,
kSystemDawgFileSuffix, false, false);
if (file_ptr != NULL) {
offset_table[TESSDATA_SYSTEM_DAWG] = ftell(output_file);
CopyFile(file_ptr, output_file, false);
fclose(file_ptr);
}
file_ptr = GetFilePtr(language_data_path_prefix,
kNumberDawgFileSuffix, false, false);
if (file_ptr != NULL) {
offset_table[TESSDATA_NUMBER_DAWG] = ftell(output_file);
CopyFile(file_ptr, output_file, false);
fclose(file_ptr);
}
file_ptr = GetFilePtr(language_data_path_prefix,
kFreqDawgFileSuffix, false, false);
if (file_ptr != NULL) {
offset_table[TESSDATA_FREQ_DAWG] = ftell(output_file);
CopyFile(file_ptr, output_file, false);
fclose(file_ptr);
}
fseek(output_file, 0, SEEK_SET);
inT32 num_entries = TESSDATA_NUM_ENTRIES;
fwrite(&num_entries, sizeof(inT32), 1, output_file);
fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES, output_file);
fclose(output_file);
tprintf("TessdataManager combined tesseract data files.\n");
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
tprintf("Offset for type %d is %lld\n", i, offset_table[i]);
}
}
} // namespace tesseract

165
ccutil/tessdatamanager.h Normal file
View File

@ -0,0 +1,165 @@
///////////////////////////////////////////////////////////////////////
// File: tessdatamanager.h
// Description: Functions to handle loading/combining tesseract data files.
// Author: Daria Antonova
// Created: Wed Jun 03 11:26:43 PST 2009
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_
#define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
#include <stdio.h>
#include "host.h"
#include "tprintf.h"
#include "varable.h"
// Declarations of the variables defined in tessdatamanager.cpp. Each macro
// invocation repeats the variable name, default value and description.
extern BOOL_VAR_H(global_load_punc_dawg, true,
"Load dawg with punctuation patterns.");
extern BOOL_VAR_H(global_load_system_dawg, true, "Load system word dawg.");
extern BOOL_VAR_H(global_load_number_dawg, true,
"Load dawg with number patterns.");
extern BOOL_VAR_H(global_load_freq_dawg, true, "Load frequent word dawg.");
extern INT_VAR_H(global_tessdata_manager_debug_level, 0,
"Debug level for TessdataManager functions.");
// File name suffixes. TessdataManager::CombineDataFiles() appends each of the
// component suffixes below to its language_data_path_prefix argument to find
// the corresponding input file.
// NOTE(review): kTrainedDataSuffix is presumably the suffix of the combined
// output file; its use is not shown alongside these definitions - confirm.
static const char kTrainedDataSuffix[] = "traineddata";
static const char kLangConfigFileSuffix[] = "config";
static const char kUnicharsetFileSuffix[] = "unicharset";
static const char kAmbigsFileSuffix[] = "unicharambigs";
static const char kBuiltInTemplatesFileSuffix[] = "inttemp";
static const char kBuiltInCutoffsFileSuffix[] = "pffmtable";
static const char kNormProtoFileSuffix[] = "normproto";
static const char kPuncDawgFileSuffix[] = "punc-dawg";
static const char kSystemDawgFileSuffix[] = "word-dawg";
static const char kNumberDawgFileSuffix[] = "number-dawg";
static const char kFreqDawgFileSuffix[] = "freq-dawg";
namespace tesseract {
// Identifies each kind of data stored in a combined tessdata file. The value
// of an enumerator is the index of that component's entry in the offset table
// (TessdataManager::offset_table_). TESSDATA_NUM_ENTRIES is not a data type;
// it is the number of types and the size of the offset table.
enum TessdataType {
TESSDATA_LANG_CONFIG, // 0
TESSDATA_UNICHARSET, // 1
TESSDATA_AMBIGS, // 2
TESSDATA_INTTEMP, // 3
TESSDATA_PFFMTABLE, // 4
TESSDATA_NORMPROTO, // 5
TESSDATA_PUNC_DAWG, // 6
TESSDATA_SYSTEM_DAWG, // 7
TESSDATA_NUMBER_DAWG, // 8
TESSDATA_FREQ_DAWG, // 9
TESSDATA_NUM_ENTRIES
};
// TessdataType could be updated to contain more entries, however
// we do not expect that number to be astronomically high.
// In order to automatically detect endianness, TessdataManager::Init()
// passes the entry count read from the file through reverse32() (and each
// offset through reverse64()) if the count is larger than
// kMaxNumTessdataEntries.
// NOTE(review): reverse32()/reverse64() are presumably byte-order swaps rather
// than bit flips - confirm against their definitions.
static const int kMaxNumTessdataEntries = 1000;
class TessdataManager {
public:
TessdataManager() {
data_file_ = NULL;
actual_tessdata_num_entries_ = 0;
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
offset_table_[i] = -1;
}
}
~TessdataManager() {}
// Opens the given data file and reads the offset table.
void Init(const char *data_file_name);
// Returns data file pointer.
inline FILE *GetDataFilePtr() const { return data_file_; }
// Returns false if there is no data of the given type.
// Otherwise does a seek on the data_file_ to position the pointer
// at the start of the data of the given type.
inline bool SeekToStart(TessdataType tessdata_type) {
if (global_tessdata_manager_debug_level) {
tprintf("TessdataManager: seek to offset %lld (start of tessdata"
"type %d)\n", offset_table_[tessdata_type], tessdata_type);
}
if (offset_table_[tessdata_type] < 0) {
return false;
} else {
ASSERT_HOST(fseek(data_file_,
offset_table_[tessdata_type], SEEK_SET) == 0);
return true;
}
}
// Returns the end offset for the given tesseract data file type.
inline inT64 GetEndOffset(TessdataType tessdata_type) const {
int index = tessdata_type + 1;
while (index < actual_tessdata_num_entries_ && offset_table_[index] == -1) {
++index; // skip tessdata types not present in the combined file
}
if (global_tessdata_manager_debug_level) {
tprintf("TessdataManager: end offset for type %d is %lld\n",
tessdata_type,
(index == actual_tessdata_num_entries_) ? -1
: offset_table_[index]);
}
return (index == actual_tessdata_num_entries_) ? -1 : offset_table_[index] - 1;
}
// Closes data_file_ (if it was opened by Init()).
inline void End() {
if (data_file_ != NULL) {
fclose(data_file_);
data_file_ = NULL;
}
}
// Reads all the standard tesseract config and data files for a language
// at the given path and bundles them up into one binary data file.
static void CombineDataFiles(const char *language_data_path_prefix,
const char *output_filename);
private:
// Opens the file whose name is a concatentation of language_data_path_prefix
// and file_suffix. Terminates the program if required_file is set to true,
// but the file could not be found or opened for reading.
// Returns a file pointer to the opened file.
static FILE *GetFilePtr(const char *language_data_path_prefix,
const char *file_suffix, bool required_file,
bool text_file);
// Copies all the bytes in the given input file to the output_file provided.
static void CopyFile(FILE *input_file, FILE *output_file, bool newline_end);
// Each offset_table_[i] contains a file offset in the combined data file
// where the data of TessdataFileType i is stored.
inT64 offset_table_[TESSDATA_NUM_ENTRIES];
// Actual number of entries in the tessdata table. This value can only be
// same or smaller than TESSDATA_NUM_ENTRIES, but can never be larger,
// since then it would be impossible to interpret the type of tessdata at
// indices same and higher than TESSDATA_NUM_ENTRIES.
// This parameter is used to allow for backward compatiblity
// when new tessdata types are introduced.
inT32 actual_tessdata_num_entries_;
FILE *data_file_; // pointer to the data file.
};
} // namespace tesseract
#endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_

66
ccutil/tordvars.cpp Normal file
View File

@ -0,0 +1,66 @@
/* -*-C-*-
********************************************************************************
*
* File: tordvars.cpp
* Description: Text Ordering Control Variables
* Author: Mark Seaman, OCR Technology
* Created: Wed Jan 17 12:47:29 1990
* Modified: Tue Jul 30 16:22:40 1991 (Mark Seaman) marks@hpgrlt
* Language: C
* Package: N/A
* Status: Experimental (Do Not Distribute)
*
* (c) Copyright 1990, Hewlett-Packard Company.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
*********************************************************************************/
/*----------------------------------------------------------------------
I n c l u d e s
----------------------------------------------------------------------*/
#include "mfcpch.h"
#include <stdio.h>
#include "varable.h"
/*----------------------------------------------------------------------
V a r i a b l e s
----------------------------------------------------------------------*/
// File-scope stream pointers. They are defined here without an initializer and
// nothing in this file opens or closes them.
FILE *rawfile; /* Text before dictionary */
FILE *textfile; /* Text output file */
FILE *matcher_fp; //matcher log
FILE *correct_fp; //correct text
// Control variables. Each macro invocation gives the variable name, its
// default value and a description string; the matching extern declarations
// are in tordvars.h.
BOOL_VAR(tord_write_output, 0, "Text file output");
BOOL_VAR(tord_write_raw_output, 0, "Text before context");
BOOL_VAR(tord_similarity_enable, 0, "Switch for Similarity");
double_VAR(tord_certainty_threshold, -2.25, "Certainty Value");
INT_VAR(tord_num_word_choices, 30, "Number of choices");
BOOL_VAR(tord_blob_skip, 0, "Skip to Next selection");
double_VAR(tord_overlap_threshold, 0.33, "Overlap Threshold");
BOOL_VAR(tord_debug_3, 0, "Textord Debug #3");
BOOL_VAR(tord_debug_5, 0, "Textord Debug #5");
BOOL_VAR(tord_debug_8, 0, "Textord Debug #8");
INT_VAR(tord_display_ratings, 0, "Ratings display");
BOOL_VAR(tord_display_text, 0, "Display Text");
BOOL_VAR(tord_show_bold, 1, "Show Bold Text");

66
ccutil/tordvars.h Normal file
View File

@ -0,0 +1,66 @@
/* -*-C-*-
********************************************************************************
*
* File: tordvars.h
* Description: Text Ordering Control Variables
* Author: Mark Seaman, OCR Technology
* Created: Wed Oct 25 16:33:01 1989
* Modified: Mon Jul 1 14:28:23 1991 (Mark Seaman) marks@hpgrlt
* Language: C
* Package: N/A
* Status: Experimental (Do Not Distribute)
*
* (c) Copyright 1989, Hewlett-Packard Company.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
*********************************************************************************/
#ifndef TOVARS_H
#define TOVARS_H
#include <stdio.h>
#include "varable.h"
/*----------------------------------------------------------------------
V a r i a b l e s
----------------------------------------------------------------------*/
// Stream pointers defined in tordvars.cpp.
extern FILE *rawfile; /* Text before dictionary */
extern FILE *textfile; /* Text output file */
extern FILE *correct_fp; //correct text
extern FILE *matcher_fp; //matcher log
// Control variables defined in tordvars.cpp. Each declaration repeats the
// name, default value and description used by the definition there.
extern BOOL_VAR_H(tord_write_output, 0, "Text file output");
extern BOOL_VAR_H(tord_write_raw_output, 0, "Text before context");
extern BOOL_VAR_H(tord_similarity_enable, 0, "Switch for Similarity");
extern double_VAR_H(tord_certainty_threshold, -2.25, "Certainty Value");
extern INT_VAR_H(tord_num_word_choices, 30, "Number of choices");
extern BOOL_VAR_H(tord_blob_skip, 0, "Skip to Next selection");
extern double_VAR_H(tord_overlap_threshold, 0.33, "Overlap Threshold");
extern BOOL_VAR_H(tord_debug_3, 0, "Textord Debug #3");
extern BOOL_VAR_H(tord_debug_5, 0, "Textord Debug #5");
extern BOOL_VAR_H(tord_debug_8, 0, "Textord Debug #8");
extern INT_VAR_H(tord_display_ratings, 0, "Ratings display");
extern BOOL_VAR_H(tord_display_text, 0, "Display Text");
extern BOOL_VAR_H(tord_show_bold, 1, "Show Bold Text");
#endif

View File

@ -24,6 +24,7 @@
#include "debugwin.h"
//#include "ipeerr.h"
#include "tprintf.h"
#include "ccutil.h"
#define MAX_MSG_LEN 1024
@ -36,6 +37,7 @@ DLLSYM void
tprintf ( //Trace printf
const char *format, ... //special message
) {
tesseract::tprintfMutex.Lock();
va_list args; //variable args
static FILE *debugfp = NULL; //debug file
//debug window
@ -76,6 +78,7 @@ const char *format, ... //special message
fprintf (stderr, "%s", msg);
}
}
tesseract::tprintfMutex.Unlock();
}

View File

@ -21,6 +21,7 @@
#define TESSERACT_CCUTIL_UNICHAR_H__
#include <memory.h>
#include <string.h>
// Maximum number of characters that can be stored in a UNICHAR. Must be
// at least 4. Must not exceed 31 without changing the coding of length.

View File

@ -22,13 +22,16 @@
#include <stdio.h>
#include <string.h>
#include "tprintf.h"
#include "unichar.h"
#include "unicharset.h"
#include "varable.h"
static const int ISALPHA_MASK = 0x1;
static const int ISLOWER_MASK = 0x2;
static const int ISUPPER_MASK = 0x4;
static const int ISDIGIT_MASK = 0x8;
static const int ISPUNCTUATION_MASK = 0x10;
UNICHARSET::UNICHARSET() :
unichars(NULL),
@ -38,15 +41,20 @@ UNICHARSET::UNICHARSET() :
script_table(0),
script_table_size_used(0),
script_table_size_reserved(0),
null_script("NULL")
{
}
null_script("NULL"),
null_sid_(0),
common_sid_(0),
latin_sid_(0),
cyrillic_sid_(0),
greek_sid_(0),
han_sid_(0) {}
UNICHARSET::~UNICHARSET() {
if (size_reserved > 0) {
for (int i = 0; i < script_table_size_used; ++i)
delete[] script_table[i];
delete[] script_table;
delete_pointers_in_unichars();
delete[] unichars;
}
}
@ -56,8 +64,10 @@ void UNICHARSET::reserve(int unichars_number) {
UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
for (int i = 0; i < size_used; ++i)
memcpy(&unichars_new[i], &unichars[i], sizeof(UNICHAR_SLOT));
for (int j = size_used; j < unichars_number; ++j)
unichars_new[j].properties.script = add_script(null_script);
for (int j = size_used; j < unichars_number; ++j) {
unichars_new[j].properties.script_id = add_script(null_script);
unichars_new[j].properties.fragment = NULL;
}
delete[] unichars;
unichars = unichars_new;
size_reserved = unichars_number;
@ -66,15 +76,15 @@ void UNICHARSET::reserve(int unichars_number) {
const UNICHAR_ID
UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
assert(ids.contains(unichar_repr));
return ids.unichar_to_id(unichar_repr);
return ids.contains(unichar_repr) ?
ids.unichar_to_id(unichar_repr) : INVALID_UNICHAR_ID;
}
const UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
int length) const {
assert(length > 0 && length <= UNICHAR_LEN);
assert(ids.contains(unichar_repr, length));
return ids.unichar_to_id(unichar_repr, length);
return ids.contains(unichar_repr, length) ?
ids.unichar_to_id(unichar_repr, length) : INVALID_UNICHAR_ID;
}
// Return the minimum number of bytes that matches a legal UNICHAR_ID,
@ -102,14 +112,16 @@ int UNICHARSET::step(const char* str) const {
}
const char* const UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
if (id == INVALID_UNICHAR_ID) {
return INVALID_UNICHAR;
}
assert(id < this->size());
return unichars[id].representation;
}
// Return a STRING containing debug information on the unichar, including
// the id_to_unichar, its hex unicodes and the properties.
STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
const char* str = id_to_unichar(id);
// Return a STRING that reformats the utf8 str into the str followed
// by its hex unicodes.
STRING UNICHARSET::debug_utf8_str(const char* str) {
STRING result = str;
result += " [";
int step = 1;
@ -128,6 +140,21 @@ STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
result += " ";
}
result += "]";
return result;
}
// Return a STRING containing debug information on the unichar, including
// the id_to_unichar, its hex unicodes and the properties.
STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
const CHAR_FRAGMENT *fragment = this->get_fragment(id);
if (fragment) {
STRING base = debug_str(fragment->get_unichar());
return CHAR_FRAGMENT::to_string(base.string(), fragment->get_pos(),
fragment->get_total());
}
const char* str = id_to_unichar(id);
if (id == INVALID_UNICHAR_ID) return STRING(str);
STRING result = debug_utf8_str(str);
// Append a for lower alpha, A for upper alpha, and x if alpha but neither.
if (get_isalpha(id)) {
if (get_islower(id))
@ -141,11 +168,22 @@ STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
if (get_isdigit(id)) {
result += "0";
}
// Append p is a punctuation symbol.
if (get_ispunctuation(id)) {
result += "p";
}
return result;
}
void UNICHARSET::unichar_insert(const char* const unichar_repr) {
if (!ids.contains(unichar_repr)) {
if (strlen(unichar_repr) > UNICHAR_LEN) {
fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
int(strlen(unichar_repr)), unichar_repr);
return;
}
if (size_used == size_reserved) {
if (size_used == 0)
reserve(8);
@ -158,31 +196,43 @@ void UNICHARSET::unichar_insert(const char* const unichar_repr) {
this->set_islower(size_used, false);
this->set_isupper(size_used, false);
this->set_isdigit(size_used, false);
this->set_script(size_used, add_script(null_script));
this->set_ispunctuation(size_used, false);
this->set_isngram(size_used, false);
this->set_script(size_used, null_script);
// If the given unichar_repr represents a fragmented character, set
// fragment property to a pointer to CHAR_FRAGMENT class instance with
// information parsed from the unichar representation. Use the script
// of the base unichar for the fragmented character if possible.
CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr);
this->unichars[size_used].properties.fragment = frag;
if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
this->unichars[size_used].properties.script_id =
this->get_script(frag->get_unichar());
}
this->unichars[size_used].properties.enabled = true;
ids.insert(unichar_repr, size_used);
++size_used;
}
}
bool UNICHARSET::contains_unichar(const char* const unichar_repr) {
bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
return ids.contains(unichar_repr);
}
bool UNICHARSET::contains_unichar(const char* const unichar_repr, int length) {
bool UNICHARSET::contains_unichar(const char* const unichar_repr,
int length) const {
if (length == 0) {
return false;
}
return ids.contains(unichar_repr, length);
}
bool UNICHARSET::eq(UNICHAR_ID unichar_id, const char* const unichar_repr) {
bool UNICHARSET::eq(UNICHAR_ID unichar_id,
const char* const unichar_repr) const {
return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
}
bool UNICHARSET::save_to_file(const char* filename) const {
FILE* file = fopen(filename, "w+");
if (file == NULL)
return false;
bool UNICHARSET::save_to_file(FILE *file) const {
fprintf(file, "%d\n", this->size());
for (UNICHAR_ID id = 0; id < this->size(); ++id) {
unsigned int properties = 0;
@ -195,29 +245,28 @@ bool UNICHARSET::save_to_file(const char* filename) const {
properties |= ISUPPER_MASK;
if (this->get_isdigit(id))
properties |= ISDIGIT_MASK;
if (this->get_ispunctuation(id))
properties |= ISPUNCTUATION_MASK;
if (strcmp(this->id_to_unichar(id), " ") == 0)
fprintf(file, "%s %x %s\n", "NULL", properties, this->get_script(id));
fprintf(file, "%s %x %s %d\n", "NULL", properties,
this->get_script_from_script_id(this->get_script(id)),
this->get_other_case(id));
else
fprintf(file, "%s %x %s\n", this->id_to_unichar(id), properties,
this->get_script(id));
fprintf(file, "%s %x %s %d\n", this->id_to_unichar(id), properties,
this->get_script_from_script_id(this->get_script(id)),
this->get_other_case(id));
}
fclose(file);
return true;
}
bool UNICHARSET::load_from_file(const char* filename) {
FILE* file = fopen(filename, "r");
bool UNICHARSET::load_from_file(FILE *file) {
int unicharset_size;
char buffer[256];
if (file == NULL)
return false;
this->clear();
if (fgets(buffer, sizeof (buffer), file) == NULL ||
sscanf(buffer, "%d", &unicharset_size) != 1) {
fclose(file);
return false;
}
this->reserve(unicharset_size);
@ -226,11 +275,13 @@ bool UNICHARSET::load_from_file(const char* filename) {
unsigned int properties;
char script[64];
strcpy(script, null_script);
this->unichars[id].properties.other_case = id;
if (fgets(buffer, sizeof (buffer), file) == NULL ||
(sscanf(buffer, "%s %x %63s", unichar, &properties, script) != 3 &&
!(sscanf(buffer, "%s %x", unichar, &properties) == 2 &&
strcpy(script, null_script)))) {
fclose(file);
(sscanf(buffer, "%s %x %63s %d", unichar, &properties,
script, &(this->unichars[id].properties.other_case)) != 4 &&
sscanf(buffer, "%s %x %63s", unichar, &properties, script) != 3 &&
sscanf(buffer, "%s %x", unichar, &properties) != 2)) {
return false;
}
if (strcmp(unichar, "NULL") == 0)
@ -238,14 +289,23 @@ bool UNICHARSET::load_from_file(const char* filename) {
else
this->unichar_insert(unichar);
this->set_isalpha(id, properties & ISALPHA_MASK);
this->set_islower(id, properties & ISLOWER_MASK);
this->set_isupper(id, properties & ISUPPER_MASK);
this->set_isdigit(id, properties & ISDIGIT_MASK);
this->set_script(id, add_script(script));
this->set_isalpha(id, (properties & ISALPHA_MASK) != 0);
this->set_islower(id, (properties & ISLOWER_MASK) != 0);
this->set_isupper(id, (properties & ISUPPER_MASK) != 0);
this->set_isdigit(id, (properties & ISDIGIT_MASK) != 0);
this->set_ispunctuation(id, (properties & ISPUNCTUATION_MASK) != 0);
this->set_isngram(id, false);
this->set_script(id, script);
this->unichars[id].properties.enabled = true;
}
fclose(file);
null_sid_ = get_script_id_from_name(null_script);
ASSERT_HOST(null_sid_ == 0);
common_sid_ = get_script_id_from_name("Common");
latin_sid_ = get_script_id_from_name("Latin");
cyrillic_sid_ = get_script_id_from_name("Cyrillic");
greek_sid_ = get_script_id_from_name("Greek");
han_sid_ = get_script_id_from_name("Han");
return true;
}
@ -285,10 +345,10 @@ void UNICHARSET::set_black_and_whitelist(const char* blacklist,
}
}
char* UNICHARSET::add_script(const char* script) {
int UNICHARSET::add_script(const char* script) {
for (int i = 0; i < script_table_size_used; ++i) {
if (strcmp(script, script_table[i]) == 0)
return script_table[i];
return i;
}
if (script_table_size_reserved == 0) {
script_table_size_reserved = 8;
@ -303,5 +363,51 @@ char* UNICHARSET::add_script(const char* script) {
}
script_table[script_table_size_used] = new char[strlen(script) + 1];
strcpy(script_table[script_table_size_used], script);
return script_table[script_table_size_used++];
return script_table_size_used++;
}
// Parses a string of the form
//   <kSeparator><unichar><kSeparator><pos><kSeparator><total>
// and, on success, returns a newly allocated CHAR_FRAGMENT holding the
// unichar, pos and total (the caller owns the returned object).
// Returns NULL if the string is shorter than kMinLen, does not start with
// kSeparator, has an empty or over-long (> UNICHAR_LEN bytes) unichar part,
// lacks either separator before the numbers, or has trailing characters after
// the second number.
CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {
  const char *ptr = string;
  int len = strlen(string);
  if (len < kMinLen || *ptr != kSeparator) {
    return NULL;  // this string can not represent a fragment
  }
  ptr++;  // move to the next character

  // Measure the unichar part: everything up to the next separator, advancing
  // one UTF-8 character at a time.
  int step = 0;
  while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
    int char_step = UNICHAR::utf8_step(ptr + step);
    if (char_step <= 0) {
      // NOTE(review): assumes utf8_step() can report a non-positive length
      // for bytes that do not start a valid UTF-8 character - confirm. In
      // that case step would never advance and this loop would not end.
      return NULL;
    }
    step += char_step;
  }
  if (step == 0 || step > UNICHAR_LEN) {
    return NULL;  // no character for unichar or the character is too long
  }
  if (step > len - 1) {
    // The last UTF-8 step claims more bytes than the string has left; bail
    // out before moving ptr past the terminating '\0'.
    return NULL;
  }
  char unichar[UNICHAR_LEN + 1];
  strncpy(unichar, ptr, step);
  unichar[step] = '\0';  // null terminate unichar
  ptr += step;  // move to the next fragment separator

  // Read "<kSeparator><pos>" and then "<kSeparator><total>".
  int pos = 0;
  int total = 0;
  char *end_ptr = NULL;
  for (int i = 0; i < 2; i++) {
    if (ptr > string + len || *ptr != kSeparator) {
      return NULL;  // failed to parse fragment representation
    }
    ptr++;  // move to the next character
    int value = static_cast<int>(strtol(ptr, &end_ptr, 10));
    if (i == 0) {
      pos = value;
    } else {
      total = value;
    }
    ptr = end_ptr;
  }
  if (ptr != string + len) {
    return NULL;  // malformed fragment representation
  }
  CHAR_FRAGMENT *fragment = new CHAR_FRAGMENT();
  fragment->set_all(unichar, pos, total);
  return fragment;
}
// Looks script_name up in script_table and returns the index of the first
// entry that compares equal. Returns 0 when no entry matches.
int UNICHARSET::get_script_id_from_name(const char* script_name) const {
  int script_id = 0;
  while (script_id < script_table_size_used &&
         strcmp(script_name, script_table[script_id]) != 0) {
    ++script_id;
  }
  if (script_id < script_table_size_used) {
    return script_id;
  }
  return 0;  // 0 is always the null_script
}

View File

@ -17,19 +17,110 @@
//
///////////////////////////////////////////////////////////////////////
#ifndef THIRD_PARTY_TESSERACT_CCUTIL_UNICHARSET_H__
#define THIRD_PARTY_TESSERACT_CCUTIL_UNICHARSET_H__
#ifndef TESSERACT_CCUTIL_UNICHARSET_H__
#define TESSERACT_CCUTIL_UNICHARSET_H__
#include "assert.h"
#include "strngs.h"
#include "unichar.h"
#include "unicharmap.h"
#include "varable.h"
class CHAR_FRAGMENT {
public:
// Minimum number of characters used for fragment representation.
static const int kMinLen = 6;
// Maximum number of characters used for fragment representation.
static const int kMaxLen = 3 + UNICHAR_LEN + 2;
// Special character used in representing character fragments.
static const char kSeparator = '|';
// Maximum number of fragments per character.
static const int kMaxChunks = 3;
// Setters and Getters.
inline void set_all(const char *unichar, int pos, int total) {
this->set_unichar(unichar);
this->set_pos(pos);
this->set_total(total);
}
inline void set_unichar(const char *uch) {
strncpy(this->unichar, uch, UNICHAR_LEN);
this->unichar[UNICHAR_LEN] = '\0';
}
inline void set_pos(int p) { this->pos = p; }
inline void set_total(int t) { this->total = t; }
inline const char* get_unichar() const { return this->unichar; }
inline int get_pos() const { return this->pos; }
inline int get_total() const { return this->total; }
// Returns the string that represents a fragment
// with the given unichar, pos and total.
static STRING to_string(const char *unichar, int pos, int total) {
STRING result = "";
result += kSeparator;
result += unichar;
char buffer[kMaxLen];
snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos, kSeparator, total);
result += buffer;
return result;
}
// Returns the string that represents this fragment.
STRING to_string() const {
return to_string(this->unichar, this->pos, this->total);
}
// Checks whether a fragment has the same unichar,
// position and total as the given inputs.
inline bool equals(const char *other_unichar,
int other_pos, int other_total) const {
return (strcmp(this->unichar, other_unichar) == 0 &&
this->pos == other_pos && this->total == other_total);
}
inline bool equals(const CHAR_FRAGMENT *other) const {
return this->equals(other->get_unichar(),
other->get_pos(),
other->get_total());
}
// Checks whether a given fragment is a continuation of this fragment.
// Assumes that the given fragment pointer is not NULL.
inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
this->total == fragment->get_total() &&
this->pos == fragment->get_pos() + 1);
}
// Returns true if this fragment is a beginning fragment.
inline bool is_beginning() const { return this->pos == 0; }
// Returns true if this fragment is an ending fragment.
inline bool is_ending() const { return this->pos == this->total-1; }
// Parses the string to see whether it represents a character fragment
// (rather than a regular character). If so, allocates memory for a new
// CHAR_FRAGMENT instance and fills it in with the corresponding fragment
// information. Fragments are of the form:
// |m|1|2, meaning chunk 1 of 2 of character m.
//
// If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
// instance, otherwise (if the string does not represent a fragment or it
// looks like it does, but parsing it as a fragment fails) returns NULL.
//
// Note: The caller is responsible for deallocating memory
// associated with the returned pointer.
static CHAR_FRAGMENT *parse_from_string(const char *str);
private:
char unichar[UNICHAR_LEN + 1];
inT16 pos; // fragment position in the character
inT16 total; // total number of fragments in the character
};
// The UNICHARSET class is a utility class for Tesseract that holds the
// set of characters that are used by the engine. Each character is identified
// by a unique number, from 0 to (size - 1).
class UNICHARSET {
public:
// Create an empty UNICHARSET
UNICHARSET();
@ -54,20 +145,43 @@ class UNICHARSET {
// within the UNICHARSET.
const char* const id_to_unichar(UNICHAR_ID id) const;
// Return a STRING that reformats the utf8 str into the str followed
// by its hex unicodes.
static STRING debug_utf8_str(const char* str);
// Return a STRING containing debug information on the unichar, including
// the id_to_unichar, its hex unicodes and the properties.
STRING debug_str(UNICHAR_ID id) const;
// Convenience overload: converts unichar_repr to its id with unichar_to_id
// and forwards to debug_str(UNICHAR_ID).
STRING debug_str(const char * unichar_repr) const {
return debug_str(unichar_to_id(unichar_repr));
}
// Add a unichar representation to the set.
void unichar_insert(const char* const unichar_repr);
// Return true if the given unichar id exists within the set, i.e. it is not
// INVALID_UNICHAR_ID and lies below size_used.
// Relies on the fact that unichar ids are contiguous in the unicharset.
bool contains_unichar_id(UNICHAR_ID unichar_id) const {
  return !(unichar_id == INVALID_UNICHAR_ID || unichar_id >= size_used);
}
// Return true if the given unichar representation exists within the set.
bool contains_unichar(const char* const unichar_repr);
bool contains_unichar(const char* const unichar_repr, int length);
bool contains_unichar(const char* const unichar_repr) const;
bool contains_unichar(const char* const unichar_repr, int length) const;
// Return true if the given unichar representation corresponds to the given
// UNICHAR_ID within the set.
bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr);
bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const;
// Delete the CHAR_FRAGMENT owned by each of the first size_used unichars
// and reset the stored pointer to NULL.
void delete_pointers_in_unichars() {
  int index = 0;
  while (index < size_used) {
    // delete on a NULL pointer is a no-op, so no explicit guard is needed.
    delete unichars[index].properties.fragment;
    unichars[index].properties.fragment = NULL;
    ++index;
  }
}
// Clear the UNICHARSET (all the previous data is lost).
void clear() {
@ -78,6 +192,7 @@ class UNICHARSET {
script_table = 0;
script_table_size_reserved = 0;
script_table_size_used = 0;
delete_pointers_in_unichars();
delete[] unichars;
unichars = 0;
size_reserved = 0;
@ -94,13 +209,34 @@ class UNICHARSET {
// Reserve enough memory space for the given number of UNICHARS
void reserve(int unichars_number);
// Save the content of the UNICHARSET to the given file. Return true if the
// operation is successful.
bool save_to_file(const char* const filename) const;
// Opens the file indicated by filename and saves unicharset to that file.
// Returns true if the operation is successful. Returns false when the file
// cannot be opened, when save_to_file(FILE*) reports failure, or when closing
// the file fails.
bool save_to_file(const char * const filename) const {
  FILE* file = fopen(filename, "w+");
  if (file == NULL) return false;
  bool result = save_to_file(file);
  // fclose flushes any buffered output; if that fails the data never reached
  // the file, so the save must not be reported as successful.
  if (fclose(file) != 0) result = false;
  return result;
}
// Load the UNICHARSET from the given file. The previous data is lost. Return
// true if the operation is successful.
bool load_from_file(const char* const filename);
// Saves the content of the UNICHARSET to the given file.
// Returns true if the operation is successful.
bool save_to_file(FILE *file) const;
// Opens filename for reading and loads the UNICHARSET from it through
// load_from_file(FILE*). The previous data is lost.
// Returns false if the file cannot be opened, otherwise the result of the
// FILE* overload. The file is closed before returning.
bool load_from_file(const char* const filename) {
  FILE* stream = fopen(filename, "r");
  if (stream != NULL) {
    const bool loaded = load_from_file(stream);
    fclose(stream);
    return loaded;
  }
  return false;
}
// Loads the UNICHARSET from the given file. The previous data is lost.
// Returns true if the operation is successful.
bool load_from_file(FILE *file);
// Set a whitelist and/or blacklist of characters to recognize.
// An empty or NULL whitelist enables everything (minus any blacklist).
@ -131,10 +267,25 @@ class UNICHARSET {
unichars[unichar_id].properties.isdigit = value;
}
// Set the ispunctuation property of the given unichar to the given value.
// unichar_id indexes the unichars array directly; no range check is done here.
void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
unichars[unichar_id].properties.ispunctuation = value;
}
// Set the isngram property of the given unichar to the given value.
// unichar_id indexes the unichars array directly; no range check is done here.
void set_isngram(UNICHAR_ID unichar_id, bool value) {
unichars[unichar_id].properties.isngram = value;
}
// Set the script name of the given unichar to the given value.
// Value is copied and thus can be a temporary;
void set_script(UNICHAR_ID unichar_id, const char* value) {
unichars[unichar_id].properties.script = add_script(value);
unichars[unichar_id].properties.script_id = add_script(value);
}
// Set other_case unichar id in the properties for the given unichar id.
// The stored id is what to_lower/to_upper return when the unichar is not
// already in the requested case.
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
unichars[unichar_id].properties.other_case = other_case;
}
// Return the isalpha property of the given unichar.
@ -157,11 +308,44 @@ class UNICHARSET {
return unichars[unichar_id].properties.isdigit;
}
// Return the ispunctuation property of the given unichar.
// unichar_id indexes the unichars array directly; no range check is done here.
bool get_ispunctuation(UNICHAR_ID unichar_id) const {
return unichars[unichar_id].properties.ispunctuation;
}
// Return the isngram property of the given unichar.
// unichar_id indexes the unichars array directly; no range check is done here.
bool get_isngram(UNICHAR_ID unichar_id) const {
return unichars[unichar_id].properties.isngram;
}
// Return the script id of the given unichar.
// The id indexes the script table managed by unicharset; the script name
// can be obtained from it with get_script_from_script_id.
const char* get_script(UNICHAR_ID unichar_id) const {
return unichars[unichar_id].properties.script;
int get_script(UNICHAR_ID unichar_id) const {
return unichars[unichar_id].properties.script_id;
}
// Get other_case unichar id in the properties for the given unichar id.
// Returns whatever value was stored in the other_case property.
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {
return unichars[unichar_id].properties.other_case;
}
// Returns UNICHAR_ID of the corresponding lower-case unichar: the id itself
// when its islower property is set, otherwise its stored other_case id.
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
  return unichars[unichar_id].properties.islower
      ? unichar_id
      : unichars[unichar_id].properties.other_case;
}
// Returns UNICHAR_ID of the corresponding upper-case unichar: the id itself
// when its isupper property is set, otherwise its stored other_case id.
UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
  return unichars[unichar_id].properties.isupper
      ? unichar_id
      : unichars[unichar_id].properties.other_case;
}
// Return a pointer to the CHAR_FRAGMENT class if the given
// unichar id represents a character fragment.
// The pointer is the one stored in the unichar's properties and is returned
// as const; it is the same pointer that delete_pointers_in_unichars frees.
const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
return unichars[unichar_id].properties.fragment;
}
// Return the isalpha property of the given unichar representation.
@ -184,13 +368,28 @@ class UNICHARSET {
return get_isdigit(unichar_to_id(unichar_repr));
}
// Return the ispunctuation property of the given unichar representation.
// The representation is converted with unichar_to_id and the id overload
// is used for the lookup.
bool get_ispunctuation(const char* const unichar_repr) const {
return get_ispunctuation(unichar_to_id(unichar_repr));
}
// Return the script id of the given unichar representation.
// The id indexes the script table managed by unicharset; the script name
// can be obtained from it with get_script_from_script_id.
const char* get_script(const char* const unichar_repr) const {
int get_script(const char* const unichar_repr) const {
return get_script(unichar_to_id(unichar_repr));
}
// Return a pointer to the CHAR_FRAGMENT if the given unichar representation
// represents a character fragment.
// Returns NULL when unichar_repr is NULL, empty, or not contained in ids.
const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
  const bool known = unichar_repr != NULL && unichar_repr[0] != '\0' &&
                     ids.contains(unichar_repr);
  if (!known) {
    return NULL;
  }
  return get_fragment(unichar_to_id(unichar_repr));
}
// Return the isalpha property of the given unichar representation.
// Only the first length characters from unichar_repr are used.
bool get_isalpha(const char* const unichar_repr,
@ -219,34 +418,82 @@ class UNICHARSET {
return get_isdigit(unichar_to_id(unichar_repr, length));
}
// Return the ispunctuation property of the given unichar representation.
// Only the first length characters from unichar_repr are used.
// The lookup goes through unichar_to_id(unichar_repr, length) and the id
// overload.
bool get_ispunctuation(const char* const unichar_repr,
int length) const {
return get_ispunctuation(unichar_to_id(unichar_repr, length));
}
// Return the script id of the given unichar representation.
// Only the first length characters from unichar_repr are used.
// The id indexes the script table managed by unicharset; the script name
// can be obtained from it with get_script_from_script_id.
const char* get_script(const char* const unichar_repr,
int length) const {
int get_script(const char* const unichar_repr,
int length) const {
return get_script(unichar_to_id(unichar_repr, length));
}
// Return the (current) number of scripts in the script table.
// This is the used count (script_table_size_used), not the reserved capacity.
int get_script_table_size() const {
return script_table_size_used;
}
// Return the script string from its id.
// Ids outside [0, script_table_size_used) yield null_script.
const char* get_script_from_script_id(int id) const {
  const bool in_range = id >= 0 && id < script_table_size_used;
  if (in_range) {
    return script_table[id];
  }
  return null_script;
}
// Returns the id from the name of the script, or 0 if script is not found.
// Note that this is an expensive operation since it involves iteratively
// comparing strings in the script table. To avoid dependency on STL, we
// won't use a hash. Instead, the calling function can use this to lookup
// and save the ID for relevant scripts for fast comparisons later.
int get_script_id_from_name(const char* script_name) const;
// Return true if the given script is the null script.
// Compares pointers, not string contents: the result is true only when
// script is the very same pointer as null_script.
bool is_null_script(const char* script) const {
return script == null_script;
}
// Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
// then the returned script id will be the same.
// The script parameter is copied and thus can be a temporary.
int add_script(const char* script);
// Return the enabled property of the given unichar.
// unichar_id indexes the unichars array directly; no range check is done here.
bool get_enabled(UNICHAR_ID unichar_id) const {
return unichars[unichar_id].properties.enabled;
}
private:
// Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
// then the returned pointer will be the same.
// The script parameter is copied and thus can be a temporary.
char* add_script(const char* script);
// Accessors for the cached script ids held in the corresponding *_sid_
// members; each simply returns the stored value.
int null_sid() const { return null_sid_; }
int common_sid() const { return common_sid_; }
int latin_sid() const { return latin_sid_; }
int cyrillic_sid() const { return cyrillic_sid_; }
int greek_sid() const { return greek_sid_; }
int han_sid() const { return han_sid_; }
private:
struct UNICHAR_PROPERTIES {
bool isalpha;
bool islower;
bool isupper;
bool isdigit;
bool ispunctuation;
bool isngram;
bool enabled;
char* script;
int script_id;
UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar
// Contains meta information about the fragment if a unichar represents
// a fragment of a character, otherwise should be set to NULL.
// It is assumed that character fragments are added to the unicharset
// after the corresponding 'base' characters.
CHAR_FRAGMENT *fragment;
};
struct UNICHAR_SLOT {
@ -262,6 +509,16 @@ class UNICHARSET {
int script_table_size_used;
int script_table_size_reserved;
const char* null_script;
// A few convenient script name-to-id mapping without using hash.
// These are initialized when unicharset file is loaded. Anything
// missing from this list can be looked up using get_script_id_from_name.
int null_sid_;
int common_sid_;
int latin_sid_;
int cyrillic_sid_;
int greek_sid_;
int han_sid_;
};
#endif // THIRD_PARTY_TESSERACT_CCUTIL_UNICHARSET_H__
#endif // TESSERACT_CCUTIL_UNICHARSET_H__

198
ccutil/unicity_table.h Normal file
View File

@ -0,0 +1,198 @@
///////////////////////////////////////////////////////////////////////
// File: UnicityTable.h
// Description: a class to uniquify objects, manipulating them using integers
// ids.
// Author: Samuel Charron
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#ifndef TESSERACT_CCUTIL_UNICITY_TABLE_H_
#define TESSERACT_CCUTIL_UNICITY_TABLE_H_
#include "callback.h"
#include "errcode.h"
#include "genericvector.h"
// A class to uniquify objects, manipulating them using integers ids.
// T requirements:
// operator= to add an element
// default-constructible: allocating the internal table will call the default
// constructor.
template <typename T>
class UnicityTable {
 public:
  // Creates an empty table with no compare callback registered.
  UnicityTable();
  // Clear the structures and deallocate internal structures.
  ~UnicityTable();

  // Reserve some memory. If there are size or more elements, the table will
  // then allocate size * 2 elements.
  void reserve(int size);

  // Return the size used.
  int size() const;

  // Return the object from an id.
  T get(int id) const;

  // Return the id of the T object.
  // This method NEEDS a compare_callback to be passed to
  // set_compare_callback.
  int get_id(T object) const;

  // Return true if T is in the table.
  bool contains(T object) const;

  // Return true if the id is valid.
  // The result is a yes/no answer about the id, not an element of the table,
  // so it is returned as bool rather than T.
  bool contains_id(int id) const;

  // Add an element in the table.
  int push_back(T object);

  // Add a callback to be called to delete the elements when the table took
  // their ownership.
  void set_clear_callback(Callback1<T>* cb);

  // Add a callback to be called to compare the elements when needed (contains,
  // get_id, ...)
  void set_compare_callback(ResultCallback2<bool, T const &, T const &>* cb);

  // Clear the table, calling the callback function if any.
  // All the owned Callbacks are also deleted.
  // If you don't want the Callbacks to be deleted, before calling clear, set
  // the callback to NULL.
  void clear();

  // This method clears the current object, then does a shallow copy of
  // its argument, and finally invalidates its argument.
  void move(UnicityTable<T>* from);

  // Read/Write the table to a file. This does _NOT_ read/write the callbacks.
  // The Callback given must be permanent since they will be called more than
  // once. The given callback will be deleted at the end.
  void write(FILE* f, Callback2<FILE*, T const &>* cb);
  // swap is used to switch the endianness.
  void read(FILE* f, Callback3<FILE*, T*, bool>* cb, bool swap);

 private:
  GenericVector<T> table_;
  // Mutable because Run method is not const
  mutable ResultCallback2<bool, T const &, T const &>* compare_cb_;
};

// A UnicityTable that registers tesseract::cmp_eq<T> as its compare callback
// on construction.
template <typename T>
class UnicityTableEqEq : public UnicityTable<T> {
 public:
  UnicityTableEqEq() {
    UnicityTable<T>::set_compare_callback(
        NewPermanentCallback(tesseract::cmp_eq<T>));
  }
};

// Starts with no compare callback.
template <typename T>
UnicityTable<T>::UnicityTable() :
  compare_cb_(0) {
}

// Releases the table contents through clear().
template <typename T>
UnicityTable<T>::~UnicityTable() {
  clear();
}

// Return the size used.
template <typename T>
int UnicityTable<T>::size() const {
  return table_.size();
}

// Reserve some memory. If there are size or more elements, the table will
// then allocate size * 2 elements.
template <typename T>
void UnicityTable<T>::reserve(int size) {
  table_.reserve(size);
}

// Return the object from an id.
template <typename T>
T UnicityTable<T>::get(int id) const {
  return table_.get(id);
}

// Return true if the id is valid.
// Previously declared as returning T, which forced the validity answer to be
// converted to the element type; bool matches what the function reports.
template <typename T>
bool UnicityTable<T>::contains_id(int id) const {
  return table_.contains_index(id);
}
// Return the id of the T object.
// Delegates to table_.get_index(object). contains() and push_back() treat a
// result of -1 as "not in the table".
template <typename T>
int UnicityTable<T>::get_id(T object) const {
return table_.get_index(object);
}
// Return true if T is in the table, i.e. get_id(object) does not
// return -1.
template <typename T>
bool UnicityTable<T>::contains(T object) const {
return get_id(object) != -1;
}
// Add an element to the table unless it is already present.
// Returns the id of the existing entry when get_id finds one, otherwise the
// value returned by appending the object to the underlying vector.
template <typename T>
int UnicityTable<T>::push_back(T object) {
  const int existing = get_id(object);
  if (existing != -1) {
    return existing;
  }
  return table_.push_back(object);
}
// Add a callback to be called to delete the elements when the table took
// their ownership.
// The callback is simply forwarded to the underlying GenericVector.
template <typename T>
void UnicityTable<T>::set_clear_callback(Callback1<T>* cb) {
table_.set_clear_callback(cb);
}
// Add a callback to be called to compare the elements when needed (contains,
// get_id, ...).
// The callback is forwarded to the underlying GenericVector and the same
// pointer is also remembered in compare_cb_.
template <typename T>
void UnicityTable<T>::set_compare_callback(ResultCallback2<bool, T const &, T const &>* cb) {
table_.set_compare_callback(cb);
compare_cb_ = cb;
}
// Clear the table, calling the callback function if any.
// Only table_.clear() is invoked here; compare_cb_ is not reset.
// NOTE(review): if GenericVector::clear deletes the compare callback,
// compare_cb_ is left pointing at freed memory - confirm against
// genericvector.h.
template <typename T>
void UnicityTable<T>::clear() {
table_.clear();
}
// Write the table to f by forwarding the file and the per-element callback
// to the underlying GenericVector. The callbacks registered on the table
// are not written.
template <typename T>
void UnicityTable<T>::write(FILE* f, Callback2<FILE*, T const &>* cb) {
table_.write(f, cb);
}
// Read the table from f by forwarding the file, the per-element callback and
// the swap flag (used to switch the endianness) to the underlying
// GenericVector.
template <typename T>
void UnicityTable<T>::read(FILE* f, Callback3<FILE*, T*, bool>* cb, bool swap) {
table_.read(f, cb, swap);
}
// This method clears the current object, then does a shallow copy of
// its argument, and finally invalidates its argument.
// All of that work is delegated to table_.move(); compare_cb_ of this
// object is not touched here.
// NOTE(review): confirm whether compare_cb_ should follow the moved table.
template <typename T>
void UnicityTable<T>::move(UnicityTable<T>* from) {
table_.move(&from->table_);
}
#endif // TESSERACT_CCUTIL_UNICITY_TABLE_H_

View File

@ -18,13 +18,14 @@
**********************************************************************/
#include "mfcpch.h" //precompiled headers
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "tprintf.h"
//#include "ipeerr.h"
#include "varable.h"
#include "scanutils.h"
#include "tprintf.h"
#include "varable.h"
#define PLUS '+' //flag states
#define MINUS '-'
@ -379,24 +380,23 @@ STRING_VARIABLE_CLIST *STRING_VARIABLE::get_head() { // access to static
* Print the entire list of STRING_VARIABLEs.
**********************************************************************/
void STRING_VARIABLE::print(FILE *fp // file to print on
) {
// list iterator
STRING_VARIABLE_C_IT it = &head;
void STRING_VARIABLE::print(FILE *fp) {
STRING_VARIABLE_C_IT it = &head; // list iterator
STRING_VARIABLE *elt; // current element
// Comments aren't allowed with string variables, so the # character can
// be part of a string.
if (fp == stdout) {
tprintf("#Variables of type STRING_VARIABLE:\n");
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
elt = it.data();
tprintf("%s #%s %s\n", elt->name, elt->value.string(), elt->info);
tprintf("%s %s\n", elt->name, elt->value.string());
}
} else {
fprintf(fp, "#Variables of type STRING_VARIABLE:\n");
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
elt = it.data();
fprintf(fp, "%s #%s %s\n",
elt->name, elt->value.string(), elt->info);
fprintf(fp, "%s %s\n", elt->name, elt->value.string());
}
}
}
@ -519,20 +519,14 @@ void double_VARIABLE::print(FILE *fp // file to print on
* Values may have any whitespace after the name and are the rest of line.
**********************************************************************/
DLLSYM BOOL8 read_variables_file(const char *file // name to read
) {
BOOL8 anyerr; // true if any error
DLLSYM BOOL8 read_variables_file(const char *file, // name to read
bool global_only // only set variables
) { // starting with "global_"
char flag; // file flag
BOOL8 foundit; // found variable
inT16 length; // length of line
inT16 nameoffset; // offset for real name
char *valptr; // value field
char *stringend; // end of string value
FILE *fp; // file pointer
// iterators
char line[MAX_PATH]; // input line
anyerr = FALSE;
if (*file == PLUS) {
flag = PLUS; // file has flag
nameoffset = 1;
@ -546,54 +540,48 @@ DLLSYM BOOL8 read_variables_file(const char *file // name to read
fp = fopen(file + nameoffset, "r");
if (fp == NULL) {
tprintf("read_variables_file:Can't open %s", file + nameoffset);
tprintf("read_variables_file: Can't open %s\n", file + nameoffset);
return TRUE; // can't open it
}
while (fgets (line, MAX_PATH, fp)) {
return read_variables_from_fp(fp, -1, global_only);
fclose(fp);
}
bool read_variables_from_fp(FILE *fp, inT64 end_offset, bool global_only) {
char line[MAX_PATH]; // input line
bool anyerr = false; // true if any error
bool foundit; // found variable
inT16 length; // length of line
char *valptr; // value field
while ((end_offset < 0 || ftell(fp) < end_offset) &&
fgets(line, MAX_PATH, fp)) {
if (line[0] != '\n' && line[0] != '#') {
length = strlen (line);
if (line[length - 1] == '\n')
line[length - 1] = '\0'; // cut newline
for (valptr = line; *valptr && *valptr != ' ' && *valptr != '\t';
valptr++);
if (*valptr) { //found blank
*valptr = '\0'; //make name a string
if (*valptr) { // found blank
*valptr = '\0'; // make name a string
do
valptr++; //find end of blanks
valptr++; // find end of blanks
while (*valptr == ' ' || *valptr == '\t');
if (*valptr && *valptr != '#') {
//last char in string
stringend = valptr + strlen (valptr) - 1;
while (stringend != valptr) {
while (stringend != valptr
&& (*stringend == ' ' || *stringend == '\t'))
// cut trailing blanks
stringend--;
stringend[1] = '\0'; // terminate string
while (stringend != valptr
&& ((*stringend != ' ' && *stringend != '\t')
|| stringend[1] != '#'))
stringend--; // find word start
}
}
}
foundit = set_new_style_variable(line, valptr);
if (global_only && strstr(line, kGlobalVariablePrefix) == NULL) continue;
foundit = set_variable(line, valptr);
if (!foundit) {
anyerr = TRUE; // had an error
tprintf("read_variables_file:variable not found: %s\n",
line);
tprintf("read_variables_file: variable not found: %s\n", line);
exit(1);
}
}
}
fclose(fp); // close file
return anyerr;
}
bool set_new_style_variable(const char *variable, const char* value) {
bool set_variable(const char *variable, const char* value) {
INT_VARIABLE_C_IT int_it = &INT_VARIABLE::head;
BOOL_VARIABLE_C_IT BOOL_it = &BOOL_VARIABLE::head;
STRING_VARIABLE_C_IT STRING_it = &STRING_VARIABLE::head;
@ -606,10 +594,7 @@ bool set_new_style_variable(const char *variable, const char* value) {
STRING_it.forward());
if (!STRING_it.cycled_list()) {
foundit = true; // found the varaible
if (*value == '\0')
STRING_it.data()->set_value((char *) NULL); // No value.
else
STRING_it.data()->set_value(value); // set its value
STRING_it.data()->set_value(value); // set its value
}
if (*value) {
@ -624,7 +609,7 @@ bool set_new_style_variable(const char *variable, const char* value) {
int_it.data()->set_value(intval); // set its value.
}
for (BOOL_it.mark_cycle_pt();
!BOOL_it.cycled_list () && strcmp(variable, BOOL_it.data()->name);
!BOOL_it.cycled_list() && strcmp(variable, BOOL_it.data()->name);
BOOL_it.forward());
if (!BOOL_it.cycled_list()) {
if (*value == 'T' || *value == 't' ||

View File

@ -21,18 +21,27 @@
#define VARABLE_H
#include <stdio.h>
#include "clst.h"
#include "strngs.h"
class DLLSYM INT_VARIABLE;
//read the file
extern DLLSYM BOOL8 read_variables_file(const char *file //name to read
);
bool set_new_style_variable(const char *variable, const char* value);
//print all vars
extern DLLSYM void print_variables(FILE *fp //file to print on
);
// Read config file.
extern DLLSYM BOOL8 read_variables_file(
const char *file, // filename to read
bool global_only); // only set variables starting with "global_"
// Read variables from the given file pointer (stop at end_offset).
bool read_variables_from_fp(FILE *fp, inT64 end_offset, bool global_only);
// Set a variable to have the given value.
bool set_variable(const char *variable, const char* value);
// Print variables to a file.
extern DLLSYM void print_variables(FILE *fp);
const char kGlobalVariablePrefix[] = "global_";
CLISTIZEH (INT_VARIABLE)
class DLLSYM INT_VAR_FROM
@ -57,7 +66,7 @@ class DLLSYM INT_VARIABLE
friend class INT_VAR_TO;
friend class INT_VAR_FROM;
//for setting values
friend bool set_new_style_variable(const char *variable, const char* value);
friend bool set_variable(const char *variable, const char* value);
public:
INT_VARIABLE(inT32 v, // initial value
@ -124,7 +133,7 @@ class DLLSYM BOOL_VARIABLE {
friend class BOOL_VAR_FROM;
friend class BOOL_VAR_TO;
//for setting values
friend bool set_new_style_variable(const char *variable, const char* value);
friend bool set_variable(const char *variable, const char* value);
public:
BOOL_VARIABLE( //constructor
@ -197,7 +206,7 @@ class DLLSYM STRING_VARIABLE
friend class STRING_VAR_TO;
friend class STRING_VAR_FROM;
//for setting values
friend bool set_new_style_variable(const char *variable, const char* value);
friend bool set_variable(const char *variable, const char* value);
public:
STRING_VARIABLE( //constructor
@ -274,7 +283,7 @@ class DLLSYM double_VARIABLE
friend class double_VAR_TO;
friend class double_VAR_FROM;
//for setting values
friend bool set_new_style_variable(const char *variable, const char* value);
friend bool set_variable(const char *variable, const char* value);
public:
double_VARIABLE( //constructor