mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 14:41:36 +08:00
Changes to ccutil for 3.00
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@305 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
b47efd2cc4
commit
d8b1456dd5
@ -1,19 +1,30 @@
|
||||
SUBDIRS =
|
||||
AM_CXXFLAGS = -DTESSDATA_PREFIX=@datadir@/
|
||||
|
||||
EXTRA_DIST = ccutil.vcproj mfcpch.cpp scanutils.cpp scanutils.h
|
||||
|
||||
include_HEADERS = \
|
||||
basedir.h bits16.h clst.h debugwin.h elst2.h elst.h errcode.h \
|
||||
fileerr.h tessopt.h globaloc.h hashfn.h host.h hosthplb.h lsterr.h \
|
||||
mainblk.h memblk.h memryerr.h memry.h mfcpch.h ndminx.h notdll.h \
|
||||
nwmain.h ocrclass.h ocrshell.h platform.h secname.h serialis.h \
|
||||
stderr.h strngs.h tessclas.h tprintf.h varable.h \
|
||||
mfcpch.cpp scanutils.cpp scanutils.h unichar.h \
|
||||
unicharmap.h unicharset.h boxread.h
|
||||
ambigs.h basedir.h bits16.h boxread.h \
|
||||
callback.h ccutil.h clst.h \
|
||||
debugwin.h elst2.h elst.h errcode.h \
|
||||
fileerr.h genericvector.h globaloc.h \
|
||||
hashfn.h helpers.h host.h hosthplb.h lsterr.h \
|
||||
mainblk.h memblk.h memry.h memryerr.h mfcpch.h \
|
||||
ndminx.h notdll.h nwmain.h \
|
||||
ocrclass.h ocrshell.h platform.h qrsequence.h \
|
||||
secname.h serialis.h stderr.h strngs.h \
|
||||
tessclas.h tessdatamanager.h tessopt.h tordvars.h tprintf.h \
|
||||
unichar.h unicharmap.h unicharset.h unicity_table.h \
|
||||
varable.h
|
||||
|
||||
lib_LIBRARIES = libtesseract_ccutil.a
|
||||
libtesseract_ccutil_a_SOURCES = \
|
||||
basedir.cpp bits16.cpp clst.cpp debugwin.cpp elst.cpp \
|
||||
elst2.cpp errcode.cpp globaloc.cpp hashfn.cpp mainblk.cpp \
|
||||
memblk.cpp memry.cpp ocrshell.cpp serialis.cpp strngs.cpp \
|
||||
tprintf.cpp varable.cpp unichar.cpp tessopt.cpp \
|
||||
unicharmap.cpp unicharset.cpp boxread.cpp
|
||||
ambigs.cpp basedir.cpp bits16.cpp boxread.cpp \
|
||||
ccutil.cpp clst.cpp debugwin.cpp \
|
||||
elst2.cpp elst.cpp errcode.cpp \
|
||||
globaloc.cpp hashfn.cpp \
|
||||
mainblk.cpp memblk.cpp memry.cpp ocrshell.cpp \
|
||||
serialis.cpp strngs.cpp \
|
||||
tessdatamanager.cpp tessopt.cpp tordvars.cpp tprintf.cpp \
|
||||
unichar.cpp unicharmap.cpp unicharset.cpp \
|
||||
varable.cpp
|
||||
|
@ -57,14 +57,15 @@ AR = ar
|
||||
ARFLAGS = cru
|
||||
libtesseract_ccutil_a_AR = $(AR) $(ARFLAGS)
|
||||
libtesseract_ccutil_a_LIBADD =
|
||||
am_libtesseract_ccutil_a_OBJECTS = basedir.$(OBJEXT) bits16.$(OBJEXT) \
|
||||
clst.$(OBJEXT) debugwin.$(OBJEXT) elst.$(OBJEXT) \
|
||||
elst2.$(OBJEXT) errcode.$(OBJEXT) globaloc.$(OBJEXT) \
|
||||
am_libtesseract_ccutil_a_OBJECTS = ambigs.$(OBJEXT) basedir.$(OBJEXT) \
|
||||
bits16.$(OBJEXT) boxread.$(OBJEXT) ccutil.$(OBJEXT) \
|
||||
clst.$(OBJEXT) debugwin.$(OBJEXT) elst2.$(OBJEXT) \
|
||||
elst.$(OBJEXT) errcode.$(OBJEXT) globaloc.$(OBJEXT) \
|
||||
hashfn.$(OBJEXT) mainblk.$(OBJEXT) memblk.$(OBJEXT) \
|
||||
memry.$(OBJEXT) ocrshell.$(OBJEXT) serialis.$(OBJEXT) \
|
||||
strngs.$(OBJEXT) tprintf.$(OBJEXT) varable.$(OBJEXT) \
|
||||
unichar.$(OBJEXT) tessopt.$(OBJEXT) unicharmap.$(OBJEXT) \
|
||||
unicharset.$(OBJEXT) boxread.$(OBJEXT)
|
||||
strngs.$(OBJEXT) tessdatamanager.$(OBJEXT) tessopt.$(OBJEXT) \
|
||||
tordvars.$(OBJEXT) tprintf.$(OBJEXT) unichar.$(OBJEXT) \
|
||||
unicharmap.$(OBJEXT) unicharset.$(OBJEXT) varable.$(OBJEXT)
|
||||
libtesseract_ccutil_a_OBJECTS = $(am_libtesseract_ccutil_a_OBJECTS)
|
||||
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
|
||||
depcomp = $(SHELL) $(top_srcdir)/config/depcomp
|
||||
@ -196,22 +197,32 @@ top_builddir = @top_builddir@
|
||||
top_srcdir = @top_srcdir@
|
||||
SUBDIRS =
|
||||
AM_CXXFLAGS = -DTESSDATA_PREFIX=@datadir@/
|
||||
EXTRA_DIST = ccutil.vcproj mfcpch.cpp scanutils.cpp scanutils.h
|
||||
include_HEADERS = \
|
||||
basedir.h bits16.h clst.h debugwin.h elst2.h elst.h errcode.h \
|
||||
fileerr.h tessopt.h globaloc.h hashfn.h host.h hosthplb.h lsterr.h \
|
||||
mainblk.h memblk.h memryerr.h memry.h mfcpch.h ndminx.h notdll.h \
|
||||
nwmain.h ocrclass.h ocrshell.h platform.h secname.h serialis.h \
|
||||
stderr.h strngs.h tessclas.h tprintf.h varable.h \
|
||||
mfcpch.cpp scanutils.cpp scanutils.h unichar.h \
|
||||
unicharmap.h unicharset.h boxread.h
|
||||
ambigs.h basedir.h bits16.h boxread.h \
|
||||
callback.h ccutil.h clst.h \
|
||||
debugwin.h elst2.h elst.h errcode.h \
|
||||
fileerr.h genericvector.h globaloc.h \
|
||||
hashfn.h helpers.h host.h hosthplb.h lsterr.h \
|
||||
mainblk.h memblk.h memry.h memryerr.h mfcpch.h \
|
||||
ndminx.h notdll.h nwmain.h \
|
||||
ocrclass.h ocrshell.h platform.h qrsequence.h \
|
||||
secname.h serialis.h stderr.h strngs.h \
|
||||
tessclas.h tessdatamanager.h tessopt.h tordvars.h tprintf.h \
|
||||
unichar.h unicharmap.h unicharset.h unicity_table.h \
|
||||
varable.h
|
||||
|
||||
lib_LIBRARIES = libtesseract_ccutil.a
|
||||
libtesseract_ccutil_a_SOURCES = \
|
||||
basedir.cpp bits16.cpp clst.cpp debugwin.cpp elst.cpp \
|
||||
elst2.cpp errcode.cpp globaloc.cpp hashfn.cpp mainblk.cpp \
|
||||
memblk.cpp memry.cpp ocrshell.cpp serialis.cpp strngs.cpp \
|
||||
tprintf.cpp varable.cpp unichar.cpp tessopt.cpp \
|
||||
unicharmap.cpp unicharset.cpp boxread.cpp
|
||||
ambigs.cpp basedir.cpp bits16.cpp boxread.cpp \
|
||||
ccutil.cpp clst.cpp debugwin.cpp \
|
||||
elst2.cpp elst.cpp errcode.cpp \
|
||||
globaloc.cpp hashfn.cpp \
|
||||
mainblk.cpp memblk.cpp memry.cpp ocrshell.cpp \
|
||||
serialis.cpp strngs.cpp \
|
||||
tessdatamanager.cpp tessopt.cpp tordvars.cpp tprintf.cpp \
|
||||
unichar.cpp unicharmap.cpp unicharset.cpp \
|
||||
varable.cpp
|
||||
|
||||
all: all-recursive
|
||||
|
||||
@ -286,9 +297,11 @@ mostlyclean-compile:
|
||||
distclean-compile:
|
||||
-rm -f *.tab.c
|
||||
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ambigs.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/basedir.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bits16.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/boxread.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ccutil.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/clst.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/debugwin.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/elst.Po@am__quote@
|
||||
@ -302,7 +315,9 @@ distclean-compile:
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ocrshell.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/serialis.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/strngs.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tessdatamanager.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tessopt.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tordvars.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/tprintf.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/unichar.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/unicharmap.Po@am__quote@
|
||||
|
254
ccutil/ambigs.cpp
Normal file
254
ccutil/ambigs.cpp
Normal file
@ -0,0 +1,254 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: ambigs.cpp
|
||||
// Description: Functions for dealing with ambiguities
|
||||
// (training and recognition).
|
||||
// Author: Daria Antonova
|
||||
// Created: Mon Feb 5 11:26:43 PDT 2009
|
||||
//
|
||||
// (C) Copyright 2008, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "ambigs.h"
|
||||
#include "helpers.h"
|
||||
|
||||
INT_VAR(global_ambigs_debug_level, 0, "Debug level for unichar ambiguities");
|
||||
BOOL_VAR(use_definite_ambigs_for_classifier, 0,
|
||||
"Use definite ambiguities when running character classifier");
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Builds an empty spec: both id arrays hold only the INVALID_UNICHAR_ID
// terminator, there is no correct ngram, and the type is NOT_AMBIG.
AmbigSpec::AmbigSpec()
    : correct_ngram_id(INVALID_UNICHAR_ID),
      type(NOT_AMBIG),
      wrong_ngram_size(0) {
  // The arrays are terminated by INVALID_UNICHAR_ID, so writing the
  // terminator into slot 0 makes each of them an empty sequence.
  wrong_ngram[0] = INVALID_UNICHAR_ID;
  correct_fragments[0] = INVALID_UNICHAR_ID;
}
|
||||
|
||||
ELISTIZE(AmbigSpec);
|
||||
|
||||
void UnicharAmbigs::LoadUnicharAmbigs(FILE *AmbigFile, inT64 end_offset,
|
||||
UNICHARSET *unicharset) {
|
||||
int i;
|
||||
for (i = 0; i < unicharset->size(); ++i) {
|
||||
replace_ambigs_.push_back(NULL);
|
||||
dang_ambigs_.push_back(NULL);
|
||||
one_to_one_definite_ambigs_.push_back(NULL);
|
||||
}
|
||||
if (global_ambigs_debug_level) tprintf("Reading ambiguities\n");
|
||||
|
||||
int TestAmbigPartSize;
|
||||
int ReplacementAmbigPartSize;
|
||||
// Maximum line size:
|
||||
// 10 for sizes of ambigs, tabs, abmig type and newline
|
||||
// UNICHAR_LEN * (MAX_AMBIG_SIZE + 1) for each part of the ambig
|
||||
// The space for buffer is allocated on the heap to avoid
|
||||
// GCC frame size warning.
|
||||
const int kMaxAmbigStringSize = UNICHAR_LEN * (MAX_AMBIG_SIZE + 1);
|
||||
const int kBufferSize = 10 + 2 * kMaxAmbigStringSize;
|
||||
char *buffer = new char[kBufferSize];
|
||||
char ReplacementString[kMaxAmbigStringSize];
|
||||
UNICHAR_ID TestUnicharIds[MAX_AMBIG_SIZE + 1];
|
||||
int line_num = 0;
|
||||
int type = NOT_AMBIG;
|
||||
|
||||
// Determine the version of the ambigs file.
|
||||
int version = 0;
|
||||
ASSERT_HOST(fgets(buffer, kBufferSize, AmbigFile) != NULL &&
|
||||
strlen(buffer) > 0);
|
||||
if (*buffer == 'v') {
|
||||
version = static_cast<int>(strtol(buffer+1, NULL, 10));
|
||||
++line_num;
|
||||
} else {
|
||||
rewind(AmbigFile);
|
||||
}
|
||||
while ((end_offset < 0 || ftell(AmbigFile) < end_offset) &&
|
||||
fgets(buffer, kBufferSize, AmbigFile) != NULL) {
|
||||
chomp_string(buffer);
|
||||
if (global_ambigs_debug_level > 2) tprintf("read line %s\n", buffer);
|
||||
++line_num;
|
||||
if (!ParseAmbiguityLine(line_num, version, *unicharset, buffer,
|
||||
&TestAmbigPartSize, TestUnicharIds,
|
||||
&ReplacementAmbigPartSize,
|
||||
ReplacementString, &type)) continue;
|
||||
// Construct AmbigSpec and add it to the appropriate AmbigSpec_LIST.
|
||||
AmbigSpec *ambig_spec = new AmbigSpec();
|
||||
InsertIntoTable((type == REPLACE_AMBIG) ? replace_ambigs_ : dang_ambigs_,
|
||||
TestAmbigPartSize, TestUnicharIds,
|
||||
ReplacementAmbigPartSize, ReplacementString, type,
|
||||
ambig_spec, unicharset);
|
||||
|
||||
// Update one_to_one_definite_ambigs_.
|
||||
if (use_definite_ambigs_for_classifier && TestAmbigPartSize == 1 &&
|
||||
ReplacementAmbigPartSize == 1 && type == DEFINITE_AMBIG) {
|
||||
if (one_to_one_definite_ambigs_[TestUnicharIds[0]] == NULL) {
|
||||
one_to_one_definite_ambigs_[TestUnicharIds[0]] = new UnicharIdVector();
|
||||
}
|
||||
one_to_one_definite_ambigs_[TestUnicharIds[0]]->push_back(
|
||||
ambig_spec->correct_ngram_id);
|
||||
}
|
||||
}
|
||||
delete[] buffer;
|
||||
// Print what was read from the input file.
|
||||
if (global_ambigs_debug_level > 2) {
|
||||
for (int tbl = 0; tbl < 2; ++tbl) {
|
||||
const UnicharAmbigsVector &print_table =
|
||||
(tbl == 0) ? replace_ambigs_ : dang_ambigs_;
|
||||
for (i = 0; i < print_table.size(); ++i) {
|
||||
AmbigSpec_LIST *lst = print_table[i];
|
||||
if (lst == NULL) continue;
|
||||
if (!lst->empty()) {
|
||||
tprintf("%s Ambiguities for %s:\n",
|
||||
(tbl == 0) ? "Replaceable" : "Dangerous",
|
||||
unicharset->debug_str(i).string());
|
||||
}
|
||||
AmbigSpec_IT lst_it(lst);
|
||||
for (lst_it.mark_cycle_pt(); !lst_it.cycled_list(); lst_it.forward()) {
|
||||
AmbigSpec *ambig_spec = lst_it.data();
|
||||
tprintf("wrong_ngram:");
|
||||
UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, *unicharset);
|
||||
tprintf("correct_fragments:");
|
||||
UnicharIdArrayUtils::print(ambig_spec->correct_fragments, *unicharset);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool UnicharAmbigs::ParseAmbiguityLine(
|
||||
int line_num, int version, const UNICHARSET &unicharset,
|
||||
char *buffer, int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
|
||||
int *ReplacementAmbigPartSize, char *ReplacementString, int *type) {
|
||||
int i;
|
||||
char *token;
|
||||
char *next_token;
|
||||
if (!(token = strtok_r(buffer, kAmbigDelimiters, &next_token)) ||
|
||||
!sscanf(token, "%d", TestAmbigPartSize) || TestAmbigPartSize <= 0) {
|
||||
if (global_ambigs_debug_level) tprintf(kIllegalMsg, line_num);
|
||||
return false;
|
||||
}
|
||||
if (*TestAmbigPartSize > MAX_AMBIG_SIZE) {
|
||||
tprintf("Too many unichars in ambiguity on line %d\n");
|
||||
return false;
|
||||
}
|
||||
for (i = 0; i < *TestAmbigPartSize; ++i) {
|
||||
if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break;
|
||||
if (!unicharset.contains_unichar(token)) {
|
||||
if (global_ambigs_debug_level) tprintf(kIllegalUnicharMsg, token);
|
||||
break;
|
||||
}
|
||||
TestUnicharIds[i] = unicharset.unichar_to_id(token);
|
||||
}
|
||||
TestUnicharIds[i] = INVALID_UNICHAR_ID;
|
||||
|
||||
if (i != *TestAmbigPartSize ||
|
||||
!(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) ||
|
||||
!sscanf(token, "%d", ReplacementAmbigPartSize) ||
|
||||
*ReplacementAmbigPartSize <= 0) {
|
||||
if (global_ambigs_debug_level) tprintf(kIllegalMsg, line_num);
|
||||
return false;
|
||||
}
|
||||
if (*ReplacementAmbigPartSize > MAX_AMBIG_SIZE) {
|
||||
tprintf("Too many unichars in ambiguity on line %d\n");
|
||||
return false;
|
||||
}
|
||||
ReplacementString[0] = '\0';
|
||||
for (i = 0; i < *ReplacementAmbigPartSize; ++i) {
|
||||
if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token))) break;
|
||||
strcat(ReplacementString, token);
|
||||
if (!unicharset.contains_unichar(token)) {
|
||||
if (global_ambigs_debug_level) tprintf(kIllegalUnicharMsg, token);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i != *ReplacementAmbigPartSize) {
|
||||
if (global_ambigs_debug_level) tprintf(kIllegalMsg, line_num);
|
||||
return false;
|
||||
}
|
||||
if (version > 0) {
|
||||
// The next field being true indicates that the abiguity should
|
||||
// always be substituted (e.g. '' should always be changed to ").
|
||||
// For such "certain" n -> m ambigs tesseract will insert character
|
||||
// fragments for the n pieces in the unicharset. AmbigsFound()
|
||||
// will then replace the incorrect ngram with the character
|
||||
// fragments of the correct character (or ngram if m > 1).
|
||||
// Note that if m > 1, an ngram will be inserted into the
|
||||
// modified word, not the individual unigrams. Tesseract
|
||||
// has limited support for ngram unichar (e.g. dawg permuter).
|
||||
if (!(token = strtok_r(NULL, kAmbigDelimiters, &next_token)) ||
|
||||
!sscanf(token, "%d", type)) {
|
||||
if (global_ambigs_debug_level) tprintf(kIllegalMsg, line_num);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Fills in ambig_spec from the parsed pieces of one ambiguity and adds it,
// in sorted position, to the list stored at table[TestUnicharIds[0]]
// (creating that list on first use).
//
// The correct ngram (ReplacementString) is inserted into unicharset first,
// followed by one character fragment per wrong unichar when the wrong part
// has more than one unichar. A 1-1 ambiguity whose two sides have the same
// to_lower() id is re-typed as CASE_AMBIG regardless of the type passed in.
void UnicharAmbigs::InsertIntoTable(
    UnicharAmbigsVector &table, int TestAmbigPartSize,
    UNICHAR_ID *TestUnicharIds, int ReplacementAmbigPartSize,
    const char *ReplacementString, int type,
    AmbigSpec *ambig_spec, UNICHARSET *unicharset) {
  const bool one_to_one =
      TestAmbigPartSize == 1 && ReplacementAmbigPartSize == 1;
  ambig_spec->type = static_cast<AmbigType>(type);
  if (one_to_one &&
      unicharset->to_lower(TestUnicharIds[0]) ==
      unicharset->to_lower(unicharset->unichar_to_id(ReplacementString))) {
    ambig_spec->type = CASE_AMBIG;
  }

  ambig_spec->wrong_ngram_size =
      UnicharIdArrayUtils::copy(TestUnicharIds, ambig_spec->wrong_ngram);

  // A constant number of unichar positions has to be maintained in order to
  // construct the ambig_blob_choices vector in NoDangerousAmbig(), so for
  // each n->m ambiguity n character fragments of the correct ngram are
  // placed into the corresponding positions (e.g. given "vvvvw" and
  // vvvv->ww, v and |ww|0|4 go into position 0, v and |ww|1|4 into
  // position 1 and so on). The correct ngram is reconstructed from the
  // fragments by dawg_permute_and_select().

  // The unicharset code assumes that the "base" ngram is inserted before
  // any fragments of that ngram, so insert the correct ngram first.
  unicharset->unichar_insert(ReplacementString);
  ambig_spec->correct_ngram_id = unicharset->unichar_to_id(ReplacementString);
  if (ReplacementAmbigPartSize > 1) {
    unicharset->set_isngram(ambig_spec->correct_ngram_id, true);
  }

  // Record one id per wrong unichar: the correct ngram itself for a
  // single-unichar wrong part, otherwise a freshly inserted fragment.
  int pos = 0;
  for (; pos < TestAmbigPartSize; ++pos) {
    UNICHAR_ID fragment_id = ambig_spec->correct_ngram_id;
    if (TestAmbigPartSize != 1) {
      STRING fragment = CHAR_FRAGMENT::to_string(
          ReplacementString, pos, TestAmbigPartSize);
      unicharset->unichar_insert(fragment.string());
      fragment_id = unicharset->unichar_to_id(fragment.string());
    }
    ambig_spec->correct_fragments[pos] = fragment_id;
  }
  ambig_spec->correct_fragments[pos] = INVALID_UNICHAR_ID;

  // Add the spec to the list for its first wrong unichar. The lists are
  // kept sorted by AmbigSpec.wrong_ngram.
  const UNICHAR_ID first_id = TestUnicharIds[0];
  if (table[first_id] == NULL) {
    table[first_id] = new AmbigSpec_LIST();
  }
  table[first_id]->add_sorted(AmbigSpec::compare_ambig_specs, ambig_spec);
}
|
||||
|
||||
} // namespace tesseract
|
186
ccutil/ambigs.h
Normal file
186
ccutil/ambigs.h
Normal file
@ -0,0 +1,186 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: ambigs.h
|
||||
// Description: Constants, flags, functions for dealing with
|
||||
// ambiguities (training and recognition).
|
||||
// Author: Daria Antonova
|
||||
// Created: Mon Aug 23 11:26:43 PDT 2008
|
||||
//
|
||||
// (C) Copyright 2008, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCUTIL_AMBIGS_H_
|
||||
#define TESSERACT_CCUTIL_AMBIGS_H_
|
||||
|
||||
#include "elst.h"
|
||||
#include "tprintf.h"
|
||||
#include "unichar.h"
|
||||
#include "unicharset.h"
|
||||
#include "genericvector.h"
|
||||
|
||||
#define MAX_AMBIG_SIZE 10
|
||||
|
||||
extern INT_VAR_H(global_ambigs_debug_level, 0,
|
||||
"Debug level for unichar ambiguities");
|
||||
extern BOOL_VAR_H(use_definite_ambigs_for_classifier, 0,
|
||||
"Use definite ambiguities when running character classifier");
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
static const int kUnigramAmbigsBufferSize = 1000;
|
||||
static const char kAmbigNgramSeparator[] = { ' ', '\0' };
|
||||
static const char kAmbigDelimiters[] = "\t ";
|
||||
static const char kIllegalMsg[] =
|
||||
"Illegal ambiguity specification on line %d\n";
|
||||
static const char kIllegalUnicharMsg[] =
|
||||
"Illegal unichar %s in ambiguity specification\n";
|
||||
|
||||
enum AmbigType {
|
||||
NOT_AMBIG, // the ngram pair is not ambiguous
|
||||
REPLACE_AMBIG, // ocred ngram should always be substituted with correct
|
||||
DEFINITE_AMBIG, // add correct ngram to the classifier results (1-1)
|
||||
SIMILAR_AMBIG, // use pairwise classifier for ocred/correct pair (1-1)
|
||||
CASE_AMBIG, // this is a case ambiguity (1-1)
|
||||
|
||||
AMBIG_TYPE_COUNT // number of enum entries
|
||||
};
|
||||
|
||||
// A collection of utility functions for arrays of UNICHAR_IDs that are
|
||||
// terminated by INVALID_UNICHAR_ID.
|
||||
class UnicharIdArrayUtils {
|
||||
public:
|
||||
// Compares two arrays of unichar ids. Returns -1 if the length of array1 is
|
||||
// less than length of array2, if any array1[i] is less than array2[i].
|
||||
// Returns 0 if the arrays are equal, 1 otherwise.
|
||||
// The function assumes that the arrays are terminated by INVALID_UNICHAR_ID.
|
||||
static inline int compare(const UNICHAR_ID array1[],
|
||||
const UNICHAR_ID array2[]) {
|
||||
const UNICHAR_ID *ptr1 = array1;
|
||||
const UNICHAR_ID *ptr2 = array2;
|
||||
while (*ptr1 != INVALID_UNICHAR_ID && *ptr2 != INVALID_UNICHAR_ID) {
|
||||
if (*ptr1 != *ptr2) return *ptr1 < *ptr2 ? -1 : 1;
|
||||
++ptr1;
|
||||
++ptr2;
|
||||
}
|
||||
if (*ptr1 == INVALID_UNICHAR_ID && *ptr2 == INVALID_UNICHAR_ID) return 0;
|
||||
return *ptr1 == INVALID_UNICHAR_ID ? -1 : 1;
|
||||
}
|
||||
|
||||
// Copies UNICHAR_IDs from dst to src. Returns the number of ids copied.
|
||||
// The function assumes that the arrays are terminated by INVALID_UNICHAR_ID
|
||||
// and that dst has enough space for all the elements from src.
|
||||
static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) {
|
||||
int i = 0;
|
||||
do {
|
||||
dst[i] = src[i];
|
||||
} while (dst[i++] != INVALID_UNICHAR_ID);
|
||||
return i - 1;
|
||||
}
|
||||
|
||||
// Prints unichars corresponding to the unichar_ids in the given array.
|
||||
// The function assumes that array is terminated by INVALID_UNICHAR_ID.
|
||||
static inline void print(const UNICHAR_ID array[],
|
||||
const UNICHARSET &unicharset) {
|
||||
const UNICHAR_ID *ptr = array;
|
||||
if (*ptr == INVALID_UNICHAR_ID) tprintf("[Empty]");
|
||||
while (*ptr != INVALID_UNICHAR_ID) {
|
||||
tprintf("%s ", unicharset.id_to_unichar(*ptr++));
|
||||
}
|
||||
tprintf("( ");
|
||||
ptr = array;
|
||||
while (*ptr != INVALID_UNICHAR_ID) tprintf("%d ", *ptr++);
|
||||
tprintf(")\n");
|
||||
}
|
||||
};
|
||||
|
||||
// AmbigSpec_LIST stores a list of dangerous ambigs that
|
||||
// start with the same unichar (e.g. r->t rn->m rr1->m).
|
||||
class AmbigSpec : public ELIST_LINK {
|
||||
public:
|
||||
AmbigSpec();
|
||||
~AmbigSpec() {}
|
||||
|
||||
// Comparator function for sorting AmbigSpec_LISTs. The lists will
|
||||
// be sorted by their wrong_ngram arrays. Example of wrong_ngram vectors
|
||||
// in a a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9, 8 1].
|
||||
static int compare_ambig_specs(const void *spec1, const void *spec2) {
|
||||
const AmbigSpec *s1 =
|
||||
*reinterpret_cast<const AmbigSpec * const *>(spec1);
|
||||
const AmbigSpec *s2 =
|
||||
*reinterpret_cast<const AmbigSpec * const *>(spec2);
|
||||
return UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram);
|
||||
}
|
||||
|
||||
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
|
||||
UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1];
|
||||
UNICHAR_ID correct_ngram_id;
|
||||
AmbigType type;
|
||||
int wrong_ngram_size;
|
||||
};
|
||||
ELISTIZEH(AmbigSpec);
|
||||
|
||||
// UnicharAmbigsVector[i] stores a set of ambiguities whose
|
||||
// wrong ngram starts with unichar id i.
|
||||
typedef GenericVector<AmbigSpec_LIST *> UnicharAmbigsVector;
|
||||
typedef GenericVector<UNICHAR_ID> UnicharIdVector;
|
||||
|
||||
class UnicharAmbigs {
|
||||
public:
|
||||
UnicharAmbigs() {}
|
||||
~UnicharAmbigs() {
|
||||
replace_ambigs_.delete_data_pointers();
|
||||
dang_ambigs_.delete_data_pointers();
|
||||
one_to_one_definite_ambigs_.delete_data_pointers();
|
||||
}
|
||||
|
||||
const UnicharAmbigsVector &dang_ambigs() const { return dang_ambigs_; }
|
||||
const UnicharAmbigsVector &replace_ambigs() const { return replace_ambigs_; }
|
||||
|
||||
// Fills in two ambiguity tables (replaceable and dangerous) with information
|
||||
// read from the ambigs file. An ambiguity table is an array of lists.
|
||||
// The array is indexed by a class id. Each entry in the table provides
|
||||
// a list of potential ambiguities which can start with the corresponding
|
||||
// character. For example the ambiguity "rn -> m", would be located in the
|
||||
// table at index of unicharset.unichar_to_id('r').
|
||||
// In 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in
|
||||
// one_to_one_definite_ambigs_. This vector is also indexed by the class id
|
||||
// of the wrong part of the ambiguity and each entry contains a vector of
|
||||
// unichar ids that are ambiguous to it.
|
||||
void LoadUnicharAmbigs(FILE *ambigs_file, inT64 end_offset,
|
||||
UNICHARSET *unicharset);
|
||||
|
||||
// Return definite 1-1 ambigs.
|
||||
const UnicharIdVector *OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const {
|
||||
if (one_to_one_definite_ambigs_.empty()) return NULL;
|
||||
return one_to_one_definite_ambigs_[unichar_id];
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
bool ParseAmbiguityLine(int line_num, int version,
|
||||
const UNICHARSET &unicharset, char *buffer,
|
||||
int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
|
||||
int *ReplacementAmbigPartSize,
|
||||
char *ReplacementString, int *type);
|
||||
void InsertIntoTable(UnicharAmbigsVector &table,
|
||||
int TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
|
||||
int ReplacementAmbigPartSize,
|
||||
const char *ReplacementString, int type,
|
||||
AmbigSpec *ambig_spec, UNICHARSET *unicharset);
|
||||
UnicharAmbigsVector dang_ambigs_;
|
||||
UnicharAmbigsVector replace_ambigs_;
|
||||
GenericVector<UnicharIdVector *> one_to_one_definite_ambigs_;
|
||||
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_CCUTIL_AMBIGS_H_
|
@ -22,8 +22,6 @@
|
||||
#ifdef __UNIX__
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#else
|
||||
#include <io.h>
|
||||
#endif
|
||||
#include <stdlib.h>
|
||||
#include "basedir.h"
|
||||
@ -103,7 +101,7 @@ DLLSYM inT8 getpath( //get dir name of code
|
||||
return -1;
|
||||
}
|
||||
} else {
|
||||
strcpy(directory, code);
|
||||
strncpy(directory, code, MAX_PATH - 1);
|
||||
}
|
||||
while ((path_end = strchr (directory, '\\')) != NULL)
|
||||
*path_end = '/';
|
||||
|
1006
ccutil/callback.h
Normal file
1006
ccutil/callback.h
Normal file
File diff suppressed because it is too large
Load Diff
48
ccutil/ccutil.cpp
Normal file
48
ccutil/ccutil.cpp
Normal file
@ -0,0 +1,48 @@
|
||||
// Copyright 2008 Google Inc. All Rights Reserved.
|
||||
// Author: scharron@google.com (Samuel Charron)
|
||||
|
||||
#include "ccutil.h"
|
||||
|
||||
namespace tesseract {
|
||||
// Sets up the two variable members through the BOOL_MEMBER / STRING_MEMBER
// macros; every other member is default-constructed.
CCUtil::CCUtil()
    : // Variables associated with mainblk.*
      BOOL_MEMBER(m_print_variables, FALSE,
                  "Print initial values of all variables"),
      STRING_MEMBER(m_data_sub_dir, "tessdata/", "Directory for data files") {
}
|
||||
|
||||
// Nothing to release explicitly; the members clean up after themselves.
CCUtil::~CCUtil() {}
|
||||
|
||||
|
||||
// Creates the underlying OS mutex: a pthread mutex with default attributes,
// or an unnamed, initially unowned Win32 mutex when WIN32 is defined.
// NOTE(review): nothing visible in this file destroys the mutex again
// (no pthread_mutex_destroy / CloseHandle) - confirm this is intended.
CCUtilMutex::CCUtilMutex() {
#ifndef WIN32
  pthread_mutex_init(&mutex_, NULL);
#else
  mutex_ = CreateMutex(0, FALSE, 0);
#endif
}
|
||||
|
||||
// Blocks until the mutex has been acquired. The result of the underlying
// OS call is not checked.
void CCUtilMutex::Lock() {
#ifndef WIN32
  pthread_mutex_lock(&mutex_);
#else
  WaitForSingleObject(mutex_, INFINITE);
#endif
}
|
||||
|
||||
// Releases the mutex. The result of the underlying OS call is not checked.
void CCUtilMutex::Unlock() {
#ifndef WIN32
  pthread_mutex_unlock(&mutex_);
#else
  ReleaseMutex(mutex_);
#endif
}
|
||||
|
||||
|
||||
CCUtilMutex tprintfMutex;
|
||||
} // namespace tesseract
|
83
ccutil/ccutil.h
Normal file
83
ccutil/ccutil.h
Normal file
@ -0,0 +1,83 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: ccutil.h
|
||||
// Description: ccutil class.
|
||||
// Author: Samuel Charron
|
||||
//
|
||||
// (C) Copyright 2006, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCUTIL_CCUTIL_H__
|
||||
#define TESSERACT_CCUTIL_CCUTIL_H__
|
||||
|
||||
#include "ambigs.h"
|
||||
#include "errcode.h"
|
||||
#include "strngs.h"
|
||||
#include "tessdatamanager.h"
|
||||
#include "varable.h"
|
||||
#include "unicharset.h"
|
||||
|
||||
#ifdef WIN32
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include <pthread.h>
|
||||
#include <semaphore.h>
|
||||
#endif
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Thin wrapper around the platform mutex: a pthread_mutex_t by default,
// or a Win32 mutex HANDLE when WIN32 is defined.
// NOTE(review): no destructor is declared, so this class does not release
// the underlying OS object - confirm that is acceptable for its users.
class CCUtilMutex {
 public:
  CCUtilMutex();

  // Acquires the mutex.
  void Lock();

  // Releases the mutex.
  void Unlock();

 private:
#ifndef WIN32
  pthread_mutex_t mutex_;
#else
  HANDLE mutex_;
#endif
};
|
||||
|
||||
|
||||
class CCUtil {
|
||||
public:
|
||||
CCUtil();
|
||||
~CCUtil();
|
||||
|
||||
public:
|
||||
void main_setup(
|
||||
const char *argv0, // program name
|
||||
const char *basename // name of image
|
||||
);
|
||||
public:
|
||||
STRING datadir; // dir for data files
|
||||
STRING imagebasename; // name of image
|
||||
|
||||
BOOL_VAR_H (m_print_variables, FALSE,
|
||||
"Print initial values of all variables");
|
||||
STRING_VAR_H (m_data_sub_dir, "tessdata/", "Directory for data files");
|
||||
STRING lang;
|
||||
STRING language_data_path_prefix;
|
||||
TessdataManager tessdata_manager;
|
||||
UNICHARSET unicharset;
|
||||
UnicharAmbigs unichar_ambigs;
|
||||
STRING imagefile; // image file name
|
||||
STRING directory; // main directory
|
||||
};
|
||||
|
||||
extern CCUtilMutex tprintfMutex;
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_CCUTIL_CCUTIL_H__
|
819
ccutil/ccutil.vcproj
Executable file
819
ccutil/ccutil.vcproj
Executable file
@ -0,0 +1,819 @@
|
||||
<?xml version="1.0" encoding="Windows-1252"?>
|
||||
<VisualStudioProject
|
||||
ProjectType="Visual C++"
|
||||
Version="9.00"
|
||||
Name="ccutil"
|
||||
ProjectGUID="{DF2FA86F-A663-4805-AED7-2F81D9EAC796}"
|
||||
RootNamespace="ccutil"
|
||||
Keyword="Win32Proj"
|
||||
TargetFrameworkVersion="196613"
|
||||
>
|
||||
<Platforms>
|
||||
<Platform
|
||||
Name="Win32"
|
||||
/>
|
||||
</Platforms>
|
||||
<ToolFiles>
|
||||
</ToolFiles>
|
||||
<Configurations>
|
||||
<Configuration
|
||||
Name="Debug|Win32"
|
||||
OutputDirectory="$(SolutionDir)$(ConfigurationName)"
|
||||
IntermediateDirectory="$(ConfigurationName)"
|
||||
ConfigurationType="4"
|
||||
CharacterSet="2"
|
||||
>
|
||||
<Tool
|
||||
Name="VCPreBuildEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXMLDataGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebServiceProxyGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCMIDLTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
Optimization="0"
|
||||
PreprocessorDefinitions="WIN32;_DEBUG;_LIB;__MSW32__;_CRT_SECURE_NO_WARNINGS"
|
||||
MinimalRebuild="true"
|
||||
BasicRuntimeChecks="3"
|
||||
RuntimeLibrary="1"
|
||||
UsePrecompiledHeader="0"
|
||||
WarningLevel="3"
|
||||
DebugInformationFormat="4"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManagedResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPreLinkEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCLibrarianTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCALinkTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXDCMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCBscMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCFxCopTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPostBuildEventTool"
|
||||
/>
|
||||
</Configuration>
|
||||
<Configuration
|
||||
Name="Release|Win32"
|
||||
OutputDirectory="$(SolutionDir)$(ConfigurationName)"
|
||||
IntermediateDirectory="$(ConfigurationName)"
|
||||
ConfigurationType="4"
|
||||
CharacterSet="2"
|
||||
WholeProgramOptimization="1"
|
||||
>
|
||||
<Tool
|
||||
Name="VCPreBuildEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXMLDataGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebServiceProxyGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCMIDLTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
Optimization="2"
|
||||
EnableIntrinsicFunctions="true"
|
||||
PreprocessorDefinitions="WIN32;NDEBUG;_LIB;__MSW32__;_CRT_SECURE_NO_WARNINGS"
|
||||
RuntimeLibrary="0"
|
||||
EnableFunctionLevelLinking="true"
|
||||
UsePrecompiledHeader="0"
|
||||
WarningLevel="3"
|
||||
DebugInformationFormat="3"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManagedResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPreLinkEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCLibrarianTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCALinkTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXDCMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCBscMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCFxCopTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPostBuildEventTool"
|
||||
/>
|
||||
</Configuration>
|
||||
</Configurations>
|
||||
<References>
|
||||
</References>
|
||||
<Files>
|
||||
<Filter
|
||||
Name="Source Files"
|
||||
Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
|
||||
UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
|
||||
>
|
||||
<File
|
||||
RelativePath=".\ambigs.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\basedir.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\bits16.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\boxread.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ccutil.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\clst.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\debugwin.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\elst.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\elst2.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\errcode.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\globaloc.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\hashfn.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\mainblk.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\memblk.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\memry.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\mfcpch.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="1"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="1"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ocrshell.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\serialis.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\strngs.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\tessdatamanager.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\tessopt.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\tordvars.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\tprintf.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\unichar.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\unicharmap.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\unicharset.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\varable.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
UsePrecompiledHeader="2"
|
||||
PrecompiledHeaderThrough="mfcpch.h"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
</Filter>
|
||||
<Filter
|
||||
Name="Header Files"
|
||||
Filter="h;hpp;hxx;hm;inl;inc;xsd"
|
||||
UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
|
||||
>
|
||||
<File
|
||||
RelativePath=".\ambigs.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\basedir.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\bits16.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\boxread.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\callback.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ccutil.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\clst.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\debugwin.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\elst.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\elst2.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\errcode.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\fileerr.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\genericvector.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\globaloc.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\hashfn.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\helpers.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\host.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\hosthplb.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\lsterr.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\mainblk.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\memblk.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\memry.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\memryerr.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\mfcpch.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ndminx.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\notdll.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\nwmain.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ocrclass.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ocrshell.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\platform.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\qrsequence.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\scanutils.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\secname.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\serialis.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\stderr.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\strngs.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\tessclas.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\tessdatamanager.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\tessopt.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\tordvars.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\tprintf.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\unichar.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\unicharmap.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\unicharset.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\unicity_table.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\varable.h"
|
||||
>
|
||||
</File>
|
||||
</Filter>
|
||||
<Filter
|
||||
Name="Resource Files"
|
||||
Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
|
||||
UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
|
||||
>
|
||||
</Filter>
|
||||
</Files>
|
||||
<Globals>
|
||||
</Globals>
|
||||
</VisualStudioProject>
|
@ -96,11 +96,11 @@ class DLLSYM CLIST
|
||||
void shallow_clear(); //clear list but dont
|
||||
//delete data elements
|
||||
|
||||
BOOL8 empty() { //is list empty?
|
||||
bool empty() { //is list empty?
|
||||
return !last;
|
||||
}
|
||||
|
||||
BOOL8 singleton() {
|
||||
bool singleton() {
|
||||
return last != NULL ? (last == last->next) : FALSE;
|
||||
}
|
||||
|
||||
@ -165,13 +165,13 @@ class DLLSYM CLIST_ITERATOR
|
||||
CLIST_LINK *prev; //prev element
|
||||
CLIST_LINK *current; //current element
|
||||
CLIST_LINK *next; //next element
|
||||
BOOL8 ex_current_was_last; //current extracted
|
||||
bool ex_current_was_last; //current extracted
|
||||
//was end of list
|
||||
BOOL8 ex_current_was_cycle_pt; //current extracted
|
||||
bool ex_current_was_cycle_pt; //current extracted
|
||||
//was cycle point
|
||||
CLIST_LINK *cycle_pt; //point we are cycling
|
||||
//the list to.
|
||||
BOOL8 started_cycling; //Have we moved off
|
||||
bool started_cycling; //Have we moved off
|
||||
//the start?
|
||||
|
||||
CLIST_LINK *extract_sublist( //from this current...
|
||||
@ -229,7 +229,7 @@ class DLLSYM CLIST_ITERATOR
|
||||
|
||||
void mark_cycle_pt(); //remember current
|
||||
|
||||
BOOL8 empty() { //is list empty?
|
||||
bool empty() { //is list empty?
|
||||
#ifndef NDEBUG
|
||||
if (!list)
|
||||
NO_LIST.error ("CLIST_ITERATOR::empty", ABORT, NULL);
|
||||
@ -237,15 +237,15 @@ class DLLSYM CLIST_ITERATOR
|
||||
return list->empty ();
|
||||
}
|
||||
|
||||
BOOL8 current_extracted() { //current extracted?
|
||||
bool current_extracted() { //current extracted?
|
||||
return !current;
|
||||
}
|
||||
|
||||
BOOL8 at_first(); //Current is first?
|
||||
bool at_first(); //Current is first?
|
||||
|
||||
BOOL8 at_last(); //Current is last?
|
||||
bool at_last(); //Current is last?
|
||||
|
||||
BOOL8 cycled_list(); //Completed a cycle?
|
||||
bool cycled_list(); //Completed a cycle?
|
||||
|
||||
void add_to_end( //add at end &
|
||||
void *new_data); //dont move
|
||||
@ -695,7 +695,7 @@ inline void CLIST_ITERATOR::mark_cycle_pt() {
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
inline BOOL8 CLIST_ITERATOR::at_first() {
|
||||
inline bool CLIST_ITERATOR::at_first() {
|
||||
#ifndef NDEBUG
|
||||
if (!this)
|
||||
NULL_OBJECT.error ("CLIST_ITERATOR::at_first", ABORT, NULL);
|
||||
@ -717,7 +717,7 @@ inline BOOL8 CLIST_ITERATOR::at_first() {
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
inline BOOL8 CLIST_ITERATOR::at_last() {
|
||||
inline bool CLIST_ITERATOR::at_last() {
|
||||
#ifndef NDEBUG
|
||||
if (!this)
|
||||
NULL_OBJECT.error ("CLIST_ITERATOR::at_last", ABORT, NULL);
|
||||
@ -739,7 +739,7 @@ inline BOOL8 CLIST_ITERATOR::at_last() {
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
inline BOOL8 CLIST_ITERATOR::cycled_list() {
|
||||
inline bool CLIST_ITERATOR::cycled_list() {
|
||||
#ifndef NDEBUG
|
||||
if (!this)
|
||||
NULL_OBJECT.error ("CLIST_ITERATOR::cycled_list", ABORT, NULL);
|
||||
|
@ -39,7 +39,6 @@ static LCommander *pCommander = NULL;
|
||||
//NT implementation
|
||||
#if defined(__MSW32__) && !defined(_CONSOLE)
|
||||
|
||||
#include <io.h>
|
||||
#define ID_DEBUG_MSG 32779
|
||||
|
||||
/**********************************************************************
|
||||
|
@ -141,11 +141,11 @@ class DLLSYM ELIST
|
||||
//ptr to zapper functn
|
||||
void (*zapper) (ELIST_LINK *));
|
||||
|
||||
BOOL8 empty() { //is list empty?
|
||||
bool empty() { //is list empty?
|
||||
return !last;
|
||||
}
|
||||
|
||||
BOOL8 singleton() {
|
||||
bool singleton() {
|
||||
return last ? (last == last->next) : FALSE;
|
||||
}
|
||||
|
||||
@ -210,13 +210,13 @@ class DLLSYM ELIST_ITERATOR
|
||||
ELIST_LINK *prev; //prev element
|
||||
ELIST_LINK *current; //current element
|
||||
ELIST_LINK *next; //next element
|
||||
BOOL8 ex_current_was_last; //current extracted
|
||||
bool ex_current_was_last; //current extracted
|
||||
//was end of list
|
||||
BOOL8 ex_current_was_cycle_pt; //current extracted
|
||||
bool ex_current_was_cycle_pt; //current extracted
|
||||
//was cycle point
|
||||
ELIST_LINK *cycle_pt; //point we are cycling
|
||||
//the list to.
|
||||
BOOL8 started_cycling; //Have we moved off
|
||||
bool started_cycling; //Have we moved off
|
||||
//the start?
|
||||
|
||||
ELIST_LINK *extract_sublist( //from this current...
|
||||
@ -274,7 +274,7 @@ class DLLSYM ELIST_ITERATOR
|
||||
|
||||
void mark_cycle_pt(); //remember current
|
||||
|
||||
BOOL8 empty() { //is list empty?
|
||||
bool empty() { //is list empty?
|
||||
#ifndef NDEBUG
|
||||
if (!list)
|
||||
NO_LIST.error ("ELIST_ITERATOR::empty", ABORT, NULL);
|
||||
@ -282,15 +282,15 @@ class DLLSYM ELIST_ITERATOR
|
||||
return list->empty ();
|
||||
}
|
||||
|
||||
BOOL8 current_extracted() { //current extracted?
|
||||
bool current_extracted() { //current extracted?
|
||||
return !current;
|
||||
}
|
||||
|
||||
BOOL8 at_first(); //Current is first?
|
||||
bool at_first(); //Current is first?
|
||||
|
||||
BOOL8 at_last(); //Current is last?
|
||||
bool at_last(); //Current is last?
|
||||
|
||||
BOOL8 cycled_list(); //Completed a cycle?
|
||||
bool cycled_list(); //Completed a cycle?
|
||||
|
||||
void add_to_end( //add at end &
|
||||
ELIST_LINK *new_link); //dont move
|
||||
@ -728,7 +728,7 @@ inline void ELIST_ITERATOR::mark_cycle_pt() {
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
inline BOOL8 ELIST_ITERATOR::at_first() {
|
||||
inline bool ELIST_ITERATOR::at_first() {
|
||||
#ifndef NDEBUG
|
||||
if (!this)
|
||||
NULL_OBJECT.error ("ELIST_ITERATOR::at_first", ABORT, NULL);
|
||||
@ -750,7 +750,7 @@ inline BOOL8 ELIST_ITERATOR::at_first() {
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
inline BOOL8 ELIST_ITERATOR::at_last() {
|
||||
inline bool ELIST_ITERATOR::at_last() {
|
||||
#ifndef NDEBUG
|
||||
if (!this)
|
||||
NULL_OBJECT.error ("ELIST_ITERATOR::at_last", ABORT, NULL);
|
||||
@ -772,7 +772,7 @@ inline BOOL8 ELIST_ITERATOR::at_last() {
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
inline BOOL8 ELIST_ITERATOR::cycled_list() {
|
||||
inline bool ELIST_ITERATOR::cycled_list() {
|
||||
#ifndef NDEBUG
|
||||
if (!this)
|
||||
NULL_OBJECT.error ("ELIST_ITERATOR::cycled_list", ABORT, NULL);
|
||||
|
@ -110,11 +110,11 @@ class DLLSYM ELIST2
|
||||
void (*zapper) (ELIST2_LINK *));
|
||||
//ptr to zapper functn
|
||||
|
||||
BOOL8 empty() { //is list empty?
|
||||
bool empty() { //is list empty?
|
||||
return !last;
|
||||
}
|
||||
|
||||
BOOL8 singleton() {
|
||||
bool singleton() {
|
||||
return last ? (last == last->next) : FALSE;
|
||||
}
|
||||
|
||||
@ -179,13 +179,13 @@ class DLLSYM ELIST2_ITERATOR
|
||||
ELIST2_LINK *prev; //prev element
|
||||
ELIST2_LINK *current; //current element
|
||||
ELIST2_LINK *next; //next element
|
||||
BOOL8 ex_current_was_last; //current extracted
|
||||
bool ex_current_was_last; //current extracted
|
||||
//was end of list
|
||||
BOOL8 ex_current_was_cycle_pt; //current extracted
|
||||
bool ex_current_was_cycle_pt; //current extracted
|
||||
//was cycle point
|
||||
ELIST2_LINK *cycle_pt; //point we are cycling
|
||||
//the list to.
|
||||
BOOL8 started_cycling; //Have we moved off
|
||||
bool started_cycling; //Have we moved off
|
||||
//the start?
|
||||
|
||||
ELIST2_LINK *extract_sublist( //from this current...
|
||||
@ -246,7 +246,7 @@ class DLLSYM ELIST2_ITERATOR
|
||||
|
||||
void mark_cycle_pt(); //remember current
|
||||
|
||||
BOOL8 empty() { //is list empty?
|
||||
bool empty() { //is list empty?
|
||||
#ifndef NDEBUG
|
||||
if (!list)
|
||||
NO_LIST.error ("ELIST2_ITERATOR::empty", ABORT, NULL);
|
||||
@ -254,15 +254,15 @@ class DLLSYM ELIST2_ITERATOR
|
||||
return list->empty ();
|
||||
}
|
||||
|
||||
BOOL8 current_extracted() { //current extracted?
|
||||
bool current_extracted() { //current extracted?
|
||||
return !current;
|
||||
}
|
||||
|
||||
BOOL8 at_first(); //Current is first?
|
||||
bool at_first(); //Current is first?
|
||||
|
||||
BOOL8 at_last(); //Current is last?
|
||||
bool at_last(); //Current is last?
|
||||
|
||||
BOOL8 cycled_list(); //Completed a cycle?
|
||||
bool cycled_list(); //Completed a cycle?
|
||||
|
||||
void add_to_end( //add at end &
|
||||
ELIST2_LINK *new_link); //dont move
|
||||
@ -750,7 +750,7 @@ inline void ELIST2_ITERATOR::mark_cycle_pt() {
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
inline BOOL8 ELIST2_ITERATOR::at_first() {
|
||||
inline bool ELIST2_ITERATOR::at_first() {
|
||||
#ifndef NDEBUG
|
||||
if (!this)
|
||||
NULL_OBJECT.error ("ELIST2_ITERATOR::at_first", ABORT, NULL);
|
||||
@ -772,7 +772,7 @@ inline BOOL8 ELIST2_ITERATOR::at_first() {
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
inline BOOL8 ELIST2_ITERATOR::at_last() {
|
||||
inline bool ELIST2_ITERATOR::at_last() {
|
||||
#ifndef NDEBUG
|
||||
if (!this)
|
||||
NULL_OBJECT.error ("ELIST2_ITERATOR::at_last", ABORT, NULL);
|
||||
@ -794,7 +794,7 @@ inline BOOL8 ELIST2_ITERATOR::at_last() {
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
inline BOOL8 ELIST2_ITERATOR::cycled_list() {
|
||||
inline bool ELIST2_ITERATOR::cycled_list() {
|
||||
#ifndef NDEBUG
|
||||
if (!this)
|
||||
NULL_OBJECT.error ("ELIST2_ITERATOR::cycled_list", ABORT, NULL);
|
||||
|
398
ccutil/genericvector.h
Normal file
398
ccutil/genericvector.h
Normal file
@ -0,0 +1,398 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: genericvector.h
|
||||
// Description: Generic vector class
|
||||
// Author: Daria Antonova
|
||||
// Created: Mon Jun 23 11:26:43 PDT 2008
|
||||
//
|
||||
// (C) Copyright 2007, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
#ifndef TESSERACT_CCUTIL_GENERICVECTOR_H_
|
||||
#define TESSERACT_CCUTIL_GENERICVECTOR_H_
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "callback.h"
|
||||
#include "errcode.h"
|
||||
|
||||
template <typename T>
|
||||
class GenericVector {
|
||||
public:
|
||||
GenericVector() { this->init(kDefaultVectorSize); }
|
||||
GenericVector(int size) { this->init(size); }
|
||||
|
||||
// Copy
|
||||
GenericVector(const GenericVector& other) {
|
||||
this->init(other.size());
|
||||
this->operator+=(other);
|
||||
}
|
||||
GenericVector<T> &operator+=(const GenericVector& other);
|
||||
GenericVector<T> &operator=(const GenericVector& other);
|
||||
|
||||
virtual ~GenericVector();
|
||||
|
||||
// Reserve some memory.
|
||||
void reserve(int size);
|
||||
// Double the size of the internal array.
|
||||
void double_the_size();
|
||||
|
||||
// Init the object, allocating size memory.
|
||||
void init(int size);
|
||||
|
||||
// Return the size used.
|
||||
int size() const {
|
||||
return size_used_;
|
||||
}
|
||||
|
||||
int length() const {
|
||||
return size_used_;
|
||||
}
|
||||
|
||||
// Return true if empty.
|
||||
bool empty() const {
|
||||
return size_used_ == 0;
|
||||
}
|
||||
|
||||
// Return the object from an index.
|
||||
T &get(int index) const;
|
||||
T &operator[](int index) const;
|
||||
|
||||
// Return the index of the T object.
|
||||
// This method NEEDS a compare_callback to be passed to
|
||||
// set_compare_callback.
|
||||
int get_index(T object) const;
|
||||
|
||||
// Return true if T is in the array
|
||||
bool contains(T object) const;
|
||||
|
||||
// Return true if the index is valid
|
||||
T contains_index(int index) const;
|
||||
|
||||
// Push an element in the end of the array
|
||||
int push_back(T object);
|
||||
void operator+=(T t);
|
||||
|
||||
// Set the value at the given index
|
||||
void set(T t, int index);
|
||||
|
||||
// Insert t at the given index, push other elements to the right.
|
||||
void insert(T t, int index);
|
||||
|
||||
// Removes an element at the given index and
|
||||
// shifts the remaining elements to the left.
|
||||
void remove(int index);
|
||||
|
||||
// Add a callback to be called to delete the elements when the array took
|
||||
// their ownership.
|
||||
void set_clear_callback(Callback1<T>* cb);
|
||||
|
||||
// Add a callback to be called to compare the elements when needed (contains,
|
||||
// get_id, ...)
|
||||
void set_compare_callback(ResultCallback2<bool, T const &, T const &>* cb);
|
||||
|
||||
// Clear the array, calling the clear callback function if any.
|
||||
// All the owned Callbacks are also deleted.
|
||||
// If you don't want the Callbacks to be deleted, before calling clear, set
|
||||
// the callback to NULL.
|
||||
virtual void clear();
|
||||
|
||||
// Delete objects pointed to by data_[i]
|
||||
void delete_data_pointers();
|
||||
|
||||
// This method clears the current object, then, does a shallow copy of
|
||||
// its argument, and finally invalidate its argument.
|
||||
// Callbacks are moved to the current object;
|
||||
void move(GenericVector<T>* from);
|
||||
|
||||
// Read/Write the array to a file. This does _NOT_ read/write the callbacks.
|
||||
// The Callback given must be permanent since they will be called more than
|
||||
// once. The given callback will be deleted at the end.
|
||||
void write(FILE* f, Callback2<FILE*, T const &>* cb);
|
||||
void read(FILE* f, Callback3<FILE*, T*, bool>* cb, bool swap);
|
||||
|
||||
// Allocates a new array of double the current_size, copies over the
|
||||
// information from data to the new location, deletes data and returns
|
||||
// the pointed to the new larger array.
|
||||
// This function uses memcpy to copy the data, instead of invoking
|
||||
// operator=() for each element like double_the_size() does.
|
||||
static T *double_the_size_memcpy(int current_size, T *data) {
|
||||
T *data_new = new T[current_size * 2];
|
||||
memcpy(data_new, data, sizeof(T) * current_size);
|
||||
delete[] data;
|
||||
return data_new;
|
||||
}
|
||||
|
||||
protected:
|
||||
// We assume that the objects generally placed in this vector are small
// enough that, for efficiency, it makes sense to start with a larger
// initial size.
|
||||
static const int kDefaultVectorSize = 4;
|
||||
int size_used_;
|
||||
int size_reserved_;
|
||||
T* data_;
|
||||
Callback1<T>* clear_cb_;
|
||||
// Mutable because Run method is not const
|
||||
mutable ResultCallback2<bool, T const &, T const &>* compare_cb_;
|
||||
};
|
||||
|
||||
namespace tesseract {

// Equality predicate built on operator==: true when t1 == t2.
template <typename T>
bool cmp_eq(T const & t1, T const & t2) {
  if (t1 == t2)
    return true;
  return false;
}

}  // namespace tesseract
|
||||
|
||||
// A useful vector that uses operator== to do comparisons.
|
||||
template <typename T>
|
||||
class GenericVectorEqEq : public GenericVector<T> {
|
||||
public:
|
||||
GenericVectorEqEq() {
|
||||
GenericVector<T>::set_compare_callback(
|
||||
NewPermanentCallback(tesseract::cmp_eq<T>));
|
||||
}
|
||||
GenericVectorEqEq(int size) : GenericVector<T>(size) {
|
||||
GenericVector<T>::set_compare_callback(
|
||||
NewPermanentCallback(tesseract::cmp_eq<T>));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
void GenericVector<T>::init(int size) {
|
||||
size_used_ = 0;
|
||||
size_reserved_ = 0;
|
||||
data_ = 0;
|
||||
clear_cb_ = 0;
|
||||
compare_cb_ = 0;
|
||||
reserve(size);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
GenericVector<T>::~GenericVector() {
|
||||
clear();
|
||||
}
|
||||
|
||||
// Reserve some memory. If the internal array contains elements, they are
|
||||
// copied.
|
||||
template <typename T>
|
||||
void GenericVector<T>::reserve(int size) {
|
||||
if (size_reserved_ > size || size <= 0)
|
||||
return;
|
||||
T* new_array = new T[size];
|
||||
for (int i = 0; i < size_used_; ++i)
|
||||
new_array[i] = data_[i];
|
||||
if (data_ != NULL) delete[] data_;
|
||||
data_ = new_array;
|
||||
size_reserved_ = size;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void GenericVector<T>::double_the_size() {
|
||||
if (size_reserved_ == 0) {
|
||||
reserve(kDefaultVectorSize);
|
||||
}
|
||||
else {
|
||||
reserve(2 * size_reserved_);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Return the object from an index.
|
||||
template <typename T>
|
||||
T &GenericVector<T>::get(int index) const {
|
||||
ASSERT_HOST(index >= 0 && index < size_used_);
|
||||
return data_[index];
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T &GenericVector<T>::operator[](int index) const {
|
||||
return data_[index];
|
||||
}
|
||||
|
||||
// Return the object from an index.
|
||||
template <typename T>
|
||||
void GenericVector<T>::set(T t, int index) {
|
||||
ASSERT_HOST(index >= 0 && index < size_used_);
|
||||
data_[index] = t;
|
||||
}
|
||||
|
||||
// Shifts the rest of the elements to the right to make
|
||||
// space for the new elements and inserts the given element
|
||||
// at the specified index.
|
||||
template <typename T>
|
||||
void GenericVector<T>::insert(T t, int index) {
|
||||
ASSERT_HOST(index >= 0 && index < size_used_);
|
||||
if (size_reserved_ == size_used_)
|
||||
double_the_size();
|
||||
for (int i = size_used_; i > index; --i) {
|
||||
data_[i] = data_[i-1];
|
||||
}
|
||||
data_[index] = t;
|
||||
size_used_++;
|
||||
}
|
||||
|
||||
// Removes an element at the given index and
|
||||
// shifts the remaining elements to the left.
|
||||
template <typename T>
|
||||
void GenericVector<T>::remove(int index) {
|
||||
ASSERT_HOST(index >= 0 && index < size_used_);
|
||||
for (int i = index; i < size_used_ - 1; ++i) {
|
||||
data_[i] = data_[i+1];
|
||||
}
|
||||
size_used_--;
|
||||
}
|
||||
|
||||
// Return true if the index is valindex
|
||||
template <typename T>
|
||||
T GenericVector<T>::contains_index(int index) const {
|
||||
return index >= 0 && index < size_used_;
|
||||
}
|
||||
|
||||
// Return the index of the T object.
|
||||
template <typename T>
|
||||
int GenericVector<T>::get_index(T object) const {
|
||||
for (int i = 0; i < size_used_; ++i) {
|
||||
ASSERT_HOST(compare_cb_ != NULL);
|
||||
if (compare_cb_->Run(object, data_[i]))
|
||||
return i;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Return true if T is in the array
|
||||
template <typename T>
|
||||
bool GenericVector<T>::contains(T object) const {
|
||||
return get_index(object) != -1;
|
||||
}
|
||||
|
||||
// Add an element in the array
|
||||
template <typename T>
|
||||
int GenericVector<T>::push_back(T object) {
|
||||
int index = 0;
|
||||
if (size_used_ == size_reserved_)
|
||||
double_the_size();
|
||||
index = size_used_++;
|
||||
data_[index] = object;
|
||||
return index;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void GenericVector<T>::operator+=(T t) {
|
||||
push_back(t);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
GenericVector<T> &GenericVector<T>::operator+=(const GenericVector& other) {
|
||||
for (int i = 0; i < other.size(); ++i) {
|
||||
this->operator+=(other.data_[i]);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
GenericVector<T> &GenericVector<T>::operator=(const GenericVector& other) {
|
||||
this->clear();
|
||||
this->operator+=(other);
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Add a callback to be called to delete the elements when the array took
|
||||
// their ownership.
|
||||
template <typename T>
|
||||
void GenericVector<T>::set_clear_callback(Callback1<T>* cb) {
|
||||
clear_cb_ = cb;
|
||||
}
|
||||
|
||||
// Add a callback to be called to delete the elements when the array took
|
||||
// their ownership.
|
||||
template <typename T>
|
||||
void GenericVector<T>::set_compare_callback(ResultCallback2<bool, T const &, T const &>* cb) {
|
||||
compare_cb_ = cb;
|
||||
}
|
||||
|
||||
// Clear the array, calling the callback function if any.
|
||||
template <typename T>
|
||||
void GenericVector<T>::clear() {
|
||||
if (size_reserved_ > 0) {
|
||||
if (clear_cb_ != NULL)
|
||||
for (int i = 0; i < size_used_; ++i)
|
||||
clear_cb_->Run(data_[i]);
|
||||
delete[] data_;
|
||||
size_used_ = 0;
|
||||
size_reserved_ = 0;
|
||||
}
|
||||
if (clear_cb_ != NULL) {
|
||||
delete clear_cb_;
|
||||
clear_cb_ = NULL;
|
||||
}
|
||||
if (compare_cb_ != NULL) {
|
||||
delete compare_cb_;
|
||||
compare_cb_ = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void GenericVector<T>::delete_data_pointers() {
|
||||
for (int i = 0; i < size_used_; ++i)
|
||||
if (data_[i]) {
|
||||
delete data_[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
void GenericVector<T>::write(FILE* f, Callback2<FILE*, T const &>* cb) {
|
||||
fwrite(&size_reserved_, sizeof(int), 1, f);
|
||||
fwrite(&size_used_, sizeof(int), 1, f);
|
||||
for (int i = 0; i < size_used_; ++i) {
|
||||
cb->Run(f, data_[i]);
|
||||
}
|
||||
delete cb;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void GenericVector<T>::read(FILE* f, Callback3<FILE*, T*, bool>* cb, bool swap) {
|
||||
uinT32 reserved;
|
||||
fread(&reserved, sizeof(int), 1, f);
|
||||
if (swap)
|
||||
reserved = reverse32(reserved);
|
||||
reserve(reserved);
|
||||
fread(&size_used_, sizeof(int), 1, f);
|
||||
if (swap)
|
||||
size_used_ = reverse32(size_used_);
|
||||
for (int i = 0; i < size_used_; ++i) {
|
||||
cb->Run(f, data_ + i, swap);
|
||||
}
|
||||
delete cb;
|
||||
}
|
||||
|
||||
// This method clear the current object, then, does a shallow copy of
|
||||
// its argument, and finally invalindate its argument.
|
||||
template <typename T>
|
||||
void GenericVector<T>::move(GenericVector<T>* from) {
|
||||
this->clear();
|
||||
this->data_ = from->data_;
|
||||
this->size_reserved_ = from->size_reserved_;
|
||||
this->size_used_ = from->size_used_;
|
||||
this->compare_cb_ = from->compare_cb_;
|
||||
this->clear_cb_ = from->clear_cb_;
|
||||
from->data_ = NULL;
|
||||
from->clear_cb_ = NULL;
|
||||
from->compare_cb_ = NULL;
|
||||
from->size_used_ = 0;
|
||||
from->size_reserved_ = 0;
|
||||
}
|
||||
|
||||
#endif // TESSERACT_CCUTIL_GENERICVECTOR_H_
|
41
ccutil/helpers.h
Normal file
41
ccutil/helpers.h
Normal file
@ -0,0 +1,41 @@
|
||||
/* -*-C-*-
|
||||
********************************************************************************
|
||||
*
|
||||
* File: helpers.h
|
||||
* Description: General utility functions
|
||||
* Author: Daria Antonova
|
||||
* Created: Wed Apr 8 14:37:00 2009
|
||||
* Language: C
|
||||
* Package: N/A
|
||||
* Status: Reusable Software Component
|
||||
*
|
||||
* (c) Copyright 2009, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
********************************************************************************/
|
||||
|
||||
#ifndef TESSERACT_CCUTIL_HELPERS_H_
|
||||
#define TESSERACT_CCUTIL_HELPERS_H_
|
||||
|
||||
// Remove newline (if any) at the end of the string.
// An empty string is left untouched.
inline void chomp_string(char *string) {
  int last_index = strlen(string) - 1;
  // last_index is -1 for an empty string; indexing with it would read (and
  // possibly overwrite) the byte before the buffer.
  if (last_index >= 0 && string[last_index] == '\n') {
    string[last_index] = '\0';
  }
}
|
||||
|
||||
// Advance the current pointer of the file if it points to a newline
// character. Any other character is left unread; at end of file nothing
// is done.
inline void SkipNewline(FILE *file) {
  int c = fgetc(file);
  // Step back only over a character that was really consumed: at EOF fgetc
  // reads nothing, and seeking back would make the last byte readable again.
  if (c != EOF && c != '\n') fseek(file, -1, SEEK_CUR);
}
|
||||
|
||||
#endif
|
@ -22,23 +22,22 @@
|
||||
#ifdef __UNIX__
|
||||
#include <unistd.h>
|
||||
#include <signal.h>
|
||||
#else
|
||||
#include <io.h>
|
||||
#endif
|
||||
#include <stdlib.h>
|
||||
#include "basedir.h"
|
||||
#include "mainblk.h"
|
||||
#include "ccutil.h"
|
||||
|
||||
#define VARDIR "configs/" /*variables files */
|
||||
#define EXTERN
|
||||
|
||||
/*
|
||||
EXTERN DLLSYM STRING datadir; //dir for data files
|
||||
//name of image
|
||||
EXTERN DLLSYM STRING imagebasename;
|
||||
EXTERN BOOL_VAR (m_print_variables, FALSE,
|
||||
"Print initial values of all variables");
|
||||
EXTERN STRING_VAR (m_data_sub_dir, "tessdata/", "Directory for data files");
|
||||
/*
|
||||
EXTERN INT_VAR (memgrab_size, 0, "Preallocation size for batch use");*/
|
||||
|
||||
|
||||
@ -46,24 +45,17 @@ const ERRCODE NO_PATH =
|
||||
"Warning:explicit path for executable will not be used for configs";
|
||||
static const ERRCODE USAGE = "Usage";
|
||||
|
||||
namespace tesseract {
|
||||
/**********************************************************************
|
||||
* main_setup
|
||||
*
|
||||
* Main for mithras demo program. Read the arguments and set up globals.
|
||||
**********************************************************************/
|
||||
|
||||
void main_setup( /*main demo program */
|
||||
void CCUtil::main_setup( /*main demo program */
|
||||
const char *argv0, //program name
|
||||
const char *basename, //name of image
|
||||
int argc, /*argument count */
|
||||
const char *const *argv /*arguments */
|
||||
const char *basename //name of image
|
||||
) {
|
||||
inT32 arg; /*argument */
|
||||
inT32 offset; //for flag
|
||||
FILE *fp; /*variables file */
|
||||
char flag[2]; //+/-
|
||||
STRING varfile; /*name of file */
|
||||
|
||||
imagebasename = basename; /*name of image */
|
||||
|
||||
// TESSDATA_PREFIX Environment variable overrules everything.
|
||||
@ -93,34 +85,6 @@ void main_setup( /*main demo program */
|
||||
datadir = getenv("TESSDATA_PREFIX");
|
||||
}
|
||||
|
||||
for (arg = 0; arg < argc; arg++) {
|
||||
if (argv[arg][0] == '+' || argv[arg][0] == '-') {
|
||||
offset = 1;
|
||||
flag[0] = argv[arg][0];
|
||||
}
|
||||
else {
|
||||
offset = 0;
|
||||
}
|
||||
flag[offset] = '\0';
|
||||
varfile = flag;
|
||||
/*attempt open */
|
||||
fp = fopen (argv[arg] + offset, "r");
|
||||
if (fp != NULL) {
|
||||
fclose(fp); /*was only to test */
|
||||
}
|
||||
else {
|
||||
varfile += datadir;
|
||||
varfile += m_data_sub_dir; /*data directory */
|
||||
varfile += VARDIR; /*variables dir */
|
||||
}
|
||||
/*actual name */
|
||||
varfile += argv[arg] + offset;
|
||||
read_variables_file (varfile.string ());
|
||||
}
|
||||
|
||||
if (m_print_variables)
|
||||
print_variables(stdout); /*print them all */
|
||||
|
||||
|
||||
datadir += m_data_sub_dir; /*data directory */
|
||||
}
|
||||
} // namespace tesseract
|
||||
|
@ -26,14 +26,15 @@
|
||||
extern DLLSYM STRING datadir; //dir for data files
|
||||
//name of image
|
||||
extern DLLSYM STRING imagebasename;
|
||||
extern BOOL_VAR_H (m_print_variables, FALSE,
|
||||
"Print initial values of all variables");
|
||||
extern STRING_VAR_H (m_data_sub_dir, "data/", "Directory for data files");
|
||||
extern INT_VAR_H (memgrab_size, 13000000, "Preallocation size for batch use");
|
||||
void main_setup( /*main demo program */
|
||||
const char *argv0, //program name
|
||||
const char *basename, //name of image
|
||||
int argc, /*argument count */
|
||||
const char *const *argv /*arguments */
|
||||
);
|
||||
extern BOOL_VAR_H(m_print_variables, FALSE,
|
||||
"Print initial values of all variables");
|
||||
extern STRING_VAR_H(m_data_sub_dir, "data/", "Directory for data files");
|
||||
extern INT_VAR_H(memgrab_size, 13000000, "Preallocation size for batch use");
|
||||
// > ccutil.h
|
||||
//void main_setup( /*main demo program */
|
||||
// const char *argv0, //program name
|
||||
// const char *basename, //name of image
|
||||
// int argc, /*argument count */
|
||||
// const char *const *argv /*arguments */
|
||||
// );
|
||||
#endif
|
||||
|
@ -3,6 +3,14 @@
|
||||
#ifdef __MSW32__
|
||||
#define SIGNED
|
||||
#define snprintf _snprintf
|
||||
#define read _read
|
||||
#define write _write
|
||||
#define close _close
|
||||
#define lseek _lseek
|
||||
#define open _open
|
||||
#define ultoa _ultoa
|
||||
#define ltoa _ltoa
|
||||
#define strtok_r(s, d, p) strtok(s, d)
|
||||
#if (_MSC_VER <= 1400)
|
||||
#define vsnprintf _vsnprintf
|
||||
#endif
|
||||
|
80
ccutil/qrsequence.h
Normal file
80
ccutil/qrsequence.h
Normal file
@ -0,0 +1,80 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: qrsequence.h
|
||||
// Description: Quasi-random sequence generator class.
|
||||
// Author: Ranjith Unnikrishnan
|
||||
// Created: Wed May 20 2009
|
||||
//
|
||||
// Class to generate a (deterministic) quasi-random Van der Corput sequence that
|
||||
// covers the interval [0,N) without repetition.
|
||||
//
|
||||
// The sequence is generated by reversing the base-2 representation of the
|
||||
// sequence of natural numbers {0, 1,... M-1}, where M is 2^{num_bits_} and
|
||||
// num_bits is the minimum number of bits required to represent N. If a reversed
|
||||
// numbers is >= N it is rejected and the next natural number is considered
|
||||
// until a valid output number is found.
|
||||
//
|
||||
// (C) Copyright 2009, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
|
||||
// use this file except in compliance with the License. You may obtain a copy
|
||||
// of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
|
||||
// by applicable law or agreed to in writing, software distributed under the
|
||||
// License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
|
||||
// OF ANY KIND, either express or implied. See the License for the specific
|
||||
// language governing permissions and limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCUTIL_QRSEQUENCE_H_
|
||||
#define TESSERACT_CCUTIL_QRSEQUENCE_H_
|
||||
|
||||
#include <math.h>
|
||||
|
||||
// Generates the bit-reversal (Van der Corput) ordering of the integers in
// [0, N): each call to GetVal() returns the next value, every value is
// produced exactly once, and -1 is returned once the range is exhausted.
class QRSequenceGenerator {
 public:
  // Object is initialized with the size of the output range.
  explicit QRSequenceGenerator(int N) : N_(N), next_num_(0), num_bits_(0) {
    // Smallest bit count with 2^num_bits_ >= N. Integer arithmetic is used
    // because ceil(log(N) / log(2.0)) can round an exact power of two up by
    // one bit and is undefined for N <= 0.
    // The count is capped so that 1 << num_bits_ stays within a 32-bit int;
    // ranges larger than 2^30 are therefore not fully covered.
    const int kMaxBits = 30;
    while (num_bits_ < kMaxBits && (1 << num_bits_) < N) {
      ++num_bits_;
    }
  }

  // Main worker method that retrieves the next number in the sequence.
  // Returns kInvalidVal (-1) once every number in [0, N) has been returned.
  int GetVal() {
    const int kInvalidVal = -1;
    const int kMaxNaturalNumberValue = 1 << num_bits_;
    while (next_num_ < kMaxNaturalNumberValue) {
      const int n = GetBinaryReversedInteger(next_num_++);
      // Reversed values >= N_ are rejected. Returning only from here
      // guarantees that a rejected value is never handed back when the
      // candidates run out (the previous code returned the last rejected
      // value in that case).
      if (n < N_) return n;
    }
    return kInvalidVal;
  }

 protected:
  // Outputs the integer formed by reversing the bits of the input integer.
  // Only the lowest num_bits_ bits of the input integer are reversed.
  int GetBinaryReversedInteger(int in_val) const {
    int out_val = 0;
    for (int bit = 0; bit < num_bits_; ++bit) {
      // Append the current lowest input bit to the output, then move on to
      // the next input bit.
      out_val = (out_val << 1) | (in_val & 0x1);
      in_val >>= 1;
    }
    return out_val;
  }
  // Size of the output range [0, N_).
  int N_;
  // Next number to be considered for reversal and output.
  int next_num_;
  // number of bits required to represent the numbers of the sequence
  int num_bits_;
};
|
||||
|
||||
#endif // TESSERACT_CCUTIL_QRSEQUENCE_H_
|
@ -31,6 +31,7 @@
|
||||
#include <fcntl.h>
|
||||
|
||||
#include "scanutils.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
enum Flags {
|
||||
FL_SPLAT = 0x01, // Drop the value, do not assign
|
||||
@ -45,6 +46,7 @@ enum Ranks {
|
||||
RANK_INT = 0,
|
||||
RANK_LONG = 1,
|
||||
RANK_LONGLONG = 2,
|
||||
RANK_PTR = INT_MAX // Special value used for pointers
|
||||
RANK_PTR = 3 // Special value used for pointers
|
||||
};
|
||||
|
||||
@ -183,7 +185,7 @@ double strtofloat(const char* s)
|
||||
{
|
||||
int minus = 0;
|
||||
int v = 0;
|
||||
int d, c;
|
||||
int d;
|
||||
int k = 1;
|
||||
int w = 0;
|
||||
|
||||
@ -243,7 +245,7 @@ int vfscanf(FILE* stream, const char *format, va_list ap)
|
||||
ST_MATCH, // Main state of %[ sequence
|
||||
ST_MATCH_RANGE, // After - in a %[ sequence
|
||||
} state = ST_NORMAL;
|
||||
char *oarg, *sarg = NULL; // %s %c or %[ string argument
|
||||
char *sarg = NULL; // %s %c or %[ string argument
|
||||
enum Bail bail = BAIL_NONE;
|
||||
int sign;
|
||||
int converted = 0; // Successful conversions
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <klibc/extern.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
// Attempts to parse the given file stream s as an integer of the base
|
||||
// 'base'. Returns the first successfully parsed integer as a uintmax_t, or
|
||||
|
203
ccutil/tessdatamanager.cpp
Normal file
203
ccutil/tessdatamanager.cpp
Normal file
@ -0,0 +1,203 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: tessdatamanager.cpp
|
||||
// Description: Functions to handle loading/combining tesseract data files.
|
||||
// Author: Daria Antonova
|
||||
// Created: Wed Jun 03 11:26:43 PST 2009
|
||||
//
|
||||
// (C) Copyright 2009, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "tessdatamanager.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "serialis.h"
|
||||
#include "strngs.h"
|
||||
#include "tprintf.h"
|
||||
#include "varable.h"
|
||||
|
||||
BOOL_VAR(global_load_system_dawg, true, "Load system word dawg.");
|
||||
BOOL_VAR(global_load_freq_dawg, true, "Load frequent word dawg.");
|
||||
BOOL_VAR(global_load_punc_dawg, true, "Load dawg with punctuation patterns.");
|
||||
BOOL_VAR(global_load_number_dawg, true, "Load dawg with number patterns.");
|
||||
|
||||
INT_VAR(global_tessdata_manager_debug_level, 0,
|
||||
"Debug level for TessdataManager functions.");
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
void TessdataManager::Init(const char *data_file_name) {
|
||||
int i;
|
||||
data_file_ = fopen(data_file_name, "rb");
|
||||
if (data_file_ == NULL) {
|
||||
tprintf("Error openning data file %s\n", data_file_name);
|
||||
exit(1);
|
||||
}
|
||||
fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_);
|
||||
bool swap = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
|
||||
if (swap) {
|
||||
actual_tessdata_num_entries_ = reverse32(actual_tessdata_num_entries_);
|
||||
}
|
||||
ASSERT_HOST(actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES);
|
||||
fread(offset_table_, sizeof(inT64),
|
||||
actual_tessdata_num_entries_, data_file_);
|
||||
if (swap) {
|
||||
for (i = 0 ; i < actual_tessdata_num_entries_; ++i) {
|
||||
offset_table_[i] = reverse64(offset_table_[i]);
|
||||
}
|
||||
}
|
||||
if (global_tessdata_manager_debug_level) {
|
||||
tprintf("TessdataManager loaded %d types of tesseract data files.\n",
|
||||
actual_tessdata_num_entries_);
|
||||
for (i = 0; i < actual_tessdata_num_entries_; ++i) {
|
||||
tprintf("Offset for type %d is %lld\n", i, offset_table_[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
FILE *TessdataManager::GetFilePtr(const char *language_data_path_prefix,
|
||||
const char *file_suffix, bool required_file,
|
||||
bool text_file) {
|
||||
STRING file_name = language_data_path_prefix;
|
||||
file_name += file_suffix;
|
||||
FILE *file_ptr = fopen(file_name.string(), text_file ? "r" : "rb");
|
||||
if (required_file && (file_ptr == NULL)) {
|
||||
tprintf("Error openning required file %s\n", file_name.string());
|
||||
exit(1);
|
||||
}
|
||||
return file_ptr;
|
||||
}
|
||||
|
||||
void TessdataManager::CopyFile(FILE *input_file, FILE *output_file,
|
||||
bool newline_end) {
|
||||
int buffer_size = 1024;
|
||||
char *chunk = new char[buffer_size];
|
||||
int bytes_read;
|
||||
char last_char = 0x0;
|
||||
while ((bytes_read = fread(chunk, sizeof(char),
|
||||
buffer_size, input_file))) {
|
||||
fwrite(chunk, sizeof(char), bytes_read, output_file);
|
||||
last_char = chunk[bytes_read-1];
|
||||
}
|
||||
if (newline_end) ASSERT_HOST(last_char == '\n');
|
||||
delete[] chunk;
|
||||
}
|
||||
|
||||
void TessdataManager::CombineDataFiles(
|
||||
const char *language_data_path_prefix,
|
||||
const char *output_filename) {
|
||||
FILE *file_ptr;
|
||||
STRING file_name;
|
||||
int i;
|
||||
inT64 offset_table[TESSDATA_NUM_ENTRIES];
|
||||
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1;
|
||||
FILE *output_file = fopen(output_filename, "wb");
|
||||
// Leave some space for recording the offset_table.
|
||||
fseek(output_file,
|
||||
sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);
|
||||
|
||||
// Record language-specific tesseract config file.
|
||||
file_ptr = GetFilePtr(language_data_path_prefix,
|
||||
kLangConfigFileSuffix, false, true);
|
||||
if (file_ptr != NULL) {
|
||||
offset_table[TESSDATA_LANG_CONFIG] = ftell(output_file);
|
||||
CopyFile(file_ptr, output_file, true);
|
||||
fclose(file_ptr);
|
||||
}
|
||||
|
||||
// Record unicharset.
|
||||
file_ptr = GetFilePtr(language_data_path_prefix,
|
||||
kUnicharsetFileSuffix, true, true);
|
||||
offset_table[TESSDATA_UNICHARSET] = ftell(output_file);
|
||||
CopyFile(file_ptr, output_file, true);
|
||||
fclose(file_ptr);
|
||||
|
||||
// Record ambiguities.
|
||||
file_ptr = GetFilePtr(language_data_path_prefix,
|
||||
kAmbigsFileSuffix, false, true);
|
||||
if (file_ptr != NULL) {
|
||||
offset_table[TESSDATA_AMBIGS] = ftell(output_file);
|
||||
CopyFile(file_ptr, output_file, true);
|
||||
fclose(file_ptr);
|
||||
}
|
||||
|
||||
// Record inttemp.
|
||||
file_ptr =
|
||||
GetFilePtr(language_data_path_prefix,
|
||||
kBuiltInTemplatesFileSuffix, false, false);
|
||||
if (file_ptr != NULL) {
|
||||
offset_table[TESSDATA_INTTEMP] = ftell(output_file);
|
||||
CopyFile(file_ptr, output_file, false);
|
||||
fclose(file_ptr);
|
||||
|
||||
// Record pffmtable.
|
||||
file_ptr = GetFilePtr(language_data_path_prefix,
|
||||
kBuiltInCutoffsFileSuffix, true, true);
|
||||
offset_table[TESSDATA_PFFMTABLE] = ftell(output_file);
|
||||
CopyFile(file_ptr, output_file, true);
|
||||
fclose(file_ptr);
|
||||
|
||||
// Record normproto.
|
||||
file_ptr = GetFilePtr(language_data_path_prefix,
|
||||
kNormProtoFileSuffix, true, true);
|
||||
offset_table[TESSDATA_NORMPROTO] = ftell(output_file);
|
||||
CopyFile(file_ptr, output_file, true);
|
||||
fclose(file_ptr);
|
||||
}
|
||||
|
||||
// Record dawgs.
|
||||
file_ptr = GetFilePtr(language_data_path_prefix,
|
||||
kPuncDawgFileSuffix, false, false);
|
||||
if (file_ptr != NULL) {
|
||||
offset_table[TESSDATA_PUNC_DAWG] = ftell(output_file);
|
||||
CopyFile(file_ptr, output_file, false);
|
||||
fclose(file_ptr);
|
||||
}
|
||||
|
||||
file_ptr = GetFilePtr(language_data_path_prefix,
|
||||
kSystemDawgFileSuffix, false, false);
|
||||
if (file_ptr != NULL) {
|
||||
offset_table[TESSDATA_SYSTEM_DAWG] = ftell(output_file);
|
||||
CopyFile(file_ptr, output_file, false);
|
||||
fclose(file_ptr);
|
||||
}
|
||||
|
||||
file_ptr = GetFilePtr(language_data_path_prefix,
|
||||
kNumberDawgFileSuffix, false, false);
|
||||
if (file_ptr != NULL) {
|
||||
offset_table[TESSDATA_NUMBER_DAWG] = ftell(output_file);
|
||||
CopyFile(file_ptr, output_file, false);
|
||||
fclose(file_ptr);
|
||||
}
|
||||
|
||||
file_ptr = GetFilePtr(language_data_path_prefix,
|
||||
kFreqDawgFileSuffix, false, false);
|
||||
if (file_ptr != NULL) {
|
||||
offset_table[TESSDATA_FREQ_DAWG] = ftell(output_file);
|
||||
CopyFile(file_ptr, output_file, false);
|
||||
fclose(file_ptr);
|
||||
}
|
||||
|
||||
fseek(output_file, 0, SEEK_SET);
|
||||
inT32 num_entries = TESSDATA_NUM_ENTRIES;
|
||||
fwrite(&num_entries, sizeof(inT32), 1, output_file);
|
||||
fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES, output_file);
|
||||
fclose(output_file);
|
||||
|
||||
tprintf("TessdataManager combined tesseract data files.\n");
|
||||
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
tprintf("Offset for type %d is %lld\n", i, offset_table[i]);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
165
ccutil/tessdatamanager.h
Normal file
165
ccutil/tessdatamanager.h
Normal file
@ -0,0 +1,165 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: tessdatamanager.h
|
||||
// Description: Functions to handle loading/combining tesseract data files.
|
||||
// Author: Daria Antonova
|
||||
// Created: Wed Jun 03 11:26:43 PST 2009
|
||||
//
|
||||
// (C) Copyright 2009, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_
|
||||
#define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
|
||||
|
||||
#include <stdio.h>
|
||||
#include "host.h"
|
||||
#include "tprintf.h"
|
||||
#include "varable.h"
|
||||
|
||||
extern BOOL_VAR_H(global_load_punc_dawg, true,
|
||||
"Load dawg with punctuation patterns.");
|
||||
extern BOOL_VAR_H(global_load_system_dawg, true, "Load system word dawg.");
|
||||
extern BOOL_VAR_H(global_load_number_dawg, true,
|
||||
"Load dawg with number patterns.");
|
||||
extern BOOL_VAR_H(global_load_freq_dawg, true, "Load frequent word dawg.");
|
||||
|
||||
extern INT_VAR_H(global_tessdata_manager_debug_level, 0,
|
||||
"Debug level for TessdataManager functions.");
|
||||
|
||||
static const char kTrainedDataSuffix[] = "traineddata";
|
||||
|
||||
static const char kLangConfigFileSuffix[] = "config";
|
||||
static const char kUnicharsetFileSuffix[] = "unicharset";
|
||||
static const char kAmbigsFileSuffix[] = "unicharambigs";
|
||||
static const char kBuiltInTemplatesFileSuffix[] = "inttemp";
|
||||
static const char kBuiltInCutoffsFileSuffix[] = "pffmtable";
|
||||
static const char kNormProtoFileSuffix[] = "normproto";
|
||||
static const char kPuncDawgFileSuffix[] = "punc-dawg";
|
||||
static const char kSystemDawgFileSuffix[] = "word-dawg";
|
||||
static const char kNumberDawgFileSuffix[] = "number-dawg";
|
||||
static const char kFreqDawgFileSuffix[] = "freq-dawg";
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
enum TessdataType {
|
||||
TESSDATA_LANG_CONFIG, // 0
|
||||
TESSDATA_UNICHARSET, // 1
|
||||
TESSDATA_AMBIGS, // 2
|
||||
TESSDATA_INTTEMP, // 3
|
||||
TESSDATA_PFFMTABLE, // 4
|
||||
TESSDATA_NORMPROTO, // 5
|
||||
TESSDATA_PUNC_DAWG, // 6
|
||||
TESSDATA_SYSTEM_DAWG, // 7
|
||||
TESSDATA_NUMBER_DAWG, // 8
|
||||
TESSDATA_FREQ_DAWG, // 9
|
||||
|
||||
TESSDATA_NUM_ENTRIES
|
||||
};
|
||||
|
||||
// TessdataType could be updated to contain more entries, however
// we do not expect that number to be astronomically high.
// In order to automatically detect endianness, TessdataManager will
// reverse the byte order of the values it reads if
// actual_tessdata_num_entries_ is larger than kMaxNumTessdataEntries.
|
||||
static const int kMaxNumTessdataEntries = 1000;
|
||||
|
||||
|
||||
class TessdataManager {
|
||||
public:
|
||||
TessdataManager() {
|
||||
data_file_ = NULL;
|
||||
actual_tessdata_num_entries_ = 0;
|
||||
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
offset_table_[i] = -1;
|
||||
}
|
||||
}
|
||||
~TessdataManager() {}
|
||||
|
||||
// Opens the given data file and reads the offset table.
|
||||
void Init(const char *data_file_name);
|
||||
|
||||
// Returns data file pointer.
|
||||
inline FILE *GetDataFilePtr() const { return data_file_; }
|
||||
|
||||
// Returns false if there is no data of the given type.
|
||||
// Otherwise does a seek on the data_file_ to position the pointer
|
||||
// at the start of the data of the given type.
|
||||
inline bool SeekToStart(TessdataType tessdata_type) {
|
||||
if (global_tessdata_manager_debug_level) {
|
||||
tprintf("TessdataManager: seek to offset %lld (start of tessdata"
|
||||
"type %d)\n", offset_table_[tessdata_type], tessdata_type);
|
||||
}
|
||||
if (offset_table_[tessdata_type] < 0) {
|
||||
return false;
|
||||
} else {
|
||||
ASSERT_HOST(fseek(data_file_,
|
||||
offset_table_[tessdata_type], SEEK_SET) == 0);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
// Returns the end offset for the given tesseract data file type.
|
||||
inline inT64 GetEndOffset(TessdataType tessdata_type) const {
|
||||
int index = tessdata_type + 1;
|
||||
while (index < actual_tessdata_num_entries_ && offset_table_[index] == -1) {
|
||||
++index; // skip tessdata types not present in the combined file
|
||||
}
|
||||
if (global_tessdata_manager_debug_level) {
|
||||
tprintf("TessdataManager: end offset for type %d is %lld\n",
|
||||
tessdata_type,
|
||||
(index == actual_tessdata_num_entries_) ? -1
|
||||
: offset_table_[index]);
|
||||
}
|
||||
return (index == actual_tessdata_num_entries_) ? -1 : offset_table_[index] - 1;
|
||||
}
|
||||
// Closes data_file_ (if it was opened by Init()).
|
||||
inline void End() {
|
||||
if (data_file_ != NULL) {
|
||||
fclose(data_file_);
|
||||
data_file_ = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
// Reads all the standard tesseract config and data files for a language
|
||||
// at the given path and bundles them up into one binary data file.
|
||||
static void CombineDataFiles(const char *language_data_path_prefix,
|
||||
const char *output_filename);
|
||||
|
||||
private:
|
||||
|
||||
// Opens the file whose name is a concatentation of language_data_path_prefix
|
||||
// and file_suffix. Terminates the program if required_file is set to true,
|
||||
// but the file could not be found or opened for reading.
|
||||
// Returns a file pointer to the opened file.
|
||||
static FILE *GetFilePtr(const char *language_data_path_prefix,
|
||||
const char *file_suffix, bool required_file,
|
||||
bool text_file);
|
||||
|
||||
// Copies all the bytes in the given input file to the output_file provided.
|
||||
static void CopyFile(FILE *input_file, FILE *output_file, bool newline_end);
|
||||
|
||||
// Each offset_table_[i] contains a file offset in the combined data file
|
||||
// where the data of TessdataFileType i is stored.
|
||||
inT64 offset_table_[TESSDATA_NUM_ENTRIES];
|
||||
// Actual number of entries in the tessdata table. This value can only be
|
||||
// same or smaller than TESSDATA_NUM_ENTRIES, but can never be larger,
|
||||
// since then it would be impossible to interpret the type of tessdata at
|
||||
// indices same and higher than TESSDATA_NUM_ENTRIES.
|
||||
// This parameter is used to allow for backward compatiblity
|
||||
// when new tessdata types are introduced.
|
||||
inT32 actual_tessdata_num_entries_;
|
||||
FILE *data_file_; // pointer to the data file.
|
||||
};
|
||||
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_
|
66
ccutil/tordvars.cpp
Normal file
66
ccutil/tordvars.cpp
Normal file
@ -0,0 +1,66 @@
|
||||
/* -*-C-*-
|
||||
********************************************************************************
|
||||
*
|
||||
* File: tordvars.cpp
|
||||
* Description: Text Ordering Control Variables
|
||||
* Author: Mark Seaman, OCR Technology
|
||||
* Created: Wed Jan 17 12:47:29 1990
|
||||
* Modified: Tue Jul 30 16:22:40 1991 (Mark Seaman) marks@hpgrlt
|
||||
* Language: C
|
||||
* Package: N/A
|
||||
* Status: Experimental (Do Not Distribute)
|
||||
*
|
||||
* (c) Copyright 1990, Hewlett-Packard Company.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
*********************************************************************************/
|
||||
/*----------------------------------------------------------------------
|
||||
I n c l u d e s
|
||||
----------------------------------------------------------------------*/
|
||||
#include "mfcpch.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "varable.h"
|
||||
|
||||
/*----------------------------------------------------------------------
|
||||
V a r i a b l e s
|
||||
----------------------------------------------------------------------*/
|
||||
FILE *rawfile; /* Text before dictionary */
|
||||
FILE *textfile; /* Text output file */
|
||||
FILE *matcher_fp; //matcher log
|
||||
FILE *correct_fp; //correct text
|
||||
|
||||
BOOL_VAR(tord_write_output, 0, "Text file output");
|
||||
|
||||
BOOL_VAR(tord_write_raw_output, 0, "Text before context");
|
||||
|
||||
BOOL_VAR(tord_similarity_enable, 0, "Switch for Similarity");
|
||||
|
||||
double_VAR(tord_certainty_threshold, -2.25, "Certainty Value");
|
||||
|
||||
INT_VAR(tord_num_word_choices, 30, "Number of choices");
|
||||
|
||||
BOOL_VAR(tord_blob_skip, 0, "Skip to Next selection");
|
||||
|
||||
double_VAR(tord_overlap_threshold, 0.33, "Overlap Threshold");
|
||||
|
||||
BOOL_VAR(tord_debug_3, 0, "Textord Debug #3");
|
||||
|
||||
BOOL_VAR(tord_debug_5, 0, "Textord Debug #5");
|
||||
|
||||
BOOL_VAR(tord_debug_8, 0, "Textord Debug #8");
|
||||
|
||||
INT_VAR(tord_display_ratings, 0, "Ratings display");
|
||||
|
||||
BOOL_VAR(tord_display_text, 0, "Display Text");
|
||||
|
||||
BOOL_VAR(tord_show_bold, 1, "Show Bold Text");
|
66
ccutil/tordvars.h
Normal file
66
ccutil/tordvars.h
Normal file
@ -0,0 +1,66 @@
|
||||
/* -*-C-*-
|
||||
********************************************************************************
|
||||
*
|
||||
* File: tordvars.h
|
||||
* Description: Text Ordering Control Variables
|
||||
* Author: Mark Seaman, OCR Technology
|
||||
* Created: Wed Oct 25 16:33:01 1989
|
||||
* Modified: Mon Jul 1 14:28:23 1991 (Mark Seaman) marks@hpgrlt
|
||||
* Language: C
|
||||
* Package: N/A
|
||||
* Status: Experimental (Do Not Distribute)
|
||||
*
|
||||
* (c) Copyright 1989, Hewlett-Packard Company.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
*********************************************************************************/
|
||||
// Include guard named after this file (ccutil/tordvars.h), following the
// TESSERACT_CCUTIL_* convention used by the other ccutil headers, so that it
// cannot clash with the guard of a differently named header.
#ifndef TESSERACT_CCUTIL_TORDVARS_H_
#define TESSERACT_CCUTIL_TORDVARS_H_

#include <stdio.h>

#include "varable.h"

/*----------------------------------------------------------------------
              V a r i a b l e s
----------------------------------------------------------------------*/
extern FILE *rawfile;            /* Text before dictionary */
extern FILE *textfile;           /* Text output file */
extern FILE *correct_fp;         //correct text
extern FILE *matcher_fp;         //matcher log

extern BOOL_VAR_H(tord_write_output, 0, "Text file output");

extern BOOL_VAR_H(tord_write_raw_output, 0, "Text before context");

extern BOOL_VAR_H(tord_similarity_enable, 0, "Switch for Similarity");

extern double_VAR_H(tord_certainty_threshold, -2.25, "Certainty Value");

extern INT_VAR_H(tord_num_word_choices, 30, "Number of choices");

extern BOOL_VAR_H(tord_blob_skip, 0, "Skip to Next selection");

extern double_VAR_H(tord_overlap_threshold, 0.33, "Overlap Threshold");

extern BOOL_VAR_H(tord_debug_3, 0, "Textord Debug #3");

extern BOOL_VAR_H(tord_debug_5, 0, "Textord Debug #5");

extern BOOL_VAR_H(tord_debug_8, 0, "Textord Debug #8");

extern INT_VAR_H(tord_display_ratings, 0, "Ratings display");

extern BOOL_VAR_H(tord_display_text, 0, "Display Text");

extern BOOL_VAR_H(tord_show_bold, 1, "Show Bold Text");

#endif  // TESSERACT_CCUTIL_TORDVARS_H_
|
@ -24,6 +24,7 @@
|
||||
#include "debugwin.h"
|
||||
//#include "ipeerr.h"
|
||||
#include "tprintf.h"
|
||||
#include "ccutil.h"
|
||||
|
||||
#define MAX_MSG_LEN 1024
|
||||
|
||||
@ -36,6 +37,7 @@ DLLSYM void
|
||||
tprintf ( //Trace printf
|
||||
const char *format, ... //special message
|
||||
) {
|
||||
tesseract::tprintfMutex.Lock();
|
||||
va_list args; //variable args
|
||||
static FILE *debugfp = NULL; //debug file
|
||||
//debug window
|
||||
@ -76,6 +78,7 @@ const char *format, ... //special message
|
||||
fprintf (stderr, "%s", msg);
|
||||
}
|
||||
}
|
||||
tesseract::tprintfMutex.Unlock();
|
||||
}
|
||||
|
||||
|
||||
|
@ -21,6 +21,7 @@
|
||||
#define TESSERACT_CCUTIL_UNICHAR_H__
|
||||
|
||||
#include <memory.h>
|
||||
#include <string.h>
|
||||
|
||||
// Maximum number of characters that can be stored in a UNICHAR. Must be
|
||||
// at least 4. Must not exceed 31 without changing the coding of length.
|
||||
|
@ -22,13 +22,16 @@
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "tprintf.h"
|
||||
#include "unichar.h"
|
||||
#include "unicharset.h"
|
||||
#include "varable.h"
|
||||
|
||||
static const int ISALPHA_MASK = 0x1;
|
||||
static const int ISLOWER_MASK = 0x2;
|
||||
static const int ISUPPER_MASK = 0x4;
|
||||
static const int ISDIGIT_MASK = 0x8;
|
||||
static const int ISPUNCTUATION_MASK = 0x10;
|
||||
|
||||
UNICHARSET::UNICHARSET() :
|
||||
unichars(NULL),
|
||||
@ -38,15 +41,20 @@ UNICHARSET::UNICHARSET() :
|
||||
script_table(0),
|
||||
script_table_size_used(0),
|
||||
script_table_size_reserved(0),
|
||||
null_script("NULL")
|
||||
{
|
||||
}
|
||||
null_script("NULL"),
|
||||
null_sid_(0),
|
||||
common_sid_(0),
|
||||
latin_sid_(0),
|
||||
cyrillic_sid_(0),
|
||||
greek_sid_(0),
|
||||
han_sid_(0) {}
|
||||
|
||||
UNICHARSET::~UNICHARSET() {
|
||||
if (size_reserved > 0) {
|
||||
for (int i = 0; i < script_table_size_used; ++i)
|
||||
delete[] script_table[i];
|
||||
delete[] script_table;
|
||||
delete_pointers_in_unichars();
|
||||
delete[] unichars;
|
||||
}
|
||||
}
|
||||
@ -56,8 +64,10 @@ void UNICHARSET::reserve(int unichars_number) {
|
||||
UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
|
||||
for (int i = 0; i < size_used; ++i)
|
||||
memcpy(&unichars_new[i], &unichars[i], sizeof(UNICHAR_SLOT));
|
||||
for (int j = size_used; j < unichars_number; ++j)
|
||||
unichars_new[j].properties.script = add_script(null_script);
|
||||
for (int j = size_used; j < unichars_number; ++j) {
|
||||
unichars_new[j].properties.script_id = add_script(null_script);
|
||||
unichars_new[j].properties.fragment = NULL;
|
||||
}
|
||||
delete[] unichars;
|
||||
unichars = unichars_new;
|
||||
size_reserved = unichars_number;
|
||||
@ -66,15 +76,15 @@ void UNICHARSET::reserve(int unichars_number) {
|
||||
|
||||
const UNICHAR_ID
|
||||
UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
|
||||
assert(ids.contains(unichar_repr));
|
||||
return ids.unichar_to_id(unichar_repr);
|
||||
return ids.contains(unichar_repr) ?
|
||||
ids.unichar_to_id(unichar_repr) : INVALID_UNICHAR_ID;
|
||||
}
|
||||
|
||||
const UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
|
||||
int length) const {
|
||||
assert(length > 0 && length <= UNICHAR_LEN);
|
||||
assert(ids.contains(unichar_repr, length));
|
||||
return ids.unichar_to_id(unichar_repr, length);
|
||||
return ids.contains(unichar_repr, length) ?
|
||||
ids.unichar_to_id(unichar_repr, length) : INVALID_UNICHAR_ID;
|
||||
}
|
||||
|
||||
// Return the minimum number of bytes that matches a legal UNICHAR_ID,
|
||||
@ -102,14 +112,16 @@ int UNICHARSET::step(const char* str) const {
|
||||
}
|
||||
|
||||
const char* const UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
|
||||
if (id == INVALID_UNICHAR_ID) {
|
||||
return INVALID_UNICHAR;
|
||||
}
|
||||
assert(id < this->size());
|
||||
return unichars[id].representation;
|
||||
}
|
||||
|
||||
// Return a STRING containing debug information on the unichar, including
|
||||
// the id_to_unichar, its hex unicodes and the properties.
|
||||
STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
|
||||
const char* str = id_to_unichar(id);
|
||||
// Return a STRING that reformats the utf8 str into the str followed
|
||||
// by its hex unicodes.
|
||||
STRING UNICHARSET::debug_utf8_str(const char* str) {
|
||||
STRING result = str;
|
||||
result += " [";
|
||||
int step = 1;
|
||||
@ -128,6 +140,21 @@ STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
|
||||
result += " ";
|
||||
}
|
||||
result += "]";
|
||||
return result;
|
||||
}
|
||||
|
||||
// Return a STRING containing debug information on the unichar, including
|
||||
// the id_to_unichar, its hex unicodes and the properties.
|
||||
STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
|
||||
const CHAR_FRAGMENT *fragment = this->get_fragment(id);
|
||||
if (fragment) {
|
||||
STRING base = debug_str(fragment->get_unichar());
|
||||
return CHAR_FRAGMENT::to_string(base.string(), fragment->get_pos(),
|
||||
fragment->get_total());
|
||||
}
|
||||
const char* str = id_to_unichar(id);
|
||||
if (id == INVALID_UNICHAR_ID) return STRING(str);
|
||||
STRING result = debug_utf8_str(str);
|
||||
// Append a for lower alpha, A for upper alpha, and x if alpha but neither.
|
||||
if (get_isalpha(id)) {
|
||||
if (get_islower(id))
|
||||
@ -141,11 +168,22 @@ STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
|
||||
if (get_isdigit(id)) {
|
||||
result += "0";
|
||||
}
|
||||
// Append p if a punctuation symbol.
|
||||
if (get_ispunctuation(id)) {
|
||||
result += "p";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void UNICHARSET::unichar_insert(const char* const unichar_repr) {
|
||||
if (!ids.contains(unichar_repr)) {
|
||||
if (strlen(unichar_repr) > UNICHAR_LEN) {
|
||||
fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
|
||||
int(strlen(unichar_repr)), unichar_repr);
|
||||
return;
|
||||
}
|
||||
if (size_used == size_reserved) {
|
||||
if (size_used == 0)
|
||||
reserve(8);
|
||||
@ -158,31 +196,43 @@ void UNICHARSET::unichar_insert(const char* const unichar_repr) {
|
||||
this->set_islower(size_used, false);
|
||||
this->set_isupper(size_used, false);
|
||||
this->set_isdigit(size_used, false);
|
||||
this->set_script(size_used, add_script(null_script));
|
||||
this->set_ispunctuation(size_used, false);
|
||||
this->set_isngram(size_used, false);
|
||||
this->set_script(size_used, null_script);
|
||||
// If the given unichar_repr represents a fragmented character, set
|
||||
// fragment property to a pointer to CHAR_FRAGMENT class instance with
|
||||
// information parsed from the unichar representation. Use the script
|
||||
// of the base unichar for the fragmented character if possible.
|
||||
CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr);
|
||||
this->unichars[size_used].properties.fragment = frag;
|
||||
if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
|
||||
this->unichars[size_used].properties.script_id =
|
||||
this->get_script(frag->get_unichar());
|
||||
}
|
||||
this->unichars[size_used].properties.enabled = true;
|
||||
ids.insert(unichar_repr, size_used);
|
||||
++size_used;
|
||||
}
|
||||
}
|
||||
|
||||
bool UNICHARSET::contains_unichar(const char* const unichar_repr) {
|
||||
bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
|
||||
return ids.contains(unichar_repr);
|
||||
}
|
||||
|
||||
bool UNICHARSET::contains_unichar(const char* const unichar_repr, int length) {
|
||||
bool UNICHARSET::contains_unichar(const char* const unichar_repr,
|
||||
int length) const {
|
||||
if (length == 0) {
|
||||
return false;
|
||||
}
|
||||
return ids.contains(unichar_repr, length);
|
||||
}
|
||||
|
||||
bool UNICHARSET::eq(UNICHAR_ID unichar_id, const char* const unichar_repr) {
|
||||
bool UNICHARSET::eq(UNICHAR_ID unichar_id,
|
||||
const char* const unichar_repr) const {
|
||||
return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
|
||||
}
|
||||
|
||||
bool UNICHARSET::save_to_file(const char* filename) const {
|
||||
FILE* file = fopen(filename, "w+");
|
||||
|
||||
if (file == NULL)
|
||||
return false;
|
||||
|
||||
bool UNICHARSET::save_to_file(FILE *file) const {
|
||||
fprintf(file, "%d\n", this->size());
|
||||
for (UNICHAR_ID id = 0; id < this->size(); ++id) {
|
||||
unsigned int properties = 0;
|
||||
@ -195,29 +245,28 @@ bool UNICHARSET::save_to_file(const char* filename) const {
|
||||
properties |= ISUPPER_MASK;
|
||||
if (this->get_isdigit(id))
|
||||
properties |= ISDIGIT_MASK;
|
||||
if (this->get_ispunctuation(id))
|
||||
properties |= ISPUNCTUATION_MASK;
|
||||
|
||||
if (strcmp(this->id_to_unichar(id), " ") == 0)
|
||||
fprintf(file, "%s %x %s\n", "NULL", properties, this->get_script(id));
|
||||
fprintf(file, "%s %x %s %d\n", "NULL", properties,
|
||||
this->get_script_from_script_id(this->get_script(id)),
|
||||
this->get_other_case(id));
|
||||
else
|
||||
fprintf(file, "%s %x %s\n", this->id_to_unichar(id), properties,
|
||||
this->get_script(id));
|
||||
fprintf(file, "%s %x %s %d\n", this->id_to_unichar(id), properties,
|
||||
this->get_script_from_script_id(this->get_script(id)),
|
||||
this->get_other_case(id));
|
||||
}
|
||||
fclose(file);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool UNICHARSET::load_from_file(const char* filename) {
|
||||
FILE* file = fopen(filename, "r");
|
||||
bool UNICHARSET::load_from_file(FILE *file) {
|
||||
int unicharset_size;
|
||||
char buffer[256];
|
||||
|
||||
if (file == NULL)
|
||||
return false;
|
||||
|
||||
this->clear();
|
||||
if (fgets(buffer, sizeof (buffer), file) == NULL ||
|
||||
sscanf(buffer, "%d", &unicharset_size) != 1) {
|
||||
fclose(file);
|
||||
return false;
|
||||
}
|
||||
this->reserve(unicharset_size);
|
||||
@ -226,11 +275,13 @@ bool UNICHARSET::load_from_file(const char* filename) {
|
||||
unsigned int properties;
|
||||
char script[64];
|
||||
|
||||
strcpy(script, null_script);
|
||||
this->unichars[id].properties.other_case = id;
|
||||
if (fgets(buffer, sizeof (buffer), file) == NULL ||
|
||||
(sscanf(buffer, "%s %x %63s", unichar, &properties, script) != 3 &&
|
||||
!(sscanf(buffer, "%s %x", unichar, &properties) == 2 &&
|
||||
strcpy(script, null_script)))) {
|
||||
fclose(file);
|
||||
(sscanf(buffer, "%s %x %63s %d", unichar, &properties,
|
||||
script, &(this->unichars[id].properties.other_case)) != 4 &&
|
||||
sscanf(buffer, "%s %x %63s", unichar, &properties, script) != 3 &&
|
||||
sscanf(buffer, "%s %x", unichar, &properties) != 2)) {
|
||||
return false;
|
||||
}
|
||||
if (strcmp(unichar, "NULL") == 0)
|
||||
@ -238,14 +289,23 @@ bool UNICHARSET::load_from_file(const char* filename) {
|
||||
else
|
||||
this->unichar_insert(unichar);
|
||||
|
||||
this->set_isalpha(id, properties & ISALPHA_MASK);
|
||||
this->set_islower(id, properties & ISLOWER_MASK);
|
||||
this->set_isupper(id, properties & ISUPPER_MASK);
|
||||
this->set_isdigit(id, properties & ISDIGIT_MASK);
|
||||
this->set_script(id, add_script(script));
|
||||
this->set_isalpha(id, (properties & ISALPHA_MASK) != 0);
|
||||
this->set_islower(id, (properties & ISLOWER_MASK) != 0);
|
||||
this->set_isupper(id, (properties & ISUPPER_MASK) != 0);
|
||||
this->set_isdigit(id, (properties & ISDIGIT_MASK) != 0);
|
||||
this->set_ispunctuation(id, (properties & ISPUNCTUATION_MASK) != 0);
|
||||
this->set_isngram(id, false);
|
||||
this->set_script(id, script);
|
||||
this->unichars[id].properties.enabled = true;
|
||||
}
|
||||
fclose(file);
|
||||
|
||||
null_sid_ = get_script_id_from_name(null_script);
|
||||
ASSERT_HOST(null_sid_ == 0);
|
||||
common_sid_ = get_script_id_from_name("Common");
|
||||
latin_sid_ = get_script_id_from_name("Latin");
|
||||
cyrillic_sid_ = get_script_id_from_name("Cyrillic");
|
||||
greek_sid_ = get_script_id_from_name("Greek");
|
||||
han_sid_ = get_script_id_from_name("Han");
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -285,10 +345,10 @@ void UNICHARSET::set_black_and_whitelist(const char* blacklist,
|
||||
}
|
||||
}
|
||||
|
||||
char* UNICHARSET::add_script(const char* script) {
|
||||
int UNICHARSET::add_script(const char* script) {
|
||||
for (int i = 0; i < script_table_size_used; ++i) {
|
||||
if (strcmp(script, script_table[i]) == 0)
|
||||
return script_table[i];
|
||||
return i;
|
||||
}
|
||||
if (script_table_size_reserved == 0) {
|
||||
script_table_size_reserved = 8;
|
||||
@ -303,5 +363,51 @@ char* UNICHARSET::add_script(const char* script) {
|
||||
}
|
||||
script_table[script_table_size_used] = new char[strlen(script) + 1];
|
||||
strcpy(script_table[script_table_size_used], script);
|
||||
return script_table[script_table_size_used++];
|
||||
return script_table_size_used++;
|
||||
}
|
||||
|
||||
// Parses a fragment representation of the form
//   <kSeparator>unichar<kSeparator>pos<kSeparator>total   (e.g. |m|1|2)
// Returns a newly allocated CHAR_FRAGMENT on success (the caller owns it),
// or NULL if the string is not a well-formed fragment representation.
CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {
  const char *ptr = string;
  int len = strlen(string);
  if (len < kMinLen || *ptr != kSeparator) {
    return NULL;  // this string can not represent a fragment
  }
  ptr++;  // move to the next character

  // Measure the unichar: everything up to the next separator, advancing one
  // whole utf8 character at a time.
  int step = 0;
  while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
    int char_step = UNICHAR::utf8_step(ptr + step);
    if (char_step <= 0) {
      // A non-advancing step would make this loop spin forever.
      // NOTE(review): presumably utf8_step signals an illegal utf8 lead byte
      // this way - confirm against UNICHAR::utf8_step.
      return NULL;
    }
    step += char_step;
  }
  // ptr points one past the leading separator, so len - 1 bytes remain.
  // A truncated multi-byte character at the end of the string could make
  // step overshoot them; reject that before doing any pointer arithmetic.
  if (step == 0 || step > UNICHAR_LEN || step > len - 1) {
    return NULL;  // no character for unichar or the character is too long
  }
  char unichar[UNICHAR_LEN + 1];
  strncpy(unichar, ptr, step);
  unichar[step] = '\0';  // null terminate unichar
  ptr += step;  // move to the next fragment separator

  // Parse the two numeric fields: position first, then total.
  int fields[2] = {0, 0};
  for (int i = 0; i < 2; i++) {
    if (ptr > string + len || *ptr != kSeparator) {
      return NULL;  // failed to parse fragment representation
    }
    ptr++;  // move to the next character
    char *end_ptr = NULL;
    long value = strtol(ptr, &end_ptr, 10);
    if (end_ptr == ptr) {
      return NULL;  // empty or non-numeric field
    }
    fields[i] = static_cast<int>(value);
    ptr = end_ptr;
  }
  if (ptr != string + len) {
    return NULL;  // malformed fragment representation
  }
  CHAR_FRAGMENT *fragment = new CHAR_FRAGMENT();
  fragment->set_all(unichar, fields[0], fields[1]);
  return fragment;
}
|
||||
|
||||
int UNICHARSET::get_script_id_from_name(const char* script_name) const {
|
||||
for (int i = 0; i < script_table_size_used; ++i) {
|
||||
if (strcmp(script_name, script_table[i]) == 0)
|
||||
return i;
|
||||
}
|
||||
return 0; // 0 is always the null_script
|
||||
}
|
||||
|
@ -17,19 +17,110 @@
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef THIRD_PARTY_TESSERACT_CCUTIL_UNICHARSET_H__
|
||||
#define THIRD_PARTY_TESSERACT_CCUTIL_UNICHARSET_H__
|
||||
#ifndef TESSERACT_CCUTIL_UNICHARSET_H__
|
||||
#define TESSERACT_CCUTIL_UNICHARSET_H__
|
||||
|
||||
#include "assert.h"
|
||||
#include "strngs.h"
|
||||
#include "unichar.h"
|
||||
#include "unicharmap.h"
|
||||
#include "varable.h"
|
||||
|
||||
class CHAR_FRAGMENT {
|
||||
public:
|
||||
// Minimum number of characters used for fragment representation.
|
||||
static const int kMinLen = 6;
|
||||
// Maximum number of characters used for fragment representation.
|
||||
static const int kMaxLen = 3 + UNICHAR_LEN + 2;
|
||||
// Special character used in representing character fragments.
|
||||
static const char kSeparator = '|';
|
||||
// Maximum number of fragments per character.
|
||||
static const int kMaxChunks = 3;
|
||||
|
||||
// Setters and Getters.
|
||||
inline void set_all(const char *unichar, int pos, int total) {
|
||||
this->set_unichar(unichar);
|
||||
this->set_pos(pos);
|
||||
this->set_total(total);
|
||||
}
|
||||
inline void set_unichar(const char *uch) {
|
||||
strncpy(this->unichar, uch, UNICHAR_LEN);
|
||||
this->unichar[UNICHAR_LEN] = '\0';
|
||||
}
|
||||
inline void set_pos(int p) { this->pos = p; }
|
||||
inline void set_total(int t) { this->total = t; }
|
||||
inline const char* get_unichar() const { return this->unichar; }
|
||||
inline int get_pos() const { return this->pos; }
|
||||
inline int get_total() const { return this->total; }
|
||||
|
||||
// Returns the string that represents a fragment
|
||||
// with the given unichar, pos and total.
|
||||
static STRING to_string(const char *unichar, int pos, int total) {
|
||||
STRING result = "";
|
||||
result += kSeparator;
|
||||
result += unichar;
|
||||
char buffer[kMaxLen];
|
||||
snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos, kSeparator, total);
|
||||
result += buffer;
|
||||
return result;
|
||||
}
|
||||
// Returns the string that represents this fragment.
|
||||
STRING to_string() const {
|
||||
return to_string(this->unichar, this->pos, this->total);
|
||||
}
|
||||
|
||||
// Checks whether a fragment has the same unichar,
|
||||
// position and total as the given inputs.
|
||||
inline bool equals(const char *other_unichar,
|
||||
int other_pos, int other_total) const {
|
||||
return (strcmp(this->unichar, other_unichar) == 0 &&
|
||||
this->pos == other_pos && this->total == other_total);
|
||||
}
|
||||
inline bool equals(const CHAR_FRAGMENT *other) const {
|
||||
return this->equals(other->get_unichar(),
|
||||
other->get_pos(),
|
||||
other->get_total());
|
||||
}
|
||||
|
||||
// Checks whether a given fragment is a continuation of this fragment.
|
||||
// Assumes that the given fragment pointer is not NULL.
|
||||
inline bool is_continuation_of(const CHAR_FRAGMENT *fragment) const {
|
||||
return (strcmp(this->unichar, fragment->get_unichar()) == 0 &&
|
||||
this->total == fragment->get_total() &&
|
||||
this->pos == fragment->get_pos() + 1);
|
||||
}
|
||||
|
||||
// Returns true if this fragment is a beginning fragment.
|
||||
inline bool is_beginning() const { return this->pos == 0; }
|
||||
|
||||
// Returns true if this fragment is an ending fragment.
|
||||
inline bool is_ending() const { return this->pos == this->total-1; }
|
||||
|
||||
// Parses the string to see whether it represents a character fragment
|
||||
// (rather than a regular character). If so, allocates memory for a new
|
||||
// CHAR_FRAGMENT instance and fills it in with the corresponding fragment
|
||||
// information. Fragments are of the form:
|
||||
// |m|1|2, meaning chunk 1 of 2 of character m.
|
||||
//
|
||||
// If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
|
||||
// instance, otherwise (if the string does not represent a fragment or it
|
||||
// looks like it does, but parsing it as a fragment fails) returns NULL.
|
||||
//
|
||||
// Note: The caller is responsible for deallocating memory
|
||||
// associated with the returned pointer.
|
||||
static CHAR_FRAGMENT *parse_from_string(const char *str);
|
||||
|
||||
private:
|
||||
char unichar[UNICHAR_LEN + 1];
|
||||
inT16 pos; // fragment position in the character
|
||||
inT16 total; // total number of fragments in the character
|
||||
};
|
||||
|
||||
// The UNICHARSET class is a utility class for Tesseract that holds the
|
||||
// set of characters that are used by the engine. Each character is identified
|
||||
// by a unique number, from 0 to (size - 1).
|
||||
class UNICHARSET {
|
||||
public:
|
||||
|
||||
// Create an empty UNICHARSET
|
||||
UNICHARSET();
|
||||
|
||||
@ -54,20 +145,43 @@ class UNICHARSET {
|
||||
// within the UNICHARSET.
|
||||
const char* const id_to_unichar(UNICHAR_ID id) const;
|
||||
|
||||
// Return a STRING that reformats the utf8 str into the str followed
|
||||
// by its hex unicodes.
|
||||
static STRING debug_utf8_str(const char* str);
|
||||
|
||||
// Return a STRING containing debug information on the unichar, including
|
||||
// the id_to_unichar, its hex unicodes and the properties.
|
||||
STRING debug_str(UNICHAR_ID id) const;
|
||||
STRING debug_str(const char * unichar_repr) const {
|
||||
return debug_str(unichar_to_id(unichar_repr));
|
||||
}
|
||||
|
||||
// Add a unichar representation to the set.
|
||||
void unichar_insert(const char* const unichar_repr);
|
||||
|
||||
// Return true if the given unichar id exists within the set.
|
||||
// Relies on the fact that unichar ids are contiguous in the unicharset.
|
||||
bool contains_unichar_id(UNICHAR_ID unichar_id) const {
|
||||
return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used;
|
||||
}
|
||||
|
||||
// Return true if the given unichar representation exists within the set.
|
||||
bool contains_unichar(const char* const unichar_repr);
|
||||
bool contains_unichar(const char* const unichar_repr, int length);
|
||||
bool contains_unichar(const char* const unichar_repr) const;
|
||||
bool contains_unichar(const char* const unichar_repr, int length) const;
|
||||
|
||||
// Return true if the given unichar representation corresponds to the given
|
||||
// UNICHAR_ID within the set.
|
||||
bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr);
|
||||
bool eq(UNICHAR_ID unichar_id, const char* const unichar_repr) const;
|
||||
|
||||
// Delete CHAR_FRAGMENTs stored in properties of unichars array.
|
||||
void delete_pointers_in_unichars() {
|
||||
for (int i = 0; i < size_used; ++i) {
|
||||
if (unichars[i].properties.fragment != NULL) {
|
||||
delete unichars[i].properties.fragment;
|
||||
unichars[i].properties.fragment = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Clear the UNICHARSET (all the previous data is lost).
|
||||
void clear() {
|
||||
@ -78,6 +192,7 @@ class UNICHARSET {
|
||||
script_table = 0;
|
||||
script_table_size_reserved = 0;
|
||||
script_table_size_used = 0;
|
||||
delete_pointers_in_unichars();
|
||||
delete[] unichars;
|
||||
unichars = 0;
|
||||
size_reserved = 0;
|
||||
@ -94,13 +209,34 @@ class UNICHARSET {
|
||||
// Reserve enough memory space for the given number of UNICHARS
|
||||
void reserve(int unichars_number);
|
||||
|
||||
// Save the content of the UNICHARSET to the given file. Return true if the
|
||||
// operation is successful.
|
||||
bool save_to_file(const char* const filename) const;
|
||||
// Opens the file indicated by filename and saves unicharset to that file.
|
||||
// Returns true if the operation is successful.
|
||||
bool save_to_file(const char * const filename) const {
|
||||
FILE* file = fopen(filename, "w+");
|
||||
if (file == NULL) return false;
|
||||
bool result = save_to_file(file);
|
||||
fclose(file);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Load the UNICHARSET from the given file. The previous data is lost. Return
|
||||
// true if the operation is successful.
|
||||
bool load_from_file(const char* const filename);
|
||||
// Saves the content of the UNICHARSET to the given file.
|
||||
// Returns true if the operation is successful.
|
||||
bool save_to_file(FILE *file) const;
|
||||
|
||||
// Opens the file indicated by filename and loads the UNICHARSET
|
||||
// from the given file. The previous data is lost.
|
||||
// Returns true if the operation is successful.
|
||||
bool load_from_file(const char* const filename) {
|
||||
FILE* file = fopen(filename, "r");
|
||||
if (file == NULL) return false;
|
||||
bool result = load_from_file(file);
|
||||
fclose(file);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Loads the UNICHARSET from the given file. The previous data is lost.
|
||||
// Returns true if the operation is successful.
|
||||
bool load_from_file(FILE *file);
|
||||
|
||||
// Set a whitelist and/or blacklist of characters to recognize.
|
||||
// An empty or NULL whitelist enables everything (minus any blacklist).
|
||||
@ -131,10 +267,25 @@ class UNICHARSET {
|
||||
unichars[unichar_id].properties.isdigit = value;
|
||||
}
|
||||
|
||||
// Set the ispunctuation property of the given unichar to the given value.
|
||||
void set_ispunctuation(UNICHAR_ID unichar_id, bool value) {
|
||||
unichars[unichar_id].properties.ispunctuation = value;
|
||||
}
|
||||
|
||||
// Set the isngram property of the given unichar to the given value.
|
||||
void set_isngram(UNICHAR_ID unichar_id, bool value) {
|
||||
unichars[unichar_id].properties.isngram = value;
|
||||
}
|
||||
|
||||
// Set the script name of the given unichar to the given value.
|
||||
// Value is copied and thus can be a temporary;
|
||||
void set_script(UNICHAR_ID unichar_id, const char* value) {
|
||||
unichars[unichar_id].properties.script = add_script(value);
|
||||
unichars[unichar_id].properties.script_id = add_script(value);
|
||||
}
|
||||
|
||||
// Set other_case unichar id in the properties for the given unichar id.
|
||||
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case) {
|
||||
unichars[unichar_id].properties.other_case = other_case;
|
||||
}
|
||||
|
||||
// Return the isalpha property of the given unichar.
|
||||
@ -157,11 +308,44 @@ class UNICHARSET {
|
||||
return unichars[unichar_id].properties.isdigit;
|
||||
}
|
||||
|
||||
// Return the ispunctuation property of the given unichar.
|
||||
bool get_ispunctuation(UNICHAR_ID unichar_id) const {
|
||||
return unichars[unichar_id].properties.ispunctuation;
|
||||
}
|
||||
|
||||
// Return the isngram property of the given unichar.
|
||||
bool get_isngram(UNICHAR_ID unichar_id) const {
|
||||
return unichars[unichar_id].properties.isngram;
|
||||
}
|
||||
|
||||
// Return the script name of the given unichar.
|
||||
// The returned pointer will always be the same for the same script, it's
|
||||
// managed by unicharset and thus MUST NOT be deleted
|
||||
const char* get_script(UNICHAR_ID unichar_id) const {
|
||||
return unichars[unichar_id].properties.script;
|
||||
int get_script(UNICHAR_ID unichar_id) const {
|
||||
return unichars[unichar_id].properties.script_id;
|
||||
}
|
||||
|
||||
// Get other_case unichar id in the properties for the given unichar id.
|
||||
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {
|
||||
return unichars[unichar_id].properties.other_case;
|
||||
}
|
||||
|
||||
// Returns UNICHAR_ID of the corresponding lower-case unichar.
|
||||
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
|
||||
if (unichars[unichar_id].properties.islower) return unichar_id;
|
||||
return unichars[unichar_id].properties.other_case;
|
||||
}
|
||||
|
||||
// Returns UNICHAR_ID of the corresponding upper-case unichar.
|
||||
UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
|
||||
if (unichars[unichar_id].properties.isupper) return unichar_id;
|
||||
return unichars[unichar_id].properties.other_case;
|
||||
}
|
||||
|
||||
// Return a pointer to the CHAR_FRAGMENT class if the given
|
||||
// unichar id represents a character fragment.
|
||||
const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
|
||||
return unichars[unichar_id].properties.fragment;
|
||||
}
|
||||
|
||||
// Return the isalpha property of the given unichar representation.
|
||||
@ -184,13 +368,28 @@ class UNICHARSET {
|
||||
return get_isdigit(unichar_to_id(unichar_repr));
|
||||
}
|
||||
|
||||
// Return the ispunctuation property of the given unichar representation.
|
||||
bool get_ispunctuation(const char* const unichar_repr) const {
|
||||
return get_ispunctuation(unichar_to_id(unichar_repr));
|
||||
}
|
||||
|
||||
// Return the script name of the given unichar representation.
|
||||
// The returned pointer will always be the same for the same script, it's
|
||||
// managed by unicharset and thus MUST NOT be deleted
|
||||
const char* get_script(const char* const unichar_repr) const {
|
||||
int get_script(const char* const unichar_repr) const {
|
||||
return get_script(unichar_to_id(unichar_repr));
|
||||
}
|
||||
|
||||
// Return a pointer to the CHAR_FRAGMENT class struct if the given
|
||||
// unichar representation represents a character fragment.
|
||||
const CHAR_FRAGMENT *get_fragment(const char* const unichar_repr) const {
|
||||
if (unichar_repr == NULL || unichar_repr[0] == '\0' ||
|
||||
!ids.contains(unichar_repr)) {
|
||||
return NULL;
|
||||
}
|
||||
return get_fragment(unichar_to_id(unichar_repr));
|
||||
}
|
||||
|
||||
// Return the isalpha property of the given unichar representation.
|
||||
// Only the first length characters from unichar_repr are used.
|
||||
bool get_isalpha(const char* const unichar_repr,
|
||||
@ -219,34 +418,82 @@ class UNICHARSET {
|
||||
return get_isdigit(unichar_to_id(unichar_repr, length));
|
||||
}
|
||||
|
||||
// Return the ispunctuation property of the given unichar representation.
|
||||
// Only the first length characters from unichar_repr are used.
|
||||
bool get_ispunctuation(const char* const unichar_repr,
|
||||
int length) const {
|
||||
return get_ispunctuation(unichar_to_id(unichar_repr, length));
|
||||
}
|
||||
|
||||
// Return the script name of the given unichar representation.
|
||||
// Only the first length characters from unichar_repr are used.
|
||||
// The returned pointer will always be the same for the same script, it's
|
||||
// managed by unicharset and thus MUST NOT be deleted
|
||||
const char* get_script(const char* const unichar_repr,
|
||||
int length) const {
|
||||
int get_script(const char* const unichar_repr,
|
||||
int length) const {
|
||||
return get_script(unichar_to_id(unichar_repr, length));
|
||||
}
|
||||
|
||||
// Return the (current) number of scripts in the script table
|
||||
int get_script_table_size() const {
|
||||
return script_table_size_used;
|
||||
}
|
||||
|
||||
// Return the script string from its id
|
||||
const char* get_script_from_script_id(int id) const {
|
||||
if (id >= script_table_size_used || id < 0)
|
||||
return null_script;
|
||||
return script_table[id];
|
||||
}
|
||||
|
||||
// Returns the id from the name of the script, or 0 if script is not found.
|
||||
// Note that this is an expensive operation since it involves iteratively
|
||||
// comparing strings in the script table. To avoid dependency on STL, we
|
||||
// won't use a hash. Instead, the calling function can use this to lookup
|
||||
// and save the ID for relevant scripts for fast comparisons later.
|
||||
int get_script_id_from_name(const char* script_name) const;
|
||||
|
||||
// Return true if the given script is the null script
|
||||
bool is_null_script(const char* script) const {
|
||||
return script == null_script;
|
||||
}
|
||||
|
||||
// Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
|
||||
// then the returned script id will be the same.
|
||||
// The script parameter is copied and thus can be a temporary.
|
||||
int add_script(const char* script);
|
||||
|
||||
// Return the enabled property of the given unichar.
|
||||
bool get_enabled(UNICHAR_ID unichar_id) const {
|
||||
return unichars[unichar_id].properties.enabled;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
// Uniquify the given script. For two scripts a and b, if strcmp(a, b) == 0,
|
||||
// then the returned pointer will be the same.
|
||||
// The script parameter is copied and thus can be a temporary.
|
||||
char* add_script(const char* script);
|
||||
// Read-only accessors for the script id members null_sid_, common_sid_,
// latin_sid_, cyrillic_sid_, greek_sid_ and han_sid_. Each returns the
// current value of the corresponding member.
int null_sid() const { return null_sid_; }
|
||||
int common_sid() const { return common_sid_; }
|
||||
int latin_sid() const { return latin_sid_; }
|
||||
int cyrillic_sid() const { return cyrillic_sid_; }
|
||||
int greek_sid() const { return greek_sid_; }
|
||||
int han_sid() const { return han_sid_; }
|
||||
|
||||
private:
|
||||
|
||||
struct UNICHAR_PROPERTIES {
|
||||
bool isalpha;
|
||||
bool islower;
|
||||
bool isupper;
|
||||
bool isdigit;
|
||||
bool ispunctuation;
|
||||
bool isngram;
|
||||
bool enabled;
|
||||
char* script;
|
||||
int script_id;
|
||||
UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar
|
||||
|
||||
// Contains meta information about the fragment if a unichar represents
|
||||
// a fragment of a character, otherwise should be set to NULL.
|
||||
// It is assumed that character fragments are added to the unicharset
|
||||
// after the corresponding 'base' characters.
|
||||
CHAR_FRAGMENT *fragment;
|
||||
};
|
||||
|
||||
struct UNICHAR_SLOT {
|
||||
@ -262,6 +509,16 @@ class UNICHARSET {
|
||||
int script_table_size_used;
|
||||
int script_table_size_reserved;
|
||||
const char* null_script;
|
||||
|
||||
// A few convenient script name-to-id mapping without using hash.
|
||||
// These are initialized when unicharset file is loaded. Anything
|
||||
// missing from this list can be looked up using get_script_id_from_name.
|
||||
int null_sid_;
|
||||
int common_sid_;
|
||||
int latin_sid_;
|
||||
int cyrillic_sid_;
|
||||
int greek_sid_;
|
||||
int han_sid_;
|
||||
};
|
||||
|
||||
#endif // THIRD_PARTY_TESSERACT_CCUTIL_UNICHARSET_H__
|
||||
#endif // TESSERACT_CCUTIL_UNICHARSET_H__
|
||||
|
198
ccutil/unicity_table.h
Normal file
198
ccutil/unicity_table.h
Normal file
@ -0,0 +1,198 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: unicity_table.h
|
||||
// Description: a class to uniquify objects, manipulating them using integers
|
||||
// ids.
|
||||
// Author: Samuel Charron
|
||||
//
|
||||
// (C) Copyright 2006, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCUTIL_UNICITY_TABLE_H_
|
||||
#define TESSERACT_CCUTIL_UNICITY_TABLE_H_
|
||||
|
||||
#include "callback.h"
|
||||
#include "errcode.h"
|
||||
#include "genericvector.h"
|
||||
|
||||
// A class to uniquify objects, manipulating them using integers ids.
|
||||
// T requirements:
|
||||
// operator= to add an element
|
||||
// default-constructible: allocating the internal table will call the default
|
||||
// constructor.
|
||||
template <typename T>
|
||||
class UnicityTable {
|
||||
public:
|
||||
UnicityTable();
|
||||
// Clear the structures and deallocate internal structures.
|
||||
~UnicityTable();
|
||||
|
||||
// Reserve some memory. If there is size or more elements, the table will
|
||||
// then allocate size * 2 elements.
|
||||
void reserve(int size);
|
||||
|
||||
// Return the size used.
|
||||
int size() const;
|
||||
|
||||
// Return the object from an id.
|
||||
T get(int id) const;
|
||||
|
||||
// Return the id of the T object.
|
||||
// This method NEEDS a compare_callback to be passed to
|
||||
// set_compare_callback.
|
||||
int get_id(T object) const;
|
||||
|
||||
// Return true if T is in the table
|
||||
bool contains(T object) const;
|
||||
|
||||
// Return true if the id is valid
|
||||
T contains_id(int id) const;
|
||||
|
||||
// Add an element in the table
|
||||
int push_back(T object);
|
||||
|
||||
// Add a callback to be called to delete the elements when the table took
|
||||
// their ownership.
|
||||
void set_clear_callback(Callback1<T>* cb);
|
||||
|
||||
// Add a callback to be called to compare the elements when needed (contains,
|
||||
// get_id, ...)
|
||||
void set_compare_callback(ResultCallback2<bool, T const &, T const &>* cb);
|
||||
|
||||
// Clear the table, calling the callback function if any.
|
||||
// All the owned Callbacks are also deleted.
|
||||
// If you don't want the Callbacks to be deleted, before calling clear, set
|
||||
// the callback to NULL.
|
||||
void clear();
|
||||
|
||||
// This method clear the current object, then, does a shallow copy of
|
||||
// its argument, and finally invalidate its argument.
|
||||
void move(UnicityTable<T>* from);
|
||||
|
||||
// Read/Write the table to a file. This does _NOT_ read/write the callbacks.
|
||||
// The Callback given must be permanent since they will be called more than
|
||||
// once. The given callback will be deleted at the end.
|
||||
void write(FILE* f, Callback2<FILE*, T const &>* cb);
|
||||
// swap is used to switch the endianness.
|
||||
void read(FILE* f, Callback3<FILE*, T*, bool>* cb, bool swap);
|
||||
|
||||
private:
|
||||
GenericVector<T> table_;
|
||||
// Mutable because Run method is not const
|
||||
mutable ResultCallback2<bool, T const &, T const &>* compare_cb_;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
class UnicityTableEqEq : public UnicityTable<T> {
|
||||
public:
|
||||
UnicityTableEqEq() {
|
||||
UnicityTable<T>::set_compare_callback(
|
||||
NewPermanentCallback(tesseract::cmp_eq<T>));
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
UnicityTable<T>::UnicityTable() :
|
||||
compare_cb_(0) {
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
UnicityTable<T>::~UnicityTable() {
|
||||
clear();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
int UnicityTable<T>::size() const{
|
||||
return table_.size();
|
||||
}
|
||||
|
||||
// Reserve some memory. If there is size or more elements, the table will
|
||||
// then allocate size * 2 elements.
|
||||
template <typename T>
|
||||
void UnicityTable<T>::reserve(int size) {
|
||||
table_.reserve(size);
|
||||
}
|
||||
|
||||
// Return the object from an id.
|
||||
template <typename T>
|
||||
T UnicityTable<T>::get(int id) const {
|
||||
return table_.get(id);
|
||||
}
|
||||
|
||||
// Return true if the id is valid
|
||||
template <typename T>
|
||||
T UnicityTable<T>::contains_id(int id) const {
|
||||
return table_.contains_index(id);
|
||||
}
|
||||
|
||||
// Return the id of the T object.
// Thin wrapper over table_.get_index(object). Callers in this file
// (contains, push_back) treat a result of -1 as "not present".
|
||||
template <typename T>
|
||||
int UnicityTable<T>::get_id(T object) const {
|
||||
return table_.get_index(object);
|
||||
}
|
||||
|
||||
// Return true if T is in the table
|
||||
template <typename T>
|
||||
bool UnicityTable<T>::contains(T object) const {
|
||||
return get_id(object) != -1;
|
||||
}
|
||||
|
||||
// Add an element in the table
|
||||
template <typename T>
|
||||
int UnicityTable<T>::push_back(T object) {
|
||||
int idx = get_id(object);
|
||||
if (idx == -1) {
|
||||
idx = table_.push_back(object);
|
||||
}
|
||||
return idx;
|
||||
}
|
||||
|
||||
// Add a callback to be called to delete the elements when the table took
|
||||
// their ownership. The callback is forwarded unchanged to table_.
|
||||
template <typename T>
|
||||
void UnicityTable<T>::set_clear_callback(Callback1<T>* cb) {
|
||||
table_.set_clear_callback(cb);
|
||||
}
|
||||
|
||||
// Add a callback to be called to compare the elements when needed (contains,
|
||||
// get_id, ...). The callback is forwarded to table_ and also kept in
// compare_cb_.
|
||||
template <typename T>
|
||||
void UnicityTable<T>::set_compare_callback(ResultCallback2<bool, T const &, T const &>* cb) {
|
||||
table_.set_compare_callback(cb);
|
||||
compare_cb_ = cb;
|
||||
}
|
||||
|
||||
// Clear the table, calling the callback function if any.
// Implemented by delegating to table_.clear(); compare_cb_ is not modified
// here.
|
||||
template <typename T>
|
||||
void UnicityTable<T>::clear() {
|
||||
table_.clear();
|
||||
}
|
||||
|
||||
// Write the table to file f by delegating to table_.write with the given
// per-element callback cb. The callbacks themselves are not written.
template <typename T>
|
||||
void UnicityTable<T>::write(FILE* f, Callback2<FILE*, T const &>* cb) {
|
||||
table_.write(f, cb);
|
||||
}
|
||||
|
||||
// Read the table from file f by delegating to table_.read with the given
// per-element callback cb; swap is passed through unchanged.
template <typename T>
|
||||
void UnicityTable<T>::read(FILE* f, Callback3<FILE*, T*, bool>* cb, bool swap) {
|
||||
table_.read(f, cb, swap);
|
||||
}
|
||||
|
||||
// This method clears the current object, then does a shallow copy of
|
||||
// its argument, and finally invalidates its argument.
// Implemented by delegating to table_.move(&from->table_); compare_cb_ of
// this object is not touched here.
|
||||
template <typename T>
|
||||
void UnicityTable<T>::move(UnicityTable<T>* from) {
|
||||
table_.move(&from->table_);
|
||||
}
|
||||
|
||||
#endif // TESSERACT_CCUTIL_UNICITY_TABLE_H_
|
@ -18,13 +18,14 @@
|
||||
**********************************************************************/
|
||||
|
||||
#include "mfcpch.h" //precompiled headers
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include "tprintf.h"
|
||||
//#include "ipeerr.h"
|
||||
#include "varable.h"
|
||||
|
||||
#include "scanutils.h"
|
||||
#include "tprintf.h"
|
||||
#include "varable.h"
|
||||
|
||||
#define PLUS '+' //flag states
|
||||
#define MINUS '-'
|
||||
@ -379,24 +380,23 @@ STRING_VARIABLE_CLIST *STRING_VARIABLE::get_head() { // access to static
|
||||
* Print the entire list of STRING_VARIABLEs.
|
||||
**********************************************************************/
|
||||
|
||||
void STRING_VARIABLE::print(FILE *fp // file to print on
|
||||
) {
|
||||
// list iterator
|
||||
STRING_VARIABLE_C_IT it = &head;
|
||||
void STRING_VARIABLE::print(FILE *fp) {
|
||||
STRING_VARIABLE_C_IT it = &head; // list iterator
|
||||
STRING_VARIABLE *elt; // current element
|
||||
|
||||
// Comments aren't allowed with string variables, so the # character can
|
||||
// be part of a string.
|
||||
if (fp == stdout) {
|
||||
tprintf("#Variables of type STRING_VARIABLE:\n");
|
||||
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
|
||||
elt = it.data();
|
||||
tprintf("%s #%s %s\n", elt->name, elt->value.string(), elt->info);
|
||||
tprintf("%s %s\n", elt->name, elt->value.string());
|
||||
}
|
||||
} else {
|
||||
fprintf(fp, "#Variables of type STRING_VARIABLE:\n");
|
||||
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
|
||||
elt = it.data();
|
||||
fprintf(fp, "%s #%s %s\n",
|
||||
elt->name, elt->value.string(), elt->info);
|
||||
fprintf(fp, "%s %s\n", elt->name, elt->value.string());
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -519,20 +519,14 @@ void double_VARIABLE::print(FILE *fp // file to print on
|
||||
* Values may have any whitespace after the name and are the rest of line.
|
||||
**********************************************************************/
|
||||
|
||||
DLLSYM BOOL8 read_variables_file(const char *file // name to read
|
||||
) {
|
||||
BOOL8 anyerr; // true if any error
|
||||
DLLSYM BOOL8 read_variables_file(const char *file, // name to read
|
||||
bool global_only // only set variables
|
||||
) { // starting with "global_"
|
||||
char flag; // file flag
|
||||
BOOL8 foundit; // found variable
|
||||
inT16 length; // length of line
|
||||
inT16 nameoffset; // offset for real name
|
||||
char *valptr; // value field
|
||||
char *stringend; // end of string value
|
||||
FILE *fp; // file pointer
|
||||
// iterators
|
||||
char line[MAX_PATH]; // input line
|
||||
|
||||
anyerr = FALSE;
|
||||
if (*file == PLUS) {
|
||||
flag = PLUS; // file has flag
|
||||
nameoffset = 1;
|
||||
@ -546,54 +540,48 @@ DLLSYM BOOL8 read_variables_file(const char *file // name to read
|
||||
|
||||
fp = fopen(file + nameoffset, "r");
|
||||
if (fp == NULL) {
|
||||
tprintf("read_variables_file:Can't open %s", file + nameoffset);
|
||||
tprintf("read_variables_file: Can't open %s\n", file + nameoffset);
|
||||
return TRUE; // can't open it
|
||||
}
|
||||
while (fgets (line, MAX_PATH, fp)) {
|
||||
return read_variables_from_fp(fp, -1, global_only);
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
bool read_variables_from_fp(FILE *fp, inT64 end_offset, bool global_only) {
|
||||
char line[MAX_PATH]; // input line
|
||||
bool anyerr = false; // true if any error
|
||||
bool foundit; // found variable
|
||||
inT16 length; // length of line
|
||||
char *valptr; // value field
|
||||
|
||||
while ((end_offset < 0 || ftell(fp) < end_offset) &&
|
||||
fgets(line, MAX_PATH, fp)) {
|
||||
if (line[0] != '\n' && line[0] != '#') {
|
||||
length = strlen (line);
|
||||
if (line[length - 1] == '\n')
|
||||
line[length - 1] = '\0'; // cut newline
|
||||
for (valptr = line; *valptr && *valptr != ' ' && *valptr != '\t';
|
||||
valptr++);
|
||||
if (*valptr) { //found blank
|
||||
*valptr = '\0'; //make name a string
|
||||
if (*valptr) { // found blank
|
||||
*valptr = '\0'; // make name a string
|
||||
do
|
||||
|
||||
valptr++; //find end of blanks
|
||||
valptr++; // find end of blanks
|
||||
while (*valptr == ' ' || *valptr == '\t');
|
||||
|
||||
if (*valptr && *valptr != '#') {
|
||||
//last char in string
|
||||
stringend = valptr + strlen (valptr) - 1;
|
||||
while (stringend != valptr) {
|
||||
while (stringend != valptr
|
||||
&& (*stringend == ' ' || *stringend == '\t'))
|
||||
// cut trailing blanks
|
||||
stringend--;
|
||||
stringend[1] = '\0'; // terminate string
|
||||
|
||||
while (stringend != valptr
|
||||
&& ((*stringend != ' ' && *stringend != '\t')
|
||||
|| stringend[1] != '#'))
|
||||
stringend--; // find word start
|
||||
}
|
||||
}
|
||||
}
|
||||
foundit = set_new_style_variable(line, valptr);
|
||||
if (global_only && strstr(line, kGlobalVariablePrefix) == NULL) continue;
|
||||
foundit = set_variable(line, valptr);
|
||||
|
||||
if (!foundit) {
|
||||
anyerr = TRUE; // had an error
|
||||
tprintf("read_variables_file:variable not found: %s\n",
|
||||
line);
|
||||
tprintf("read_variables_file: variable not found: %s\n", line);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
fclose(fp); // close file
|
||||
return anyerr;
|
||||
}
|
||||
|
||||
bool set_new_style_variable(const char *variable, const char* value) {
|
||||
bool set_variable(const char *variable, const char* value) {
|
||||
INT_VARIABLE_C_IT int_it = &INT_VARIABLE::head;
|
||||
BOOL_VARIABLE_C_IT BOOL_it = &BOOL_VARIABLE::head;
|
||||
STRING_VARIABLE_C_IT STRING_it = &STRING_VARIABLE::head;
|
||||
@ -606,10 +594,7 @@ bool set_new_style_variable(const char *variable, const char* value) {
|
||||
STRING_it.forward());
|
||||
if (!STRING_it.cycled_list()) {
|
||||
foundit = true; // found the varaible
|
||||
if (*value == '\0')
|
||||
STRING_it.data()->set_value((char *) NULL); // No value.
|
||||
else
|
||||
STRING_it.data()->set_value(value); // set its value
|
||||
STRING_it.data()->set_value(value); // set its value
|
||||
}
|
||||
|
||||
if (*value) {
|
||||
@ -624,7 +609,7 @@ bool set_new_style_variable(const char *variable, const char* value) {
|
||||
int_it.data()->set_value(intval); // set its value.
|
||||
}
|
||||
for (BOOL_it.mark_cycle_pt();
|
||||
!BOOL_it.cycled_list () && strcmp(variable, BOOL_it.data()->name);
|
||||
!BOOL_it.cycled_list() && strcmp(variable, BOOL_it.data()->name);
|
||||
BOOL_it.forward());
|
||||
if (!BOOL_it.cycled_list()) {
|
||||
if (*value == 'T' || *value == 't' ||
|
||||
|
@ -21,18 +21,27 @@
|
||||
#define VARABLE_H
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "clst.h"
|
||||
#include "strngs.h"
|
||||
|
||||
class DLLSYM INT_VARIABLE;
|
||||
|
||||
//read the file
|
||||
extern DLLSYM BOOL8 read_variables_file(const char *file //name to read
|
||||
);
|
||||
bool set_new_style_variable(const char *variable, const char* value);
|
||||
//print all vars
|
||||
extern DLLSYM void print_variables(FILE *fp //file to print on
|
||||
);
|
||||
// Read config file.
|
||||
extern DLLSYM BOOL8 read_variables_file(
|
||||
const char *file, // filename to read
|
||||
bool global_only); // only set variables starting with "global_"
|
||||
|
||||
// Read variables from the given file pointer (stop at end_offset).
|
||||
bool read_variables_from_fp(FILE *fp, inT64 end_offset, bool global_only);
|
||||
|
||||
// Set a variable to have the given value.
|
||||
bool set_variable(const char *variable, const char* value);
|
||||
|
||||
// Print variables to a file.
|
||||
extern DLLSYM void print_variables(FILE *fp);
|
||||
|
||||
const char kGlobalVariablePrefix[] = "global_";
|
||||
|
||||
CLISTIZEH (INT_VARIABLE)
|
||||
class DLLSYM INT_VAR_FROM
|
||||
@ -57,7 +66,7 @@ class DLLSYM INT_VARIABLE
|
||||
friend class INT_VAR_TO;
|
||||
friend class INT_VAR_FROM;
|
||||
//for setting values
|
||||
friend bool set_new_style_variable(const char *variable, const char* value);
|
||||
friend bool set_variable(const char *variable, const char* value);
|
||||
|
||||
public:
|
||||
INT_VARIABLE(inT32 v, // initial value
|
||||
@ -124,7 +133,7 @@ class DLLSYM BOOL_VARIABLE {
|
||||
friend class BOOL_VAR_FROM;
|
||||
friend class BOOL_VAR_TO;
|
||||
//for setting values
|
||||
friend bool set_new_style_variable(const char *variable, const char* value);
|
||||
friend bool set_variable(const char *variable, const char* value);
|
||||
|
||||
public:
|
||||
BOOL_VARIABLE( //constructor
|
||||
@ -197,7 +206,7 @@ class DLLSYM STRING_VARIABLE
|
||||
friend class STRING_VAR_TO;
|
||||
friend class STRING_VAR_FROM;
|
||||
//for setting values
|
||||
friend bool set_new_style_variable(const char *variable, const char* value);
|
||||
friend bool set_variable(const char *variable, const char* value);
|
||||
|
||||
public:
|
||||
STRING_VARIABLE( //constructor
|
||||
@ -274,7 +283,7 @@ class DLLSYM double_VARIABLE
|
||||
friend class double_VAR_TO;
|
||||
friend class double_VAR_FROM;
|
||||
//for setting values
|
||||
friend bool set_new_style_variable(const char *variable, const char* value);
|
||||
friend bool set_variable(const char *variable, const char* value);
|
||||
|
||||
public:
|
||||
double_VARIABLE( //constructor
|
||||
|
Loading…
Reference in New Issue
Block a user