Added an extra Init overload that takes a memory buffer or a FileReader function pointer, enabling traineddata to be read from memory or from foreign file systems. Updated the existing readers to use the TFile API instead of FILE. This does not yet add big-endian capability to LSTM, but that is now very easy to add from here.

This commit is contained in:
Ray Smith 2017-04-27 15:48:23 -07:00
parent 10e04ffe99
commit 1cc511188d
48 changed files with 833 additions and 1199 deletions

View File

@ -108,26 +108,30 @@ const int kMinCredibleResolution = 70;
const int kMaxCredibleResolution = 2400;
TessBaseAPI::TessBaseAPI()
: tesseract_(NULL),
osd_tesseract_(NULL),
equ_detect_(NULL),
// Thresholder is initialized to NULL here, but will be set before use by:
// A constructor of a derived API, SetThresholder(), or
// created implicitly when used in InternalSetImage.
thresholder_(NULL),
paragraph_models_(NULL),
block_list_(NULL),
page_res_(NULL),
input_file_(NULL),
output_file_(NULL),
datapath_(NULL),
language_(NULL),
last_oem_requested_(OEM_DEFAULT),
recognition_done_(false),
truth_cb_(NULL),
rect_left_(0), rect_top_(0), rect_width_(0), rect_height_(0),
image_width_(0), image_height_(0) {
}
: tesseract_(nullptr),
osd_tesseract_(nullptr),
equ_detect_(nullptr),
reader_(nullptr),
// Thresholder is initialized to NULL here, but will be set before use by:
// A constructor of a derived API, SetThresholder(), or
// created implicitly when used in InternalSetImage.
thresholder_(nullptr),
paragraph_models_(nullptr),
block_list_(nullptr),
page_res_(nullptr),
input_file_(nullptr),
output_file_(nullptr),
datapath_(nullptr),
language_(nullptr),
last_oem_requested_(OEM_DEFAULT),
recognition_done_(false),
truth_cb_(NULL),
rect_left_(0),
rect_top_(0),
rect_width_(0),
rect_height_(0),
image_width_(0),
image_height_(0) {}
TessBaseAPI::~TessBaseAPI() {
End();
@ -275,20 +279,33 @@ int TessBaseAPI::Init(const char* datapath, const char* language,
const GenericVector<STRING> *vars_vec,
const GenericVector<STRING> *vars_values,
bool set_only_non_debug_params) {
return Init(datapath, 0, language, oem, configs, configs_size, vars_vec,
vars_values, set_only_non_debug_params, nullptr);
}
// In-memory version reads the traineddata file directly from the given
// data[data_size] array. Also implements the version with a datapath in data,
// flagged by data_size = 0.
int TessBaseAPI::Init(const char* data, int data_size, const char* language,
OcrEngineMode oem, char** configs, int configs_size,
const GenericVector<STRING>* vars_vec,
const GenericVector<STRING>* vars_values,
bool set_only_non_debug_params, FileReader reader) {
PERF_COUNT_START("TessBaseAPI::Init")
// Default language is "eng".
if (language == NULL) language = "eng";
if (language == nullptr) language = "eng";
STRING datapath = data_size == 0 ? data : language;
// If the datapath, OcrEngineMode or the language have changed - start again.
// Note that the language_ field stores the last requested language that was
// initialized successfully, while tesseract_->lang stores the language
// actually used. They differ only if the requested language was NULL, in
// which case tesseract_->lang is set to the Tesseract default ("eng").
if (tesseract_ != NULL &&
(datapath_ == NULL || language_ == NULL ||
*datapath_ != datapath || last_oem_requested_ != oem ||
if (tesseract_ != nullptr &&
(datapath_ == nullptr || language_ == nullptr || *datapath_ != datapath ||
last_oem_requested_ != oem ||
(*language_ != language && tesseract_->lang != language))) {
delete tesseract_;
tesseract_ = NULL;
tesseract_ = nullptr;
}
// PERF_COUNT_SUB("delete tesseract_")
#ifdef USE_OPENCL
@ -297,19 +314,25 @@ int TessBaseAPI::Init(const char* datapath, const char* language,
#endif
PERF_COUNT_SUB("OD::InitEnv()")
bool reset_classifier = true;
if (tesseract_ == NULL) {
if (tesseract_ == nullptr) {
reset_classifier = false;
tesseract_ = new Tesseract;
if (reader != nullptr) reader_ = reader;
TessdataManager mgr(reader_);
if (data_size != 0) {
mgr.LoadMemBuffer(language, data, data_size);
}
if (tesseract_->init_tesseract(
datapath, output_file_ != NULL ? output_file_->string() : NULL,
language, oem, configs, configs_size, vars_vec, vars_values,
set_only_non_debug_params) != 0) {
datapath.string(),
output_file_ != nullptr ? output_file_->string() : nullptr,
language, oem, configs, configs_size, vars_vec, vars_values,
set_only_non_debug_params, &mgr) != 0) {
return -1;
}
}
PERF_COUNT_SUB("update tesseract_")
// Update datapath and language requested for the last valid initialization.
if (datapath_ == NULL)
if (datapath_ == nullptr)
datapath_ = new STRING(datapath);
else
*datapath_ = datapath;
@ -317,7 +340,7 @@ int TessBaseAPI::Init(const char* datapath, const char* language,
(strcmp(tesseract_->datadir.string(), "") != 0))
*datapath_ = tesseract_->datadir;
if (language_ == NULL)
if (language_ == nullptr)
language_ = new STRING(language);
else
*language_ = language;
@ -421,7 +444,8 @@ int TessBaseAPI::InitLangMod(const char* datapath, const char* language) {
tesseract_ = new Tesseract;
else
ParamUtils::ResetToDefaults(tesseract_->params());
return tesseract_->init_tesseract_lm(datapath, NULL, language);
TessdataManager mgr;
return tesseract_->init_tesseract_lm(datapath, NULL, language, &mgr);
}
/**
@ -431,7 +455,7 @@ int TessBaseAPI::InitLangMod(const char* datapath, const char* language) {
void TessBaseAPI::InitForAnalysePage() {
if (tesseract_ == NULL) {
tesseract_ = new Tesseract;
tesseract_->InitAdaptiveClassifier(false);
tesseract_->InitAdaptiveClassifier(nullptr);
}
}
@ -2239,7 +2263,7 @@ int TessBaseAPI::FindLines() {
}
if (tesseract_ == NULL) {
tesseract_ = new Tesseract;
tesseract_->InitAdaptiveClassifier(false);
tesseract_->InitAdaptiveClassifier(nullptr);
}
if (tesseract_->pix_binary() == NULL)
Threshold(tesseract_->mutable_pix_binary());
@ -2261,14 +2285,16 @@ int TessBaseAPI::FindLines() {
Tesseract* osd_tess = osd_tesseract_;
OSResults osr;
if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) && osd_tess == NULL) {
if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) &&
osd_tess == nullptr) {
if (strcmp(language_->string(), "osd") == 0) {
osd_tess = tesseract_;
} else {
osd_tesseract_ = new Tesseract;
if (osd_tesseract_->init_tesseract(
datapath_->string(), NULL, "osd", OEM_TESSERACT_ONLY,
NULL, 0, NULL, NULL, false) == 0) {
TessdataManager mgr(reader_);
if (osd_tesseract_->init_tesseract(datapath_->string(), nullptr, "osd",
OEM_TESSERACT_ONLY, nullptr, 0,
nullptr, nullptr, false, &mgr) == 0) {
osd_tess = osd_tesseract_;
osd_tesseract_->set_source_resolution(
thresholder_->GetSourceYResolution());
@ -2276,7 +2302,7 @@ int TessBaseAPI::FindLines() {
tprintf("Warning: Auto orientation and script detection requested,"
" but osd language failed to load\n");
delete osd_tesseract_;
osd_tesseract_ = NULL;
osd_tesseract_ = nullptr;
}
}
}

View File

@ -29,14 +29,15 @@
// To avoid collision with other typenames include the ABSOLUTE MINIMUM
// complexity of includes here. Use forward declarations wherever possible
// and hide includes of complex types in baseapi.cpp.
#include "platform.h"
#include "apitypes.h"
#include "pageiterator.h"
#include "platform.h"
#include "publictypes.h"
#include "resultiterator.h"
#include "serialis.h"
#include "tesscallback.h"
#include "thresholder.h"
#include "unichar.h"
#include "tesscallback.h"
#include "publictypes.h"
#include "pageiterator.h"
#include "resultiterator.h"
template <typename T> class GenericVector;
class PAGE_RES;
@ -237,6 +238,13 @@ class TESS_API TessBaseAPI {
// Convenience overload: forwards to the full Init with OEM_DEFAULT as the
// engine mode, no config files (null configs, configs_size 0), no variable
// overrides (null vars_vec/vars_values) and set_only_non_debug_params false.
int Init(const char* datapath, const char* language) {
  return Init(datapath, language, OEM_DEFAULT, nullptr, 0, nullptr, nullptr,
              false);
}
// In-memory version reads the traineddata file directly from the given
// data[data_size] array, and/or reads data via a FileReader.
int Init(const char* data, int data_size, const char* language,
OcrEngineMode mode, char** configs, int configs_size,
const GenericVector<STRING>* vars_vec,
const GenericVector<STRING>* vars_values,
bool set_only_non_debug_params, FileReader reader);
/**
* Returns the languages string used in the last valid initialization.
@ -859,6 +867,7 @@ class TESS_API TessBaseAPI {
Tesseract* tesseract_; ///< The underlying data object.
Tesseract* osd_tesseract_; ///< For orientation & script detection.
EquationDetect* equ_detect_; ///<The equation detector.
FileReader reader_; ///< Reads files from any filesystem.
ImageThresholder* thresholder_; ///< Image thresholding module.
GenericVector<ParagraphModel *>* paragraph_models_;
BLOCK_LIST* block_list_; ///< The page layout.

View File

@ -92,8 +92,8 @@ bool Tesseract::init_tesseract_lang_data(
const char *arg0, const char *textbase, const char *language,
OcrEngineMode oem, char **configs, int configs_size,
const GenericVector<STRING> *vars_vec,
const GenericVector<STRING> *vars_values,
bool set_only_non_debug_params) {
const GenericVector<STRING> *vars_values, bool set_only_non_debug_params,
TessdataManager *mgr) {
// Set the basename, compute the data directory.
main_setup(arg0, textbase);
@ -105,16 +105,28 @@ bool Tesseract::init_tesseract_lang_data(
// Initialize TessdataManager.
STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
if (!tessdata_manager.Init(tessdata_path.string(),
tessdata_manager_debug_level)) {
return false;
if (!mgr->is_loaded() && !mgr->Init(tessdata_path.string())) {
// Try without tessdata.
m_data_sub_dir.set_value("");
main_setup(arg0, textbase);
language_data_path_prefix = datadir;
language_data_path_prefix += lang;
language_data_path_prefix += ".";
tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
if (!mgr->Init(tessdata_path.string())) {
tprintf("Error opening data file %s\n", tessdata_path.string());
tprintf(
"Please make sure the TESSDATA_PREFIX environment variable is set"
" to your \"tessdata\" directory.\n");
return false;
}
}
if (oem == OEM_DEFAULT) {
// Set the engine mode from availability, which can then be overridden by
// the config file when we read it below.
if (!tessdata_manager.IsLSTMAvailable()) {
if (!mgr->IsLSTMAvailable()) {
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
} else if (!tessdata_manager.IsBaseAvailable()) {
} else if (!mgr->IsBaseAvailable()) {
tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
} else {
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED);
@ -122,14 +134,10 @@ bool Tesseract::init_tesseract_lang_data(
}
// If a language specific config file (lang.config) exists, load it in.
if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) {
ParamUtils::ReadParamsFromFp(
tessdata_manager.GetDataFilePtr(),
tessdata_manager.GetEndOffset(TESSDATA_LANG_CONFIG),
SET_PARAM_CONSTRAINT_NONE, this->params());
if (tessdata_manager_debug_level) {
tprintf("Loaded language config file\n");
}
TFile fp;
if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp,
this->params());
}
SetParamConstraint set_params_constraint = set_only_non_debug_params ?
@ -159,10 +167,6 @@ bool Tesseract::init_tesseract_lang_data(
if (params_file != NULL) {
ParamUtils::PrintParams(params_file, this->params());
fclose(params_file);
if (tessdata_manager_debug_level > 0) {
tprintf("Wrote parameters to %s\n",
tessedit_write_params_to_file.string());
}
} else {
tprintf("Failed to open %s for writing params.\n",
tessedit_write_params_to_file.string());
@ -171,17 +175,10 @@ bool Tesseract::init_tesseract_lang_data(
// Determine which ocr engine(s) should be loaded and used for recognition.
if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
if (tessdata_manager_debug_level) {
tprintf("Loading Tesseract/LSTM with tessedit_ocr_engine_mode %d\n",
static_cast<int>(tessedit_ocr_engine_mode));
}
// If we are only loading the config file (and so not planning on doing any
// recognition) then there's nothing else to do here.
if (tessedit_init_config_only) {
if (tessdata_manager_debug_level) {
tprintf("Returning after loading config file\n");
}
return true;
}
@ -191,17 +188,14 @@ bool Tesseract::init_tesseract_lang_data(
#ifndef ANDROID_BUILD
if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
if (tessdata_manager.swap()) {
if (mgr->swap()) {
tprintf("Error: LSTM requested on big-endian hardware!!\n");
tprintf("Big-endian not yet supported! Loading tesseract.\n");
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
} else if (tessdata_manager.SeekToStart(TESSDATA_LSTM)) {
} else if (mgr->GetComponent(TESSDATA_LSTM, &fp)) {
lstm_recognizer_ = new LSTMRecognizer;
TFile fp;
fp.Open(tessdata_manager.GetDataFilePtr(), -1);
ASSERT_HOST(lstm_recognizer_->DeSerialize(tessdata_manager.swap(), &fp));
if (lstm_use_matrix)
lstm_recognizer_->LoadDictionary(tessdata_path.string(), language);
ASSERT_HOST(lstm_recognizer_->DeSerialize(mgr->swap(), &fp));
if (lstm_use_matrix) lstm_recognizer_->LoadDictionary(language, mgr);
} else {
tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
@ -215,15 +209,14 @@ bool Tesseract::init_tesseract_lang_data(
#ifndef ANDROID_BUILD
unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
#endif
} else if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) ||
!unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) {
} else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) ||
!unicharset.load_from_file(&fp, false)) {
return false;
}
if (unicharset.size() > MAX_NUM_CLASSES) {
tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
return false;
}
if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n");
right_to_left_ = unicharset.major_right_to_left();
// Setup initial unichar ambigs table and read universal ambigs.
@ -232,16 +225,10 @@ bool Tesseract::init_tesseract_lang_data(
unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
if (!tessedit_ambigs_training &&
tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) {
TFile ambigs_file;
ambigs_file.Open(tessdata_manager.GetDataFilePtr(),
tessdata_manager.GetEndOffset(TESSDATA_AMBIGS) + 1);
unichar_ambigs.LoadUnicharAmbigs(
encoder_unicharset,
&ambigs_file,
ambigs_debug_level, use_ambigs_for_adaption, &unicharset);
if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {
unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp,
ambigs_debug_level,
use_ambigs_for_adaption, &unicharset);
}
// Init ParamsModel.
// Load pass1 and pass2 weights (for now these two sets are the same, but in
@ -250,15 +237,12 @@ bool Tesseract::init_tesseract_lang_data(
p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {
language_model_->getParamsModel().SetPass(
static_cast<ParamsModel::PassEnum>(p));
if (tessdata_manager.SeekToStart(TESSDATA_PARAMS_MODEL)) {
if (!language_model_->getParamsModel().LoadFromFp(
lang.string(), tessdata_manager.GetDataFilePtr(),
tessdata_manager.GetEndOffset(TESSDATA_PARAMS_MODEL))) {
if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
if (!language_model_->getParamsModel().LoadFromFp(lang.string(), &fp)) {
return false;
}
}
}
if (tessdata_manager_debug_level) language_model_->getParamsModel().Print();
return true;
}
@ -303,8 +287,6 @@ void Tesseract::ParseLanguageString(const char* lang_str,
remains = next;
// Check whether lang_code is already in the target vector and add.
if (!IsStrInList(lang_code, *target)) {
if (tessdata_manager_debug_level)
tprintf("Adding language '%s' to list\n", lang_code.string());
target->push_back(lang_code);
}
}
@ -314,12 +296,13 @@ void Tesseract::ParseLanguageString(const char* lang_str,
// string and recursively any additional languages required by any language
// traineddata file (via tessedit_load_sublangs in its config) that is loaded.
// See init_tesseract_internal for args.
int Tesseract::init_tesseract(
const char *arg0, const char *textbase, const char *language,
OcrEngineMode oem, char **configs, int configs_size,
const GenericVector<STRING> *vars_vec,
const GenericVector<STRING> *vars_values,
bool set_only_non_debug_params) {
int Tesseract::init_tesseract(const char *arg0, const char *textbase,
const char *language, OcrEngineMode oem,
char **configs, int configs_size,
const GenericVector<STRING> *vars_vec,
const GenericVector<STRING> *vars_values,
bool set_only_non_debug_params,
TessdataManager *mgr) {
GenericVector<STRING> langs_to_load;
GenericVector<STRING> langs_not_to_load;
ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
@ -341,15 +324,15 @@ int Tesseract::init_tesseract(
}
int result = tess_to_init->init_tesseract_internal(
arg0, textbase, lang_str, oem, configs, configs_size,
vars_vec, vars_values, set_only_non_debug_params);
arg0, textbase, lang_str, oem, configs, configs_size, vars_vec,
vars_values, set_only_non_debug_params, mgr);
// Forget that language, but keep any reader we were given.
mgr->Clear();
if (!loaded_primary) {
if (result < 0) {
tprintf("Failed loading language '%s'\n", lang_str);
} else {
if (tessdata_manager_debug_level)
tprintf("Loaded language '%s' as main language\n", lang_str);
ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
&langs_to_load, &langs_not_to_load);
loaded_primary = true;
@ -359,8 +342,6 @@ int Tesseract::init_tesseract(
tprintf("Failed loading language '%s'\n", lang_str);
delete tess_to_init;
} else {
if (tessdata_manager_debug_level)
tprintf("Loaded language '%s' as secondary language\n", lang_str);
sub_langs_.push_back(tess_to_init);
// Add any languages that this language requires
ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
@ -385,16 +366,11 @@ int Tesseract::init_tesseract(
this->language_model_->getParamsModel());
}
tprintf("Using params model of the primary language\n");
if (tessdata_manager_debug_level) {
this->language_model_->getParamsModel().Print();
}
} else {
this->language_model_->getParamsModel().Clear();
for (int s = 0; s < sub_langs_.size(); ++s) {
sub_langs_[s]->language_model_->getParamsModel().Clear();
}
if (tessdata_manager_debug_level)
tprintf("Using default language params\n");
}
}
@ -418,26 +394,26 @@ int Tesseract::init_tesseract(
// in vars_vec.
// If set_only_init_params is true, then only the initialization variables
// will be set.
int Tesseract::init_tesseract_internal(
const char *arg0, const char *textbase, const char *language,
OcrEngineMode oem, char **configs, int configs_size,
const GenericVector<STRING> *vars_vec,
const GenericVector<STRING> *vars_values,
bool set_only_non_debug_params) {
int Tesseract::init_tesseract_internal(const char *arg0, const char *textbase,
const char *language, OcrEngineMode oem,
char **configs, int configs_size,
const GenericVector<STRING> *vars_vec,
const GenericVector<STRING> *vars_values,
bool set_only_non_debug_params,
TessdataManager *mgr) {
if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
configs_size, vars_vec, vars_values,
set_only_non_debug_params)) {
set_only_non_debug_params, mgr)) {
return -1;
}
if (tessedit_init_config_only) {
tessdata_manager.End();
return 0;
}
// If only LSTM will be used, skip loading Tesseract classifier's
// pre-trained templates and dictionary.
bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY;
program_editup(textbase, init_tesseract, init_tesseract);
tessdata_manager.End();
program_editup(textbase, init_tesseract ? mgr : nullptr,
init_tesseract ? mgr : nullptr);
return 0; //Normal exit
}
@ -482,16 +458,14 @@ void Tesseract::SetupUniversalFontIds() {
}
// init the LM component
int Tesseract::init_tesseract_lm(const char *arg0,
const char *textbase,
const char *language) {
int Tesseract::init_tesseract_lm(const char *arg0, const char *textbase,
const char *language, TessdataManager *mgr) {
if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
NULL, 0, NULL, NULL, false))
NULL, 0, NULL, NULL, false, mgr))
return -1;
getDict().SetupForLoad(Dict::GlobalDawgCache());
getDict().Load(tessdata_manager.GetDataFileName().string(), lang);
getDict().Load(lang, mgr);
getDict().FinishLoad();
tessdata_manager.End();
return 0;
}

View File

@ -466,10 +466,6 @@ Tesseract::Tesseract()
STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()),
BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word",
this->params()),
INT_MEMBER(tessdata_manager_debug_level, 0,
"Debug level for"
" TessdataManager functions.",
this->params()),
STRING_MEMBER(tessedit_load_sublangs, "",
"List of languages to load with this one", this->params()),
BOOL_MEMBER(tessedit_use_primary_params_model, false,

View File

@ -496,20 +496,17 @@ class Tesseract : public Wordrec {
// string and recursively any additional languages required by any language
// traineddata file (via tessedit_load_sublangs in its config) that is loaded.
// See init_tesseract_internal for args.
int init_tesseract(const char *arg0,
const char *textbase,
const char *language,
OcrEngineMode oem,
char **configs,
int configs_size,
const GenericVector<STRING> *vars_vec,
const GenericVector<STRING> *vars_values,
bool set_only_init_params);
int init_tesseract(const char* arg0, const char* textbase,
const char* language, OcrEngineMode oem, char** configs,
int configs_size, const GenericVector<STRING>* vars_vec,
const GenericVector<STRING>* vars_values,
bool set_only_init_params, TessdataManager* mgr);
int init_tesseract(const char *datapath,
const char *language,
OcrEngineMode oem) {
return init_tesseract(datapath, NULL, language, oem,
NULL, 0, NULL, NULL, false);
TessdataManager mgr;
return init_tesseract(datapath, NULL, language, oem, NULL, 0, NULL, NULL,
false, &mgr);
}
// Common initialization for a single language.
// arg0 is the datapath for the tessdata directory, which could be the
@ -527,36 +524,30 @@ class Tesseract : public Wordrec {
// in vars_vec.
// If set_only_init_params is true, then only the initialization variables
// will be set.
int init_tesseract_internal(const char *arg0,
const char *textbase,
const char *language,
OcrEngineMode oem,
char **configs,
int configs_size,
const GenericVector<STRING> *vars_vec,
const GenericVector<STRING> *vars_values,
bool set_only_init_params);
int init_tesseract_internal(const char* arg0, const char* textbase,
const char* language, OcrEngineMode oem,
char** configs, int configs_size,
const GenericVector<STRING>* vars_vec,
const GenericVector<STRING>* vars_values,
bool set_only_init_params, TessdataManager* mgr);
// Set the universal_id member of each font to be unique among all
// instances of the same font loaded.
void SetupUniversalFontIds();
int init_tesseract_lm(const char *arg0,
const char *textbase,
const char *language);
int init_tesseract_lm(const char* arg0, const char* textbase,
const char* language, TessdataManager* mgr);
void recognize_page(STRING& image_name);
void end_tesseract();
bool init_tesseract_lang_data(const char *arg0,
const char *textbase,
const char *language,
OcrEngineMode oem,
char **configs,
int configs_size,
const GenericVector<STRING> *vars_vec,
const GenericVector<STRING> *vars_values,
bool set_only_init_params);
bool init_tesseract_lang_data(const char* arg0, const char* textbase,
const char* language, OcrEngineMode oem,
char** configs, int configs_size,
const GenericVector<STRING>* vars_vec,
const GenericVector<STRING>* vars_values,
bool set_only_init_params,
TessdataManager* mgr);
void ParseLanguageString(const char* lang_str,
GenericVector<STRING>* to_load,
@ -1074,8 +1065,6 @@ class Tesseract : public Wordrec {
BOOL_VAR_H(interactive_display_mode, false, "Run interactively?");
STRING_VAR_H(file_type, ".tif", "Filename extension");
BOOL_VAR_H(tessedit_override_permuter, true, "According to dict_word");
INT_VAR_H(tessdata_manager_debug_level, 0,
"Debug level for TessdataManager functions.");
STRING_VAR_H(tessedit_load_sublangs, "",
"List of languages to load with this one");
BOOL_VAR_H(tessedit_use_primary_params_model, false,

View File

@ -31,7 +31,7 @@ bool FontInfo::Serialize(FILE* fp) const {
}
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool FontInfo::DeSerialize(bool swap, FILE* fp) {
bool FontInfo::DeSerialize(bool swap, TFile* fp) {
if (!read_info(fp, this, swap)) return false;
if (!read_spacing_info(fp, this, swap)) return false;
return true;
@ -51,7 +51,7 @@ bool FontInfoTable::Serialize(FILE* fp) const {
}
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool FontInfoTable::DeSerialize(bool swap, FILE* fp) {
bool FontInfoTable::DeSerialize(bool swap, TFile* fp) {
truncate(0);
return this->DeSerializeClasses(swap, fp);
}
@ -149,19 +149,15 @@ void FontSetDeleteCallback(FontSet fs) {
/*---------------------------------------------------------------------------*/
// Callbacks used by UnicityTable to read/write FontInfo/FontSet structures.
bool read_info(FILE* f, FontInfo* fi, bool swap) {
bool read_info(TFile* f, FontInfo* fi, bool swap) {
inT32 size;
if (fread(&size, sizeof(size), 1, f) != 1) return false;
if (swap)
Reverse32(&size);
if (f->FReadEndian(&size, sizeof(size), 1, swap) != 1) return false;
char* font_name = new char[size + 1];
fi->name = font_name;
if (static_cast<int>(fread(font_name, sizeof(*font_name), size, f)) != size)
return false;
if (f->FRead(font_name, sizeof(*font_name), size) != size) return false;
font_name[size] = '\0';
if (fread(&fi->properties, sizeof(fi->properties), 1, f) != 1) return false;
if (swap)
Reverse32(&fi->properties);
if (f->FReadEndian(&fi->properties, sizeof(fi->properties), 1, swap) != 1)
return false;
return true;
}
@ -174,26 +170,22 @@ bool write_info(FILE* f, const FontInfo& fi) {
return true;
}
bool read_spacing_info(FILE *f, FontInfo* fi, bool swap) {
bool read_spacing_info(TFile* f, FontInfo* fi, bool swap) {
inT32 vec_size, kern_size;
if (fread(&vec_size, sizeof(vec_size), 1, f) != 1) return false;
if (swap) Reverse32(&vec_size);
if (f->FReadEndian(&vec_size, sizeof(vec_size), 1, swap) != 1) return false;
ASSERT_HOST(vec_size >= 0);
if (vec_size == 0) return true;
fi->init_spacing(vec_size);
for (int i = 0; i < vec_size; ++i) {
FontSpacingInfo *fs = new FontSpacingInfo();
if (fread(&fs->x_gap_before, sizeof(fs->x_gap_before), 1, f) != 1 ||
fread(&fs->x_gap_after, sizeof(fs->x_gap_after), 1, f) != 1 ||
fread(&kern_size, sizeof(kern_size), 1, f) != 1) {
if (f->FReadEndian(&fs->x_gap_before, sizeof(fs->x_gap_before), 1, swap) !=
1 ||
f->FReadEndian(&fs->x_gap_after, sizeof(fs->x_gap_after), 1, swap) !=
1 ||
f->FReadEndian(&kern_size, sizeof(kern_size), 1, swap) != 1) {
delete fs;
return false;
}
if (swap) {
ReverseN(&(fs->x_gap_before), sizeof(fs->x_gap_before));
ReverseN(&(fs->x_gap_after), sizeof(fs->x_gap_after));
Reverse32(&kern_size);
}
if (kern_size < 0) { // indication of a NULL entry in fi->spacing_vec
delete fs;
continue;
@ -237,16 +229,12 @@ bool write_spacing_info(FILE* f, const FontInfo& fi) {
return true;
}
bool read_set(FILE* f, FontSet* fs, bool swap) {
if (fread(&fs->size, sizeof(fs->size), 1, f) != 1) return false;
if (swap)
Reverse32(&fs->size);
bool read_set(TFile* f, FontSet* fs, bool swap) {
if (f->FReadEndian(&fs->size, sizeof(fs->size), 1, swap) != 1) return false;
fs->configs = new int[fs->size];
for (int i = 0; i < fs->size; ++i) {
if (fread(&fs->configs[i], sizeof(fs->configs[i]), 1, f) != 1) return false;
if (swap)
Reverse32(&fs->configs[i]);
}
if (f->FReadEndian(fs->configs, sizeof(fs->configs[0]), fs->size, swap) !=
fs->size)
return false;
return true;
}

View File

@ -67,7 +67,7 @@ struct FontInfo {
bool Serialize(FILE* fp) const;
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool DeSerialize(bool swap, FILE* fp);
bool DeSerialize(bool swap, TFile* fp);
// Reserves unicharset_size spots in spacing_vec.
void init_spacing(int unicharset_size) {
@ -152,7 +152,7 @@ class FontInfoTable : public GenericVector<FontInfo> {
bool Serialize(FILE* fp) const;
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool DeSerialize(bool swap, FILE* fp);
bool DeSerialize(bool swap, TFile* fp);
// Returns true if the given set of fonts includes one with the same
// properties as font_id.
@ -177,11 +177,11 @@ void FontInfoDeleteCallback(FontInfo f);
void FontSetDeleteCallback(FontSet fs);
// Callbacks used by UnicityTable to read/write FontInfo/FontSet structures.
bool read_info(FILE* f, FontInfo* fi, bool swap);
bool read_info(TFile* f, FontInfo* fi, bool swap);
bool write_info(FILE* f, const FontInfo& fi);
bool read_spacing_info(FILE *f, FontInfo* fi, bool swap);
bool read_spacing_info(TFile* f, FontInfo* fi, bool swap);
bool write_spacing_info(FILE* f, const FontInfo& fi);
bool read_set(FILE* f, FontSet* fs, bool swap);
bool read_set(TFile* f, FontSet* fs, bool swap);
bool write_set(FILE* f, const FontSet& fs);
} // namespace tesseract.

View File

@ -66,7 +66,6 @@ class CCUtil {
STRING imagebasename; // name of image
STRING lang;
STRING language_data_path_prefix;
TessdataManager tessdata_manager;
UNICHARSET unicharset;
UnicharAmbigs unichar_ambigs;
STRING imagefile; // image file name

View File

@ -162,7 +162,9 @@ class GenericVector {
// Returns false on error or if the callback returns false.
// DEPRECATED. Use [De]Serialize[Classes] instead.
bool write(FILE* f, TessResultCallback2<bool, FILE*, T const &>* cb) const;
bool read(FILE* f, TessResultCallback3<bool, FILE*, T*, bool>* cb, bool swap);
bool read(tesseract::TFile* f,
TessResultCallback3<bool, tesseract::TFile*, T*, bool>* cb,
bool swap);
// Writes a vector of simple types to the given file. Assumes that bitwise
// read/write of T will work. Returns false in case of error.
// TODO(rays) Change all callers to use TFile and remove deprecated methods.
@ -885,15 +887,14 @@ bool GenericVector<T>::write(
}
template <typename T>
bool GenericVector<T>::read(FILE* f,
TessResultCallback3<bool, FILE*, T*, bool>* cb,
bool swap) {
bool GenericVector<T>::read(
tesseract::TFile* f,
TessResultCallback3<bool, tesseract::TFile*, T*, bool>* cb, bool swap) {
inT32 reserved;
if (fread(&reserved, sizeof(reserved), 1, f) != 1) return false;
if (swap) Reverse32(&reserved);
if (f->FReadEndian(&reserved, sizeof(reserved), 1, swap) != 1) return false;
reserve(reserved);
if (fread(&size_used_, sizeof(size_used_), 1, f) != 1) return false;
if (swap) Reverse32(&size_used_);
if (f->FReadEndian(&size_used_, sizeof(size_used_), 1, swap) != 1)
return false;
if (cb != NULL) {
for (int i = 0; i < size_used_; ++i) {
if (!cb->Run(f, data_ + i, swap)) {
@ -903,11 +904,8 @@ bool GenericVector<T>::read(FILE* f,
}
delete cb;
} else {
if (fread(data_, sizeof(T), size_used_, f) != size_used_) return false;
if (swap) {
for (int i = 0; i < size_used_; ++i)
ReverseN(&data_[i], sizeof(T));
}
if (f->FReadEndian(data_, sizeof(T), size_used_, swap) != size_used_)
return false;
}
return true;
}

View File

@ -55,7 +55,7 @@ void CCUtil::main_setup(const char *argv0, const char *basename) {
char *tessdata_prefix = getenv("TESSDATA_PREFIX");
if (argv0 != NULL) {
if (argv0 != NULL && *argv0 != '\0') {
/* Use tessdata prefix from the command line. */
datadir = argv0;
} else if (tessdata_prefix) {

View File

@ -41,8 +41,6 @@ bool ParamUtils::ReadParamsFile(const char *file,
SetParamConstraint constraint,
ParamsVectors *member_params) {
inT16 nameoffset; // offset for real name
FILE *fp; // file pointer
// iterators
if (*file == PLUS) {
nameoffset = 1;
@ -52,26 +50,22 @@ bool ParamUtils::ReadParamsFile(const char *file,
nameoffset = 0;
}
fp = fopen(file + nameoffset, "rb");
if (fp == NULL) {
TFile fp;
if (!fp.Open(file + nameoffset, nullptr)) {
tprintf("read_params_file: Can't open %s\n", file + nameoffset);
return true;
}
const bool anyerr = ReadParamsFromFp(fp, -1, constraint, member_params);
fclose(fp);
return anyerr;
return ReadParamsFromFp(constraint, &fp, member_params);
}
bool ParamUtils::ReadParamsFromFp(FILE *fp, inT64 end_offset,
SetParamConstraint constraint,
bool ParamUtils::ReadParamsFromFp(SetParamConstraint constraint, TFile *fp,
ParamsVectors *member_params) {
char line[MAX_PATH]; // input line
bool anyerr = false; // true if any error
bool foundit; // found parameter
char *valptr; // value field
while ((end_offset < 0 || ftell(fp) < end_offset) &&
fgets(line, MAX_PATH, fp)) {
while (fp->FGets(line, MAX_PATH) != nullptr) {
if (line[0] != '\r' && line[0] != '\n' && line[0] != '#') {
chomp_string(line); // remove newline
for (valptr = line; *valptr && *valptr != ' ' && *valptr != '\t';

View File

@ -60,9 +60,8 @@ class ParamUtils {
SetParamConstraint constraint,
ParamsVectors *member_params);
// Read parameters from the given file pointer (stop at end_offset).
static bool ReadParamsFromFp(FILE *fp, inT64 end_offset,
SetParamConstraint constraint,
// Read parameters from the given file pointer.
static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp,
ParamsVectors *member_params);
// Set a parameter to have the given value.

View File

@ -88,6 +88,17 @@ char* TFile::FGets(char* buffer, int buffer_size) {
return size > 0 ? buffer : NULL;
}
// Reads count items of size bytes each via FRead and, when swap is set,
// reverses the byte order of every item that was actually read.
// Returns the number of items read, exactly as FRead reported it.
int TFile::FReadEndian(void* buffer, int size, int count, bool swap) {
  const int items_read = FRead(buffer, size, count);
  if (!swap) return items_read;
  // Only the items FRead delivered are touched; anything past a short read
  // is left as it is.
  char* item = static_cast<char*>(buffer);
  for (int done = 0; done < items_read; ++done) {
    ReverseN(item, size);
    item += size;
  }
  return items_read;
}
int TFile::FRead(void* buffer, int size, int count) {
ASSERT_HOST(!is_writing_);
int required_size = size * count;

View File

@ -67,6 +67,10 @@ class TFile {
// the line is longer. Does nothing if buffer_size <= 0.
// To use fscanf use FGets and sscanf.
char* FGets(char* buffer, int buffer_size);
// Replicates fread, followed by a swap of the bytes if needed, returning the
// number of items read. If swap is true then the count items will each have
// size bytes reversed.
int FReadEndian(void* buffer, int size, int count, bool swap);
// Replicates fread, returning the number of items read.
int FRead(void* buffer, int size, int count);
// Resets the TFile as if it has been Opened, but nothing read.

View File

@ -33,206 +33,192 @@
namespace tesseract {
bool TessdataManager::Init(const char *data_file_name, int debug_level) {
int i;
debug_level_ = debug_level;
// Lazily loads from the given filename. Won't actually read the file
// until it needs it.
void TessdataManager::LoadFileLater(const char *data_file_name) {
Clear();
data_file_name_ = data_file_name;
data_file_ = fopen(data_file_name, "rb");
if (data_file_ == NULL) {
tprintf("Error opening data file %s\n", data_file_name);
tprintf("Please make sure the TESSDATA_PREFIX environment variable is set "
"to the parent directory of your \"tessdata\" directory.\n");
}
// Reads the whole of data_file_name right now, using reader_ if one was
// supplied and LoadDataFromFile otherwise, then unpacks the contents with
// LoadMemBuffer.
// Returns false if the read fails, if nothing was read, or if LoadMemBuffer
// rejects the contents.
bool TessdataManager::Init(const char *data_file_name) {
  GenericVector<char> data;
  if (reader_ == nullptr) {
    if (!LoadDataFromFile(data_file_name, &data)) return false;
  } else {
    if (!(*reader_)(data_file_name, &data)) return false;
  }
  // An empty buffer has no element 0 to take the address of, so stop here
  // instead of indexing out of range.
  if (data.empty()) return false;
  return LoadMemBuffer(data_file_name, &data[0], data.size());
}
// Loads from the given memory buffer as if a file.
bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
int size) {
data_file_name_ = name;
TFile fp;
fp.Open(data, size);
inT32 num_entries = TESSDATA_NUM_ENTRIES;
if (fp.FRead(&num_entries, sizeof(num_entries), 1) != 1) return false;
swap_ = num_entries > kMaxNumTessdataEntries || num_entries < 0;
if (swap_) ReverseN(&num_entries, sizeof(num_entries));
GenericVector<inT64> offset_table;
offset_table.init_to_size(num_entries, -1);
if (fp.FReadEndian(&offset_table[0], sizeof(offset_table[0]), num_entries,
swap_) != num_entries)
return false;
}
fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_);
swap_ = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
if (swap_) {
ReverseN(&actual_tessdata_num_entries_,
sizeof(actual_tessdata_num_entries_));
}
if (actual_tessdata_num_entries_ > TESSDATA_NUM_ENTRIES) {
// For forward compatibility, truncate to the number we can handle.
actual_tessdata_num_entries_ = TESSDATA_NUM_ENTRIES;
}
fread(offset_table_, sizeof(inT64),
actual_tessdata_num_entries_, data_file_);
if (swap_) {
for (i = 0 ; i < actual_tessdata_num_entries_; ++i) {
ReverseN(&offset_table_[i], sizeof(offset_table_[i]));
}
}
if (debug_level_) {
tprintf("TessdataManager loaded %d types of tesseract data files.\n",
actual_tessdata_num_entries_);
for (i = 0; i < actual_tessdata_num_entries_; ++i) {
tprintf("Offset for type %d is %lld\n", i, offset_table_[i]);
for (int i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
if (offset_table[i] >= 0) {
inT64 entry_size = size - offset_table[i];
int j = i + 1;
while (j < num_entries && offset_table[j] == -1) ++j;
if (j < num_entries) entry_size = offset_table[j] - offset_table[i];
entries_[i].init_to_size(entry_size, 0);
if (fp.FRead(&entries_[i][0], 1, entry_size) != entry_size) return false;
}
}
is_loaded_ = true;
return true;
}
void TessdataManager::CopyFile(FILE *input_file, FILE *output_file,
bool newline_end, inT64 num_bytes_to_copy) {
if (num_bytes_to_copy == 0) return;
int buffer_size = 1024;
if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) {
buffer_size = num_bytes_to_copy;
}
inT64 num_bytes_copied = 0;
char *chunk = new char[buffer_size];
int bytes_read;
char last_char = 0x0;
while ((bytes_read = fread(chunk, sizeof(char),
buffer_size, input_file))) {
fwrite(chunk, sizeof(char), bytes_read, output_file);
last_char = chunk[bytes_read-1];
if (num_bytes_to_copy > 0) {
num_bytes_copied += bytes_read;
if (num_bytes_copied == num_bytes_to_copy) break;
if (num_bytes_copied + buffer_size > num_bytes_to_copy) {
buffer_size = num_bytes_to_copy - num_bytes_copied;
}
}
}
if (newline_end) ASSERT_HOST(last_char == '\n');
delete[] chunk;
// Overwrites a single entry of the given type with a copy of the size bytes
// at data, and marks the manager as loaded.
// A size of zero (or less) leaves the entry empty instead of copying.
void TessdataManager::OverwriteEntry(TessdataType type, const char *data,
                                     int size) {
  is_loaded_ = true;
  if (size <= 0) {
    // Nothing to copy, and an empty vector has no element 0 to copy into.
    entries_[type].clear();
    return;
  }
  entries_[type].init_to_size(size, 0);
  memcpy(&entries_[type][0], data, size);
}
bool TessdataManager::WriteMetadata(inT64 *offset_table,
const char * language_data_path_prefix,
FILE *output_file) {
inT32 num_entries = TESSDATA_NUM_ENTRIES;
bool result = true;
if (fseek(output_file, 0, SEEK_SET) != 0 ||
fwrite(&num_entries, sizeof(inT32), 1, output_file) != 1 ||
fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES,
output_file) != TESSDATA_NUM_ENTRIES) {
fclose(output_file);
result = false;
tprintf("WriteMetadata failed in TessdataManager!\n");
} else if (fclose(output_file)) {
result = false;
tprintf("WriteMetadata failed to close file!\n");
} else {
tprintf("TessdataManager combined tesseract data files.\n");
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
tprintf("Offset for type %2d (%s%-22s) is %lld\n", i,
language_data_path_prefix, kTessdataFileSuffixes[i],
offset_table[i]);
// Saves to the given filename.
bool TessdataManager::SaveFile(const STRING &filename,
FileWriter writer) const {
ASSERT_HOST(is_loaded_);
GenericVector<char> data;
Serialize(&data);
if (writer == nullptr)
return SaveDataToFile(data, filename);
else
return (*writer)(data, filename);
}
// Serializes to the given vector.
void TessdataManager::Serialize(GenericVector<char> *data) const {
ASSERT_HOST(is_loaded_);
// Compute the offset_table and total size.
inT64 offset_table[TESSDATA_NUM_ENTRIES];
inT64 offset = sizeof(inT32) + sizeof(offset_table);
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
if (entries_[i].empty()) {
offset_table[i] = -1;
} else {
offset_table[i] = offset;
offset += entries_[i].size();
}
}
return result;
data->init_to_size(offset, 0);
inT32 num_entries = TESSDATA_NUM_ENTRIES;
TFile fp;
fp.OpenWrite(data);
fp.FWrite(&num_entries, sizeof(num_entries), 1);
fp.FWrite(offset_table, sizeof(offset_table), 1);
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
if (!entries_[i].empty()) {
fp.FWrite(&entries_[i][0], entries_[i].size(), 1);
}
}
}
// Empties every entry and marks the manager as not loaded. reader_ is not
// touched.
void TessdataManager::Clear() {
  is_loaded_ = false;
  for (auto &entry : entries_) entry.clear();
}
// Prints a directory of contents.
void TessdataManager::Directory() const {
int offset = TESSDATA_NUM_ENTRIES * sizeof(inT64);
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
if (!entries_[i].empty()) {
tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i],
entries_[i].size(), offset);
offset += entries_[i].size();
}
}
}
// Points fp at the in-memory data for the given component type, running
// Init on the remembered file name first if nothing has been loaded yet.
// Returns false if that Init fails or if the component is empty.
bool TessdataManager::GetComponent(TessdataType type, TFile *fp) {
  if (!is_loaded_) {
    if (!Init(data_file_name_.string())) return false;
  }
  auto &component = entries_[type];
  if (component.empty()) return false;
  fp->Open(&component[0], component.size());
  return true;
}
bool TessdataManager::CombineDataFiles(
const char *language_data_path_prefix,
const char *output_filename) {
int i;
inT64 offset_table[TESSDATA_NUM_ENTRIES];
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1;
FILE *output_file = fopen(output_filename, "wb");
if (output_file == NULL) {
tprintf("Error opening %s for writing\n", output_filename);
return false;
}
// Leave some space for recording the offset_table.
if (fseek(output_file,
sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) {
tprintf("Error seeking %s\n", output_filename);
fclose(output_file);
return false;
}
TessdataType type = TESSDATA_NUM_ENTRIES;
bool text_file = false;
FILE *file_ptr[TESSDATA_NUM_ENTRIES];
// Load individual tessdata components from files.
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
ASSERT_HOST(TessdataTypeFromFileSuffix(
kTessdataFileSuffixes[i], &type, &text_file));
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
TessdataType type;
ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type));
STRING filename = language_data_path_prefix;
filename += kTessdataFileSuffixes[i];
file_ptr[i] = fopen(filename.string(), "rb");
if (file_ptr[i] != NULL) {
offset_table[type] = ftell(output_file);
CopyFile(file_ptr[i], output_file, text_file, -1);
fclose(file_ptr[i]);
FILE *fp = fopen(filename.string(), "rb");
if (fp != nullptr) {
fclose(fp);
if (!LoadDataFromFile(filename, &entries_[type])) {
tprintf("Load of file %s failed!\n", filename.string());
return false;
}
}
}
is_loaded_ = true;
// Make sure that the required components are present.
if (!IncludesBaseComponents(offset_table) &&
!IncludesLSTMComponents(offset_table)) {
if (!IsBaseAvailable() && !IsLSTMAvailable()) {
tprintf(
"Error: traineddata file must contain at least (a unicharset file"
"and inttemp) OR an lstm file.\n");
fclose(output_file);
return false;
}
return WriteMetadata(offset_table, language_data_path_prefix, output_file);
// Write updated data to the output traineddata file.
return SaveFile(output_filename, nullptr);
}
bool TessdataManager::OverwriteComponents(
const char *new_traineddata_filename,
char **component_filenames,
int num_new_components) {
int i;
inT64 offset_table[TESSDATA_NUM_ENTRIES];
TessdataType type = TESSDATA_NUM_ENTRIES;
bool text_file = false;
FILE *file_ptr[TESSDATA_NUM_ENTRIES];
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
offset_table[i] = -1;
file_ptr[i] = NULL;
}
FILE *output_file = fopen(new_traineddata_filename, "wb");
if (output_file == NULL) {
tprintf("Error opening %s for writing\n", new_traineddata_filename);
return false;
}
// Leave some space for recording the offset_table.
if (fseek(output_file,
sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) {
fclose(output_file);
tprintf("Error seeking %s\n", new_traineddata_filename);
return false;
}
// Open the files with the new components.
for (i = 0; i < num_new_components; ++i) {
if (TessdataTypeFromFileName(component_filenames[i], &type, &text_file))
file_ptr[type] = fopen(component_filenames[i], "rb");
}
// Write updated data to the output traineddata file.
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
if (file_ptr[i] != NULL) {
// Get the data from the opened component file.
offset_table[i] = ftell(output_file);
CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1);
fclose(file_ptr[i]);
} else {
// Get this data component from the loaded data file.
if (SeekToStart(static_cast<TessdataType>(i))) {
offset_table[i] = ftell(output_file);
CopyFile(data_file_, output_file, kTessdataFileIsText[i],
GetEndOffset(static_cast<TessdataType>(i)) -
ftell(data_file_) + 1);
for (int i = 0; i < num_new_components; ++i) {
TessdataType type;
if (TessdataTypeFromFileName(component_filenames[i], &type)) {
if (!LoadDataFromFile(component_filenames[i], &entries_[type])) {
tprintf("Failed to read component file:%s\n", component_filenames[i]);
return false;
}
}
}
const char *language_data_path_prefix = strchr(new_traineddata_filename, '.');
return WriteMetadata(offset_table, language_data_path_prefix, output_file);
// Write updated data to the output traineddata file.
return SaveFile(new_traineddata_filename, nullptr);
}
bool TessdataManager::TessdataTypeFromFileSuffix(
const char *suffix, TessdataType *type, bool *text_file) {
bool TessdataManager::ExtractToFile(const char *filename) {
TessdataType type = TESSDATA_NUM_ENTRIES;
ASSERT_HOST(
tesseract::TessdataManager::TessdataTypeFromFileName(filename, &type));
if (entries_[type].empty()) return false;
return SaveDataToFile(entries_[type], filename);
}
bool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix,
TessdataType *type) {
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
*type = static_cast<TessdataType>(i);
*text_file = kTessdataFileIsText[i];
return true;
}
}
@ -241,46 +227,12 @@ bool TessdataManager::TessdataTypeFromFileSuffix(
return false;
}
bool TessdataManager::TessdataTypeFromFileName(
const char *filename, TessdataType *type, bool *text_file) {
bool TessdataManager::TessdataTypeFromFileName(const char *filename,
TessdataType *type) {
// Get the file suffix (extension)
const char *suffix = strrchr(filename, '.');
if (suffix == NULL || *(++suffix) == '\0') return false;
return TessdataTypeFromFileSuffix(suffix, type, text_file);
}
// Returns true if the base Tesseract components are present.
/* static */
bool TessdataManager::IncludesBaseComponents(const inT64 *offset_table) {
return offset_table[TESSDATA_UNICHARSET] >= 0 &&
offset_table[TESSDATA_INTTEMP] >= 0;
}
// Returns true if the LSTM components are present.
/* static */
bool TessdataManager::IncludesLSTMComponents(const inT64 *offset_table) {
return offset_table[TESSDATA_LSTM] >= 0;
}
bool TessdataManager::ExtractToFile(const char *filename) {
TessdataType type = TESSDATA_NUM_ENTRIES;
bool text_file = false;
ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName(
filename, &type, &text_file));
if (!SeekToStart(type)) return false;
FILE *output_file = fopen(filename, "wb");
if (output_file == NULL) {
tprintf("Error opening %s\n", filename);
exit(1);
}
inT64 begin_offset = ftell(GetDataFilePtr());
inT64 end_offset = GetEndOffset(type);
tesseract::TessdataManager::CopyFile(
GetDataFilePtr(), output_file, text_file,
end_offset - begin_offset + 1);
fclose(output_file);
return true;
if (suffix == nullptr || *(++suffix) == '\0') return false;
return TessdataTypeFromFileSuffix(suffix, type);
}
} // namespace tesseract

View File

@ -108,34 +108,6 @@ static const char *const kTessdataFileSuffixes[] = {
kLSTMNumberDawgFileSuffix, // 20
};
/**
* If kTessdataFileIsText[i] is true - the tessdata component
* of type i (from TessdataType enum) is text, and is binary otherwise.
*/
static const bool kTessdataFileIsText[] = {
true, // 0
true, // 1
true, // 2
false, // 3
true, // 4
true, // 5
false, // 6
false, // 7
false, // 8
false, // 9
false, // 10 // deprecated
true, // 11 // deprecated
false, // 12 // deprecated
false, // 13
false, // 14
false, // 15
true, // 16
false, // 17
false, // 18
false, // 19
false, // 20
};
/**
* TessdataType could be updated to contain more entries, however
* we do not expect that number to be astronomically high.
@ -148,93 +120,61 @@ static const int kMaxNumTessdataEntries = 1000;
class TessdataManager {
public:
TessdataManager() {
data_file_ = NULL;
actual_tessdata_num_entries_ = 0;
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
offset_table_[i] = -1;
}
}
TessdataManager() : reader_(nullptr), is_loaded_(false), swap_(false) {}
explicit TessdataManager(FileReader reader)
: reader_(reader), is_loaded_(false), swap_(false) {}
~TessdataManager() {}
int DebugLevel() { return debug_level_; }
bool swap() const { return swap_; }
bool is_loaded() const { return is_loaded_; }
// Lazily loads from the given filename. Won't actually read the file
// until it needs it.
void LoadFileLater(const char *data_file_name);
/**
* Opens the given data file and reads the offset table.
* Opens and reads the given data file right now.
* @return true on success.
*/
bool Init(const char *data_file_name, int debug_level);
bool Init(const char *data_file_name);
// Loads from the given memory buffer as if a file, remembering name as some
// arbitrary source id for caching.
bool LoadMemBuffer(const char *name, const char *data, int size);
// Overwrites a single entry of the given type.
void OverwriteEntry(TessdataType type, const char *data, int size);
// Saves to the given filename.
bool SaveFile(const STRING &filename, FileWriter writer) const;
// Serializes to the given vector.
void Serialize(GenericVector<char> *data) const;
// Resets to the initial state, keeping the reader.
void Clear();
// Prints a directory of contents.
void Directory() const;
// Opens the given TFile pointer to the given component type.
// Returns false in case of failure.
bool GetComponent(TessdataType type, TFile *fp);
// Returns true if the base Tesseract components are present.
bool IsBaseAvailable() const { return IncludesBaseComponents(offset_table_); }
bool IsBaseAvailable() const {
return !entries_[TESSDATA_UNICHARSET].empty() &&
!entries_[TESSDATA_INTTEMP].empty();
}
// Returns true if the LSTM components are present.
bool IsLSTMAvailable() const { return IncludesLSTMComponents(offset_table_); }
bool IsLSTMAvailable() const { return !entries_[TESSDATA_LSTM].empty(); }
// Return the name of the underlying data file.
const STRING &GetDataFileName() const { return data_file_name_; }
/** Returns data file pointer. */
inline FILE *GetDataFilePtr() const { return data_file_; }
/**
* Returns false if there is no data of the given type.
* Otherwise does a seek on the data_file_ to position the pointer
* at the start of the data of the given type.
*/
inline bool SeekToStart(TessdataType tessdata_type) {
if (debug_level_) {
tprintf("TessdataManager: seek to offset %lld - start of tessdata"
"type %d (%s))\n", offset_table_[tessdata_type],
tessdata_type, kTessdataFileSuffixes[tessdata_type]);
}
if (offset_table_[tessdata_type] < 0) {
return false;
} else {
ASSERT_HOST(fseek(data_file_,
static_cast<size_t>(offset_table_[tessdata_type]),
SEEK_SET) == 0);
return true;
}
}
/** Returns the end offset for the given tesseract data file type. */
inline inT64 GetEndOffset(TessdataType tessdata_type) const {
int index = tessdata_type + 1;
while (index < actual_tessdata_num_entries_ && offset_table_[index] == -1) {
++index; // skip tessdata types not present in the combined file
}
if (debug_level_) {
tprintf("TessdataManager: end offset for type %d is %lld\n",
tessdata_type,
(index == actual_tessdata_num_entries_) ? -1
: offset_table_[index]);
}
return (index == actual_tessdata_num_entries_) ? -1 : offset_table_[index] - 1;
}
/** Closes data_file_ (if it was opened by Init()). */
inline void End() {
if (data_file_ != NULL) {
fclose(data_file_);
data_file_ = NULL;
}
}
bool swap() const {
return swap_;
}
/** Writes the number of entries and the given offset table to output_file.
* Returns false on error.
*/
static bool WriteMetadata(inT64 *offset_table,
const char *language_data_path_prefix,
FILE *output_file);
/**
* Reads all the standard tesseract config and data files for a language
* at the given path and bundles them up into one binary data file.
* Returns true if the combined traineddata file was successfully written.
*/
static bool CombineDataFiles(const char *language_data_path_prefix,
const char *output_filename);
bool CombineDataFiles(const char *language_data_path_prefix,
const char *output_filename);
/**
* Gets the individual components from the data_file_ with which the class was
@ -257,69 +197,35 @@ class TessdataManager {
*/
bool ExtractToFile(const char *filename);
/**
* Copies data from the given input file to the output_file provided.
* If num_bytes_to_copy is >= 0, only num_bytes_to_copy is copied from
* the input file, otherwise all the data in the input file is copied.
*/
static void CopyFile(FILE *input_file, FILE *output_file,
bool newline_end, inT64 num_bytes_to_copy);
/**
* Fills type with TessdataType of the tessdata component represented by the
* given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET.
* Sets *text_file to true if the component is in text format (e.g.
* unicharset, unichar ambigs, config, etc).
* @return true if the tessdata component type could be determined
* from the given file name.
*/
static bool TessdataTypeFromFileSuffix(const char *suffix,
TessdataType *type,
bool *text_file);
TessdataType *type);
/**
* Tries to determine tessdata component file suffix from filename,
* returns true on success.
*/
static bool TessdataTypeFromFileName(const char *filename,
TessdataType *type,
bool *text_file);
TessdataType *type);
private:
// Returns true if the base Tesseract components are present.
static bool IncludesBaseComponents(const inT64 *offset_table);
// Returns true if the LSTM components are present.
static bool IncludesLSTMComponents(const inT64 *offset_table);
/**
* Opens the file whose name is a concatenation of language_data_path_prefix
* and file_suffix. Returns a file pointer to the opened file.
*/
static FILE *GetFilePtr(const char *language_data_path_prefix,
const char *file_suffix, bool text_file);
/**
* Each offset_table_[i] contains a file offset in the combined data file
* where the data of TessdataFileType i is stored.
*/
inT64 offset_table_[TESSDATA_NUM_ENTRIES];
/**
* Actual number of entries in the tessdata table. This value can only be
* same or smaller than TESSDATA_NUM_ENTRIES, but can never be larger,
* since then it would be impossible to interpret the type of tessdata at
* indices same and higher than TESSDATA_NUM_ENTRIES.
* This parameter is used to allow for backward compatibility
* when new tessdata types are introduced.
*/
inT32 actual_tessdata_num_entries_;
STRING data_file_name_; // name of the data file.
FILE *data_file_; ///< pointer to the data file.
int debug_level_;
// Name of file it came from.
STRING data_file_name_;
// Function to load the file when we need it.
FileReader reader_;
// True if the file has been loaded.
bool is_loaded_;
// True if the bytes need swapping.
bool swap_;
// Contents of each element of the traineddata file.
GenericVector<char> entries_[TESSDATA_NUM_ENTRIES];
};
} // namespace tesseract
#endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_

View File

@ -87,7 +87,9 @@ class UnicityTable {
/// Returns false on read/write error.
bool write(FILE* f, TessResultCallback2<bool, FILE*, T const &>* cb) const;
/// swap is used to switch the endianness.
bool read(FILE* f, TessResultCallback3<bool, FILE*, T*, bool>* cb, bool swap);
bool read(tesseract::TFile* f,
TessResultCallback3<bool, tesseract::TFile*, T*, bool>* cb,
bool swap);
private:
GenericVector<T> table_;
@ -194,7 +196,8 @@ bool UnicityTable<T>::write(
template <typename T>
bool UnicityTable<T>::read(
FILE* f, TessResultCallback3<bool, FILE*, T*, bool>* cb, bool swap) {
tesseract::TFile* f,
TessResultCallback3<bool, tesseract::TFile*, T*, bool>* cb, bool swap) {
return table_.read(f, cb, swap);
}

View File

@ -30,6 +30,8 @@
#endif
#include <stdio.h>
using tesseract::TFile;
/*----------------------------------------------------------------------------
Public Code
----------------------------------------------------------------------------*/
@ -310,7 +312,7 @@ void Classify::PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates) {
* @note Exceptions: none
* @note History: Tue Mar 19 14:11:01 1991, DSJ, Created.
*/
ADAPT_CLASS ReadAdaptedClass(FILE *File) {
ADAPT_CLASS ReadAdaptedClass(TFile *fp) {
int NumTempProtos;
int NumConfigs;
int i;
@ -319,34 +321,34 @@ ADAPT_CLASS ReadAdaptedClass(FILE *File) {
/* first read high level adapted class structure */
Class = (ADAPT_CLASS) Emalloc (sizeof (ADAPT_CLASS_STRUCT));
fread ((char *) Class, sizeof (ADAPT_CLASS_STRUCT), 1, File);
fp->FRead(Class, sizeof(ADAPT_CLASS_STRUCT), 1);
/* then read in the definitions of the permanent protos and configs */
Class->PermProtos = NewBitVector (MAX_NUM_PROTOS);
Class->PermConfigs = NewBitVector (MAX_NUM_CONFIGS);
fread ((char *) Class->PermProtos, sizeof (uinT32),
WordsInVectorOfSize (MAX_NUM_PROTOS), File);
fread ((char *) Class->PermConfigs, sizeof (uinT32),
WordsInVectorOfSize (MAX_NUM_CONFIGS), File);
fp->FRead(Class->PermProtos, sizeof(uinT32),
WordsInVectorOfSize(MAX_NUM_PROTOS));
fp->FRead(Class->PermConfigs, sizeof(uinT32),
WordsInVectorOfSize(MAX_NUM_CONFIGS));
/* then read in the list of temporary protos */
fread ((char *) &NumTempProtos, sizeof (int), 1, File);
fp->FRead(&NumTempProtos, sizeof(int), 1);
Class->TempProtos = NIL_LIST;
for (i = 0; i < NumTempProtos; i++) {
TempProto =
(TEMP_PROTO) alloc_struct (sizeof (TEMP_PROTO_STRUCT),
"TEMP_PROTO_STRUCT");
fread ((char *) TempProto, sizeof (TEMP_PROTO_STRUCT), 1, File);
fp->FRead(TempProto, sizeof(TEMP_PROTO_STRUCT), 1);
Class->TempProtos = push_last (Class->TempProtos, TempProto);
}
/* then read in the adapted configs */
fread ((char *) &NumConfigs, sizeof (int), 1, File);
fp->FRead(&NumConfigs, sizeof(int), 1);
for (i = 0; i < NumConfigs; i++)
if (test_bit (Class->PermConfigs, i))
Class->Config[i].Perm = ReadPermConfig (File);
Class->Config[i].Perm = ReadPermConfig(fp);
else
Class->Config[i].Temp = ReadTempConfig (File);
Class->Config[i].Temp = ReadTempConfig(fp);
return (Class);
@ -366,20 +368,20 @@ namespace tesseract {
* @note Exceptions: none
* @note History: Mon Mar 18 15:18:10 1991, DSJ, Created.
*/
ADAPT_TEMPLATES Classify::ReadAdaptedTemplates(FILE *File) {
ADAPT_TEMPLATES Classify::ReadAdaptedTemplates(TFile *fp) {
int i;
ADAPT_TEMPLATES Templates;
/* first read the high level adaptive template struct */
Templates = (ADAPT_TEMPLATES) Emalloc (sizeof (ADAPT_TEMPLATES_STRUCT));
fread ((char *) Templates, sizeof (ADAPT_TEMPLATES_STRUCT), 1, File);
fp->FRead(Templates, sizeof(ADAPT_TEMPLATES_STRUCT), 1);
/* then read in the basic integer templates */
Templates->Templates = ReadIntTemplates (File);
Templates->Templates = ReadIntTemplates(false, fp);
/* then read in the adaptive info for each class */
for (i = 0; i < (Templates->Templates)->NumClasses; i++) {
Templates->Class[i] = ReadAdaptedClass (File);
Templates->Class[i] = ReadAdaptedClass(fp);
}
return (Templates);
@ -399,15 +401,15 @@ ADAPT_TEMPLATES Classify::ReadAdaptedTemplates(FILE *File) {
* @note Exceptions: none
* @note History: Tue Mar 19 14:25:26 1991, DSJ, Created.
*/
PERM_CONFIG ReadPermConfig(FILE *File) {
PERM_CONFIG ReadPermConfig(TFile *fp) {
PERM_CONFIG Config = (PERM_CONFIG) alloc_struct(sizeof(PERM_CONFIG_STRUCT),
"PERM_CONFIG_STRUCT");
uinT8 NumAmbigs;
fread ((char *) &NumAmbigs, sizeof(uinT8), 1, File);
fp->FRead(&NumAmbigs, sizeof(uinT8), 1);
Config->Ambigs = new UNICHAR_ID[NumAmbigs + 1];
fread(Config->Ambigs, sizeof(UNICHAR_ID), NumAmbigs, File);
fp->FRead(Config->Ambigs, sizeof(UNICHAR_ID), NumAmbigs);
Config->Ambigs[NumAmbigs] = -1;
fread(&(Config->FontinfoId), sizeof(int), 1, File);
fp->FRead(&(Config->FontinfoId), sizeof(int), 1);
return (Config);
@ -426,17 +428,16 @@ PERM_CONFIG ReadPermConfig(FILE *File) {
* @note Exceptions: none
* @note History: Tue Mar 19 14:29:59 1991, DSJ, Created.
*/
TEMP_CONFIG ReadTempConfig(FILE *File) {
TEMP_CONFIG ReadTempConfig(TFile *fp) {
TEMP_CONFIG Config;
Config =
(TEMP_CONFIG) alloc_struct (sizeof (TEMP_CONFIG_STRUCT),
"TEMP_CONFIG_STRUCT");
fread ((char *) Config, sizeof (TEMP_CONFIG_STRUCT), 1, File);
fp->FRead(Config, sizeof(TEMP_CONFIG_STRUCT), 1);
Config->Protos = NewBitVector (Config->ProtoVectorSize * BITSINLONG);
fread ((char *) Config->Protos, sizeof (uinT32),
Config->ProtoVectorSize, File);
fp->FRead(Config->Protos, sizeof(uinT32), Config->ProtoVectorSize);
return (Config);

View File

@ -126,11 +126,11 @@ TEMP_CONFIG NewTempConfig(int MaxProtoId, int FontinfoId);
TEMP_PROTO NewTempProto();
ADAPT_CLASS ReadAdaptedClass(FILE *File);
ADAPT_CLASS ReadAdaptedClass(tesseract::TFile *File);
PERM_CONFIG ReadPermConfig(FILE *File);
PERM_CONFIG ReadPermConfig(tesseract::TFile *File);
TEMP_CONFIG ReadTempConfig(FILE *File);
TEMP_CONFIG ReadTempConfig(tesseract::TFile *File);
void WriteAdaptedClass(FILE *File, ADAPT_CLASS Class, int NumConfigs);

View File

@ -524,7 +524,7 @@ void Classify::EndAdaptiveClassifier() {
* enables use of pre-adapted templates
* @note History: Mon Mar 11 12:49:34 1991, DSJ, Created.
*/
void Classify::InitAdaptiveClassifier(bool load_pre_trained_templates) {
void Classify::InitAdaptiveClassifier(TessdataManager* mgr) {
if (!classify_enable_adaptive_matcher)
return;
if (AllProtosOn != NULL)
@ -532,37 +532,25 @@ void Classify::InitAdaptiveClassifier(bool load_pre_trained_templates) {
// If there is no language_data_path_prefix, the classifier will be
// adaptive only.
if (language_data_path_prefix.length() > 0 &&
load_pre_trained_templates) {
ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_INTTEMP));
PreTrainedTemplates =
ReadIntTemplates(tessdata_manager.GetDataFilePtr());
if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded inttemp\n");
if (language_data_path_prefix.length() > 0 && mgr != nullptr) {
TFile fp;
ASSERT_HOST(mgr->GetComponent(TESSDATA_INTTEMP, &fp));
PreTrainedTemplates = ReadIntTemplates(mgr->swap(), &fp);
if (tessdata_manager.SeekToStart(TESSDATA_SHAPE_TABLE)) {
if (mgr->GetComponent(TESSDATA_SHAPE_TABLE, &fp)) {
shape_table_ = new ShapeTable(unicharset);
if (!shape_table_->DeSerialize(tessdata_manager.swap(),
tessdata_manager.GetDataFilePtr())) {
if (!shape_table_->DeSerialize(mgr->swap(), &fp)) {
tprintf("Error loading shape table!\n");
delete shape_table_;
shape_table_ = NULL;
} else if (tessdata_manager.DebugLevel() > 0) {
tprintf("Successfully loaded shape table!\n");
}
}
ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_PFFMTABLE));
ReadNewCutoffs(tessdata_manager.GetDataFilePtr(),
tessdata_manager.swap(),
tessdata_manager.GetEndOffset(TESSDATA_PFFMTABLE),
CharNormCutoffs);
if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded pffmtable\n");
ASSERT_HOST(mgr->GetComponent(TESSDATA_PFFMTABLE, &fp));
ReadNewCutoffs(&fp, mgr->swap(), CharNormCutoffs);
ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_NORMPROTO));
NormProtos =
ReadNormProtos(tessdata_manager.GetDataFilePtr(),
tessdata_manager.GetEndOffset(TESSDATA_NORMPROTO));
if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded normproto\n");
ASSERT_HOST(mgr->GetComponent(TESSDATA_NORMPROTO, &fp));
NormProtos = ReadNormProtos(&fp);
static_classifier_ = new TessClassifier(false, this);
}
@ -582,21 +570,19 @@ void Classify::InitAdaptiveClassifier(bool load_pre_trained_templates) {
}
if (classify_use_pre_adapted_templates) {
FILE *File;
TFile fp;
STRING Filename;
Filename = imagefile;
Filename += ADAPT_TEMPLATE_SUFFIX;
File = fopen(Filename.string(), "rb");
if (File == NULL) {
if (!fp.Open(Filename.string(), nullptr)) {
AdaptedTemplates = NewAdaptedTemplates(true);
} else {
cprintf("\nReading pre-adapted templates from %s ...\n",
Filename.string());
fflush(stdout);
AdaptedTemplates = ReadAdaptedTemplates(File);
AdaptedTemplates = ReadAdaptedTemplates(&fp);
cprintf("\n");
fclose(File);
PrintAdaptedTemplates(stdout, AdaptedTemplates);
for (int i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) {

View File

@ -103,16 +103,15 @@ class Classify : public CCStruct {
const uinT8* normalization_factors,
const uinT16* expected_num_features,
GenericVector<CP_RESULT_STRUCT>* results);
void ReadNewCutoffs(FILE *CutoffFile, bool swap, inT64 end_offset,
CLASS_CUTOFF_ARRAY Cutoffs);
void ReadNewCutoffs(TFile* fp, bool swap, CLASS_CUTOFF_ARRAY Cutoffs);
void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
ADAPT_TEMPLATES ReadAdaptedTemplates(FILE *File);
ADAPT_TEMPLATES ReadAdaptedTemplates(TFile* File);
/* normmatch.cpp ************************************************************/
FLOAT32 ComputeNormMatch(CLASS_ID ClassId,
const FEATURE_STRUCT& feature, BOOL8 DebugMatch);
void FreeNormProtos();
NORM_PROTOS *ReadNormProtos(FILE *File, inT64 end_offset);
NORM_PROTOS* ReadNormProtos(TFile* fp);
/* protos.cpp ***************************************************************/
void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class);
INT_TEMPLATES CreateIntTemplates(CLASSES FloatProtos,
@ -138,7 +137,7 @@ class Classify : public CCStruct {
void LearnPieces(const char* fontname, int start, int length, float threshold,
CharSegmentationType segmentation, const char* correct_text,
WERD_RES* word);
void InitAdaptiveClassifier(bool load_pre_trained_templates);
void InitAdaptiveClassifier(TessdataManager* mgr);
void InitAdaptedClass(TBLOB *Blob,
CLASS_ID ClassId,
int FontinfoId,
@ -335,7 +334,7 @@ class Classify : public CCStruct {
uinT8* char_norm_array);
void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures);
/* intproto.cpp *************************************************************/
INT_TEMPLATES ReadIntTemplates(FILE *File);
INT_TEMPLATES ReadIntTemplates(bool swap, TFile* fp);
void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
const UNICHARSET& target_unicharset);
CLASS_ID GetClassToDebug(const char *Prompt, bool* adaptive_on,

View File

@ -25,8 +25,11 @@
#include <stdio.h>
#include <math.h>
using tesseract::TFile;
//---------------Global Data Definitions and Declarations--------------------
#define TOKENSIZE 80 //< max size of tokens read from an input file
#define QUOTED_TOKENSIZE "79"
#define MAXSAMPLESIZE 65535 //< max num of dimensions in feature space
//#define MAXBLOCKSIZE 65535 //< max num of samples in a character (block
// size)
@ -41,11 +44,14 @@
* @note Exceptions: ILLEGALSAMPLESIZE illegal format or range
* @note History: 6/6/89, DSJ, Created.
*/
// Reads the sample size (one decimal integer) from the input and returns it.
// DoError(ILLEGALSAMPLESIZE, ...) is invoked when the value cannot be read or
// parsed, is negative, or is greater than MAXSAMPLESIZE.
// NOTE(review): two revisions of this function are interleaved in this span:
// the FILE*/tfscanf lines and the TFile*/FGets+sscanf lines that replace them.
uinT16 ReadSampleSize(FILE *File) {
int SampleSize;
uinT16 ReadSampleSize(TFile *fp) {
// Initialised to 0 in this revision; the FILE* revision left it unset.
int SampleSize = 0;
if ((tfscanf(File, "%d", &SampleSize) != 1) ||
(SampleSize < 0) || (SampleSize > MAXSAMPLESIZE))
// TFile revision: fetch a single line (buffer of kMaxLineSize bytes) and parse
// the integer out of that buffer instead of scanning the stream directly.
const int kMaxLineSize = 100;
char line[kMaxLineSize];
if (fp->FGets(line, kMaxLineSize) == nullptr ||
sscanf(line, "%d", &SampleSize) != 1 || (SampleSize < 0) ||
(SampleSize > MAXSAMPLESIZE))
DoError (ILLEGALSAMPLESIZE, "Illegal sample size");
// The int is returned through the uinT16 return type.
return (SampleSize);
}
@ -64,30 +70,28 @@ uinT16 ReadSampleSize(FILE *File) {
* @note Globals: None
* @note History: 6/6/89, DSJ, Created.
*/
// Allocates an array of N PARAM_DESC entries with Emalloc and fills each one
// from the input: a circular/linear token, an essential/non-essential token,
// and the Min and Max floats, from which Range, HalfRange and MidRange are
// derived.
// NOTE(review): two revisions are interleaved in this span (FILE*/tfscanf and
// TFile*/FGets+sscanf), and the end of the function lies outside this hunk.
PARAM_DESC *ReadParamDesc(FILE *File, uinT16 N) {
int i;
PARAM_DESC *ReadParamDesc(TFile *fp, uinT16 N) {
PARAM_DESC *ParamDesc;
char Token[TOKENSIZE];
char linear_token[TOKENSIZE], essential_token[TOKENSIZE];
ParamDesc = (PARAM_DESC *) Emalloc (N * sizeof (PARAM_DESC));
for (i = 0; i < N; i++) {
if (tfscanf(File, "%s", Token) != 1)
DoError (ILLEGALCIRCULARSPEC,
"Illegal circular/linear specification");
if (Token[0] == 'c')
for (int i = 0; i < N; i++) {
// TFile revision: one line per parameter, holding both tokens and both floats.
// The %s conversions are width-limited by QUOTED_TOKENSIZE.
const int kMaxLineSize = TOKENSIZE * 4;
char line[kMaxLineSize];
if (fp->FGets(line, kMaxLineSize) == nullptr ||
sscanf(line, "%" QUOTED_TOKENSIZE "s %" QUOTED_TOKENSIZE "s %f %f",
linear_token, essential_token, &ParamDesc[i].Min,
&ParamDesc[i].Max) != 4)
DoError(ILLEGALCIRCULARSPEC, "Illegal Parameter specification");
// A first token starting with 'c' marks the dimension as circular.
if (linear_token[0] == 'c')
ParamDesc[i].Circular = TRUE;
else
ParamDesc[i].Circular = FALSE;
if (tfscanf(File, "%s", Token) != 1)
DoError (ILLEGALESSENTIALSPEC,
"Illegal essential/non-essential spec");
if (Token[0] == 'e')
// NOTE(review): the FILE* revision tested the second token here, but this
// line tests linear_token again, and essential_token is parsed above without
// being read anywhere in this span. This looks like it should be
// essential_token[0] == 'e' -- confirm and fix.
if (linear_token[0] == 'e')
ParamDesc[i].NonEssential = FALSE;
else
ParamDesc[i].NonEssential = TRUE;
if (tfscanf(File, "%f%f", &(ParamDesc[i].Min), &(ParamDesc[i].Max)) != 2)
DoError (ILLEGALMINMAXSPEC, "Illegal min or max specification");
// Derived values computed from Min and Max.
ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
ParamDesc[i].HalfRange = ParamDesc[i].Range / 2;
ParamDesc[i].MidRange = (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
@ -111,123 +115,68 @@ PARAM_DESC *ReadParamDesc(FILE *File, uinT16 N) {
* @note Globals: None
* @note History: 6/6/89, DSJ, Created.
*/
// Reads one prototype description with N dimensions from the input and
// returns a PROTOTYPE allocated with Emalloc.
// The header supplies a significance token, a style token and a sample count;
// it is followed by the mean (N floats) and the variance (1 float for
// spherical, N floats for elliptical). Magnitude, Weight, TotalMagnitude and
// LogMagnitude are computed from the variance.
// NOTE(review): two revisions are interleaved in this span. The FILE* revision
// scans token by token and also handles the `mixed` style; the TFile* revision
// parses a header line with sscanf, returns nullptr on a bad header, and
// handles only spherical and elliptical.
PROTOTYPE *ReadPrototype(FILE *File, uinT16 N) {
char Token[TOKENSIZE];
int Status;
PROTOTYPE *ReadPrototype(TFile *fp, uinT16 N) {
char sig_token[TOKENSIZE], shape_token[TOKENSIZE];
PROTOTYPE *Proto;
int SampleCount;
int i;
if ((Status = tfscanf(File, "%s", Token)) == 1) {
Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE));
Proto->Cluster = NULL;
if (Token[0] == 's')
Proto->Significant = TRUE;
else
Proto->Significant = FALSE;
Proto->Style = ReadProtoStyle (File);
if ((tfscanf(File, "%d", &SampleCount) != 1) || (SampleCount < 0))
DoError (ILLEGALSAMPLECOUNT, "Illegal sample count");
Proto->NumSamples = SampleCount;
Proto->Mean = ReadNFloats (File, N, NULL);
if (Proto->Mean == NULL)
DoError (ILLEGALMEANSPEC, "Illegal prototype mean");
switch (Proto->Style) {
case spherical:
if (ReadNFloats (File, 1, &(Proto->Variance.Spherical)) == NULL)
DoError (ILLEGALVARIANCESPEC, "Illegal prototype variance");
Proto->Magnitude.Spherical =
1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical));
Proto->TotalMagnitude =
pow (Proto->Magnitude.Spherical, (float) N);
Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
Proto->Distrib = NULL;
break;
case elliptical:
Proto->Variance.Elliptical = ReadNFloats (File, N, NULL);
if (Proto->Variance.Elliptical == NULL)
DoError (ILLEGALVARIANCESPEC, "Illegal prototype variance");
Proto->Magnitude.Elliptical =
(FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
Proto->Weight.Elliptical =
(FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
Proto->TotalMagnitude = 1.0;
for (i = 0; i < N; i++) {
Proto->Magnitude.Elliptical[i] =
1.0 /
sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
Proto->Weight.Elliptical[i] =
1.0 / Proto->Variance.Elliptical[i];
Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
}
Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
Proto->Distrib = NULL;
break;
case mixed:
// FILE* revision only: one distribution token per dimension, selected by its
// first character ('n', 'u' or 'r').
Proto->Distrib =
(DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION));
for (i = 0; i < N; i++) {
if (tfscanf(File, "%s", Token) != 1)
DoError (ILLEGALDISTRIBUTION,
"Illegal prototype distribution");
switch (Token[0]) {
case 'n':
Proto->Distrib[i] = normal;
break;
case 'u':
Proto->Distrib[i] = uniform;
break;
case 'r':
Proto->Distrib[i] = D_random;
break;
default:
DoError (ILLEGALDISTRIBUTION,
"Illegal prototype distribution");
}
}
Proto->Variance.Elliptical = ReadNFloats (File, N, NULL);
if (Proto->Variance.Elliptical == NULL)
DoError (ILLEGALVARIANCESPEC, "Illegal prototype variance");
Proto->Magnitude.Elliptical =
(FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
Proto->Weight.Elliptical =
(FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
Proto->TotalMagnitude = 1.0;
for (i = 0; i < N; i++) {
switch (Proto->Distrib[i]) {
case normal:
Proto->Magnitude.Elliptical[i] = 1.0 /
sqrt ((double)
(2.0 * PI * Proto->Variance.Elliptical[i]));
Proto->Weight.Elliptical[i] =
1.0 / Proto->Variance.Elliptical[i];
break;
case uniform:
case D_random:
Proto->Magnitude.Elliptical[i] = 1.0 /
(2.0 * Proto->Variance.Elliptical[i]);
break;
case DISTRIBUTION_COUNT:
ASSERT_HOST(!"Distribution count not allowed!");
}
Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
}
Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
break;
}
return (Proto);
// TFile revision: the header line carries the significance token, the style
// token and the sample count; both %s conversions are width-limited.
const int kMaxLineSize = TOKENSIZE * 4;
char line[kMaxLineSize];
if (fp->FGets(line, kMaxLineSize) == nullptr ||
sscanf(line, "%" QUOTED_TOKENSIZE "s %" QUOTED_TOKENSIZE "s %d",
sig_token, shape_token, &SampleCount) != 3) {
// NOTE(review): when FGets itself returned nullptr, `line` is printed here
// without having been filled by this call -- confirm that is acceptable.
tprintf("Invalid prototype: %s\n", line);
return nullptr;
}
else if (Status == EOF)
return (NULL);
else {
DoError (ILLEGALSIGNIFICANCESPEC, "Illegal significance specification");
return (NULL);
Proto = (PROTOTYPE *)Emalloc(sizeof(PROTOTYPE));
Proto->Cluster = NULL;
// A significance token starting with 's' marks the prototype as significant.
if (sig_token[0] == 's')
Proto->Significant = TRUE;
else
Proto->Significant = FALSE;
Proto->Style = ReadProtoStyle(shape_token);
if (SampleCount < 0) DoError(ILLEGALSAMPLECOUNT, "Illegal sample count");
Proto->NumSamples = SampleCount;
// Passing NULL makes ReadNFloats allocate the buffer for the N mean values.
Proto->Mean = ReadNFloats(fp, N, NULL);
if (Proto->Mean == NULL) DoError(ILLEGALMEANSPEC, "Illegal prototype mean");
switch (Proto->Style) {
case spherical:
// A single variance value is read straight into the prototype.
if (ReadNFloats(fp, 1, &(Proto->Variance.Spherical)) == NULL)
DoError(ILLEGALVARIANCESPEC, "Illegal prototype variance");
Proto->Magnitude.Spherical =
1.0 / sqrt((double)(2.0 * PI * Proto->Variance.Spherical));
Proto->TotalMagnitude = pow(Proto->Magnitude.Spherical, (float)N);
Proto->LogMagnitude = log((double)Proto->TotalMagnitude);
Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
Proto->Distrib = NULL;
break;
case elliptical:
// One variance per dimension; TotalMagnitude is the product of the
// per-dimension magnitudes.
Proto->Variance.Elliptical = ReadNFloats(fp, N, NULL);
if (Proto->Variance.Elliptical == NULL)
DoError(ILLEGALVARIANCESPEC, "Illegal prototype variance");
Proto->Magnitude.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
Proto->Weight.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
Proto->TotalMagnitude = 1.0;
for (i = 0; i < N; i++) {
Proto->Magnitude.Elliptical[i] =
1.0 / sqrt((double)(2.0 * PI * Proto->Variance.Elliptical[i]));
Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
}
Proto->LogMagnitude = log((double)Proto->TotalMagnitude);
Proto->Distrib = NULL;
break;
default:
// Any style other than spherical or elliptical ends up here; ReadProtoStyle
// can also return `automatic`, which presumably lands in this branch.
// NOTE(review): Proto->Mean was assigned from ReadNFloats(fp, N, NULL) above
// and only Proto itself is released here -- check whether Mean should be
// freed as well.
Efree(Proto);
tprintf("Invalid prototype style\n");
return nullptr;
}
return Proto;
}
/**
@ -239,30 +188,19 @@ PROTOTYPE *ReadPrototype(FILE *File, uinT16 N) {
* @note Exceptions: ILLEGALSTYLESPEC illegal prototype style specification
* @note History: 6/8/89, DSJ, Created.
*/
// Maps a prototype style token to a PROTOSTYLE value using only the token's
// first character: 's' -> spherical, 'e' -> elliptical, 'a' -> automatic.
// NOTE(review): two revisions are interleaved in this span. The FILE* revision
// reads the token itself, also accepts 'm' (mixed) and calls DoError on an
// unknown token. The const char* revision receives an already-parsed token,
// prints a message for an unknown token and returns elliptical.
PROTOSTYLE ReadProtoStyle(FILE *File) {
char Token[TOKENSIZE];
PROTOSTYLE Style;
if (tfscanf(File, "%s", Token) != 1)
DoError (ILLEGALSTYLESPEC, "Illegal prototype style specification");
switch (Token[0]) {
PROTOSTYLE ReadProtoStyle(const char *shape) {
// Only shape[0] is inspected; the rest of the token is ignored.
switch (shape[0]) {
case 's':
Style = spherical;
break;
return spherical;
case 'e':
Style = elliptical;
break;
case 'm':
Style = mixed;
break;
return elliptical;
case 'a':
Style = automatic;
break;
return automatic;
default:
Style = elliptical;
DoError (ILLEGALSTYLESPEC, "Illegal prototype style specification");
break;
}
return (Style);
// Reached in the const char* revision when no case above returned.
tprintf("Invalid prototype style specification:%s\n", shape);
return elliptical;
}
/**
@ -279,28 +217,30 @@ PROTOSTYLE ReadProtoStyle(FILE *File) {
* @note Exceptions: ILLEGALFLOAT
* @note History: 6/6/89, DSJ, Created.
*/
// Reads N floats from the input into Buffer and returns Buffer.
// When Buffer is NULL, an array of N FLOAT32 is allocated with Emalloc and
// returned instead; that array is released again on the failure paths.
// NOTE(review): two revisions are interleaved in this span. The FILE* revision
// scans values with tfscanf and calls DoError(ILLEGALFLOAT, ...) on a bad
// value. The TFile* revision reads a single line and converts it with strtof,
// returning nullptr on failure.
FLOAT32* ReadNFloats(FILE * File, uinT16 N, FLOAT32 Buffer[]) {
FLOAT32 *ReadNFloats(TFile *fp, uinT16 N, FLOAT32 Buffer[]) {
// TFile revision: exactly one line is fetched, so all N values have to be on
// that line and fit in the kMaxLineSize-byte buffer.
const int kMaxLineSize = 1024;
char line[kMaxLineSize];
if (fp->FGets(line, kMaxLineSize) == nullptr) {
tprintf("Hit EOF in ReadNFloats!\n");
return nullptr;
}
// Tracks whether Buffer was allocated here, so error paths free only that.
bool needs_free = false;
int i;
int NumFloatsRead;
if (Buffer == NULL) {
Buffer = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
needs_free = true;
}
for (i = 0; i < N; i++) {
NumFloatsRead = tfscanf(File, "%f", &(Buffer[i]));
if (NumFloatsRead != 1) {
// FILE* revision: EOF before the first value returns NULL; any other short
// read is reported through DoError.
if ((NumFloatsRead == EOF) && (i == 0)) {
if (needs_free) {
Efree(Buffer);
}
return NULL;
} else {
DoError(ILLEGALFLOAT, "Illegal float specification");
}
// TFile revision: walk the line with strtof, resuming each conversion where
// the previous one stopped.
char *startptr = line;
for (int i = 0; i < N; i++) {
char *endptr;
Buffer[i] = strtof(startptr, &endptr);
// endptr == startptr means strtof consumed nothing, i.e. no number was found.
if (endptr == startptr) {
tprintf("Read of %d floats failed!\n", N);
if (needs_free) Efree(Buffer);
return nullptr;
}
startptr = endptr;
}
return Buffer;
}

View File

@ -20,22 +20,23 @@
#define TESSERACT_CLASSIFY_CLUSTTOOL_H_
//--------------------------Include Files---------------------------------------
#include "host.h"
#include "cluster.h"
#include <stdio.h>
#include "cluster.h"
#include "host.h"
#include "serialis.h"
/*-------------------------------------------------------------------------
Public Function Prototype
--------------------------------------------------------------------------*/
uinT16 ReadSampleSize(FILE *File);
uinT16 ReadSampleSize(tesseract::TFile *fp);
PARAM_DESC *ReadParamDesc(FILE *File, uinT16 N);
PARAM_DESC *ReadParamDesc(tesseract::TFile *fp, uinT16 N);
PROTOTYPE *ReadPrototype(FILE *File, uinT16 N);
PROTOTYPE *ReadPrototype(tesseract::TFile *fp, uinT16 N);
PROTOSTYLE ReadProtoStyle(FILE *File);
PROTOSTYLE ReadProtoStyle(const char *style);
FLOAT32 *ReadNFloats (FILE * File, uinT16 N, FLOAT32 Buffer[]);
FLOAT32 *ReadNFloats(tesseract::TFile *fp, uinT16 N, FLOAT32 Buffer[]);
void WriteParamDesc(FILE *File, uinT16 N, const PARAM_DESC ParamDesc[]);

View File

@ -49,7 +49,7 @@ namespace tesseract {
* @note Exceptions: none
* @note History: Wed Feb 20 09:38:26 1991, DSJ, Created.
*/
void Classify::ReadNewCutoffs(FILE *CutoffFile, bool swap, inT64 end_offset,
void Classify::ReadNewCutoffs(TFile* fp, bool swap,
CLASS_CUTOFF_ARRAY Cutoffs) {
char Class[UNICHAR_LEN + 1];
CLASS_ID ClassId;
@ -57,23 +57,24 @@ void Classify::ReadNewCutoffs(FILE *CutoffFile, bool swap, inT64 end_offset,
int i;
if (shape_table_ != NULL) {
if (!shapetable_cutoffs_.DeSerialize(swap, CutoffFile)) {
if (!shapetable_cutoffs_.DeSerialize(swap, fp)) {
tprintf("Error during read of shapetable pffmtable!\n");
}
}
for (i = 0; i < MAX_NUM_CLASSES; i++)
Cutoffs[i] = MAX_CUTOFF;
while ((end_offset < 0 || ftell(CutoffFile) < end_offset) &&
tfscanf(CutoffFile, "%" REALLY_QUOTE_IT(UNICHAR_LEN) "s %d",
Class, &Cutoff) == 2) {
const int kMaxLineSize = 100;
char line[kMaxLineSize];
while (fp->FGets(line, kMaxLineSize) != nullptr &&
sscanf(line, "%" REALLY_QUOTE_IT(UNICHAR_LEN) "s %d", Class,
&Cutoff) == 2) {
if (strcmp(Class, "NULL") == 0) {
ClassId = unicharset.unichar_to_id(" ");
} else {
ClassId = unicharset.unichar_to_id(Class);
}
Cutoffs[ClassId] = Cutoff;
SkipNewline(CutoffFile);
}
}

View File

@ -758,9 +758,8 @@ namespace tesseract {
* @note Exceptions: none
* @note History: Wed Feb 27 11:48:46 1991, DSJ, Created.
*/
INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
INT_TEMPLATES Classify::ReadIntTemplates(bool swap, TFile *fp) {
int i, j, w, x, y, z;
BOOL8 swap;
int nread;
int unicharset_size;
int version_id = 0;
@ -786,29 +785,19 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
/* first read the high level template struct */
Templates = NewIntTemplates();
// Read Templates in parts for 64 bit compatibility.
if (fread(&unicharset_size, sizeof(int), 1, File) != 1)
cprintf("Bad read of inttemp!\n");
if (fread(&Templates->NumClasses,
sizeof(Templates->NumClasses), 1, File) != 1 ||
fread(&Templates->NumClassPruners,
sizeof(Templates->NumClassPruners), 1, File) != 1)
cprintf("Bad read of inttemp!\n");
// Swap status is determined automatically.
swap = Templates->NumClassPruners < 0 ||
Templates->NumClassPruners > MAX_NUM_CLASS_PRUNERS;
if (swap) {
Reverse32(&Templates->NumClassPruners);
Reverse32(&Templates->NumClasses);
Reverse32(&unicharset_size);
}
if (fp->FReadEndian(&unicharset_size, sizeof(unicharset_size), 1, swap) != 1)
tprintf("Bad read of inttemp!\n");
if (fp->FReadEndian(&Templates->NumClasses, sizeof(Templates->NumClasses), 1,
swap) != 1 ||
fp->FReadEndian(&Templates->NumClassPruners,
sizeof(Templates->NumClassPruners), 1, swap) != 1)
tprintf("Bad read of inttemp!\n");
if (Templates->NumClasses < 0) {
// This file has a version id!
version_id = -Templates->NumClasses;
if (fread(&Templates->NumClasses, sizeof(Templates->NumClasses),
1, File) != 1)
cprintf("Bad read of inttemp!\n");
if (swap)
Reverse32(&Templates->NumClasses);
if (fp->FReadEndian(&Templates->NumClasses, sizeof(Templates->NumClasses),
1, swap) != 1)
tprintf("Bad read of inttemp!\n");
}
if (version_id < 3) {
@ -817,39 +806,24 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
}
if (version_id < 2) {
for (i = 0; i < unicharset_size; ++i) {
if (fread(&IndexFor[i], sizeof(inT16), 1, File) != 1)
cprintf("Bad read of inttemp!\n");
if (fp->FReadEndian(IndexFor, sizeof(IndexFor[0]), unicharset_size, swap) !=
unicharset_size) {
tprintf("Bad read of inttemp!\n");
}
for (i = 0; i < Templates->NumClasses; ++i) {
if (fread(&ClassIdFor[i], sizeof(CLASS_ID), 1, File) != 1)
cprintf("Bad read of inttemp!\n");
}
if (swap) {
for (i = 0; i < Templates->NumClasses; i++)
Reverse16(&IndexFor[i]);
for (i = 0; i < Templates->NumClasses; i++)
Reverse32(&ClassIdFor[i]);
if (fp->FReadEndian(ClassIdFor, sizeof(ClassIdFor[0]),
Templates->NumClasses, swap) != Templates->NumClasses) {
tprintf("Bad read of inttemp!\n");
}
}
/* then read in the class pruners */
const int kNumBuckets =
NUM_CP_BUCKETS * NUM_CP_BUCKETS * NUM_CP_BUCKETS * WERDS_PER_CP_VECTOR;
for (i = 0; i < Templates->NumClassPruners; i++) {
Pruner = new CLASS_PRUNER_STRUCT;
if ((nread =
fread(Pruner, 1, sizeof(CLASS_PRUNER_STRUCT),
File)) != sizeof(CLASS_PRUNER_STRUCT))
cprintf("Bad read of inttemp!\n");
if (swap) {
for (x = 0; x < NUM_CP_BUCKETS; x++) {
for (y = 0; y < NUM_CP_BUCKETS; y++) {
for (z = 0; z < NUM_CP_BUCKETS; z++) {
for (w = 0; w < WERDS_PER_CP_VECTOR; w++) {
Reverse32(&Pruner->p[x][y][z][w]);
}
}
}
}
if (fp->FReadEndian(Pruner, sizeof(Pruner->p[0][0][0][0]), kNumBuckets,
swap) != kNumBuckets) {
tprintf("Bad read of inttemp!\n");
}
if (version_id < 2) {
TempClassPruner[i] = Pruner;
@ -914,39 +888,24 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
for (i = 0; i < Templates->NumClasses; i++) {
/* first read in the high level struct for the class */
Class = (INT_CLASS) Emalloc (sizeof (INT_CLASS_STRUCT));
if (fread(&Class->NumProtos, sizeof(Class->NumProtos), 1, File) != 1 ||
fread(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File) != 1 ||
fread(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File) != 1)
cprintf ("Bad read of inttemp!\n");
if (fp->FReadEndian(&Class->NumProtos, sizeof(Class->NumProtos), 1, swap) !=
1 ||
fp->FRead(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1) != 1 ||
fp->FRead(&Class->NumConfigs, sizeof(Class->NumConfigs), 1) != 1)
tprintf("Bad read of inttemp!\n");
if (version_id == 0) {
// Only version 0 writes 5 pointless pointers to the file.
for (j = 0; j < 5; ++j) {
int junk;
if (fread(&junk, sizeof(junk), 1, File) != 1)
cprintf ("Bad read of inttemp!\n");
inT32 junk;
if (fp->FRead(&junk, sizeof(junk), 1) != 1)
tprintf("Bad read of inttemp!\n");
}
}
if (version_id < 4) {
for (j = 0; j < MaxNumConfigs; ++j) {
if (fread(&Class->ConfigLengths[j], sizeof(uinT16), 1, File) != 1)
cprintf ("Bad read of inttemp!\n");
}
if (swap) {
Reverse16(&Class->NumProtos);
for (j = 0; j < MaxNumConfigs; j++)
Reverse16(&Class->ConfigLengths[j]);
}
} else {
ASSERT_HOST(Class->NumConfigs < MaxNumConfigs);
for (j = 0; j < Class->NumConfigs; ++j) {
if (fread(&Class->ConfigLengths[j], sizeof(uinT16), 1, File) != 1)
cprintf ("Bad read of inttemp!\n");
}
if (swap) {
Reverse16(&Class->NumProtos);
for (j = 0; j < MaxNumConfigs; j++)
Reverse16(&Class->ConfigLengths[j]);
}
int num_configs = version_id < 4 ? MaxNumConfigs : Class->NumConfigs;
ASSERT_HOST(num_configs <= MaxNumConfigs);
if (fp->FReadEndian(Class->ConfigLengths, sizeof(uinT16), num_configs,
swap) != num_configs) {
tprintf("Bad read of inttemp!\n");
}
if (version_id < 2) {
ClassForClassId (Templates, ClassIdFor[i]) = Class;
@ -958,59 +917,41 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
Lengths = NULL;
if (MaxNumIntProtosIn (Class) > 0) {
Lengths = (uinT8 *)Emalloc(sizeof(uinT8) * MaxNumIntProtosIn(Class));
if ((nread =
fread((char *)Lengths, sizeof(uinT8),
MaxNumIntProtosIn(Class), File)) != MaxNumIntProtosIn (Class))
cprintf ("Bad read of inttemp!\n");
if (fp->FRead(Lengths, sizeof(uinT8), MaxNumIntProtosIn(Class)) !=
MaxNumIntProtosIn(Class))
tprintf("Bad read of inttemp!\n");
}
Class->ProtoLengths = Lengths;
/* then read in the proto sets */
for (j = 0; j < Class->NumProtoSets; j++) {
ProtoSet = (PROTO_SET)Emalloc(sizeof(PROTO_SET_STRUCT));
if (version_id < 3) {
if ((nread =
fread((char *) &ProtoSet->ProtoPruner, 1,
sizeof(PROTO_PRUNER), File)) != sizeof(PROTO_PRUNER))
int num_buckets = NUM_PP_PARAMS * NUM_PP_BUCKETS * WERDS_PER_PP_VECTOR;
if (fp->FReadEndian(&ProtoSet->ProtoPruner,
sizeof(ProtoSet->ProtoPruner[0][0][0]), num_buckets,
swap) != num_buckets)
tprintf("Bad read of inttemp!\n");
for (x = 0; x < PROTOS_PER_PROTO_SET; x++) {
if (fp->FRead(&ProtoSet->Protos[x].A, sizeof(ProtoSet->Protos[x].A),
1) != 1 ||
fp->FRead(&ProtoSet->Protos[x].B, sizeof(ProtoSet->Protos[x].B),
1) != 1 ||
fp->FRead(&ProtoSet->Protos[x].C, sizeof(ProtoSet->Protos[x].C),
1) != 1 ||
fp->FRead(&ProtoSet->Protos[x].Angle,
sizeof(ProtoSet->Protos[x].Angle), 1) != 1)
tprintf("Bad read of inttemp!\n");
if (fp->FReadEndian(&ProtoSet->Protos[x].Configs,
sizeof(ProtoSet->Protos[x].Configs[0]),
WerdsPerConfigVec, swap) != WerdsPerConfigVec)
cprintf("Bad read of inttemp!\n");
for (x = 0; x < PROTOS_PER_PROTO_SET; x++) {
if ((nread = fread((char *) &ProtoSet->Protos[x].A, 1,
sizeof(inT8), File)) != sizeof(inT8) ||
(nread = fread((char *) &ProtoSet->Protos[x].B, 1,
sizeof(uinT8), File)) != sizeof(uinT8) ||
(nread = fread((char *) &ProtoSet->Protos[x].C, 1,
sizeof(inT8), File)) != sizeof(inT8) ||
(nread = fread((char *) &ProtoSet->Protos[x].Angle, 1,
sizeof(uinT8), File)) != sizeof(uinT8))
cprintf("Bad read of inttemp!\n");
for (y = 0; y < WerdsPerConfigVec; y++)
if ((nread = fread((char *) &ProtoSet->Protos[x].Configs[y], 1,
sizeof(uinT32), File)) != sizeof(uinT32))
cprintf("Bad read of inttemp!\n");
}
} else {
if ((nread =
fread((char *) ProtoSet, 1, sizeof(PROTO_SET_STRUCT),
File)) != sizeof(PROTO_SET_STRUCT))
cprintf("Bad read of inttemp!\n");
}
if (swap) {
for (x = 0; x < NUM_PP_PARAMS; x++)
for (y = 0; y < NUM_PP_BUCKETS; y++)
for (z = 0; z < WERDS_PER_PP_VECTOR; z++)
Reverse32(&ProtoSet->ProtoPruner[x][y][z]);
for (x = 0; x < PROTOS_PER_PROTO_SET; x++)
for (y = 0; y < WerdsPerConfigVec; y++)
Reverse32(&ProtoSet->Protos[x].Configs[y]);
}
Class->ProtoSets[j] = ProtoSet;
}
if (version_id < 4)
if (version_id < 4) {
Class->font_set_id = -1;
else {
fread(&Class->font_set_id, sizeof(int), 1, File);
if (swap)
Reverse32(&Class->font_set_id);
} else {
fp->FReadEndian(&Class->font_set_id, sizeof(Class->font_set_id), 1, swap);
}
}
@ -1037,13 +978,12 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
}
}
if (version_id >= 4) {
this->fontinfo_table_.read(File, NewPermanentTessCallback(read_info), swap);
this->fontinfo_table_.read(fp, NewPermanentTessCallback(read_info), swap);
if (version_id >= 5) {
this->fontinfo_table_.read(File,
NewPermanentTessCallback(read_spacing_info),
swap);
this->fontinfo_table_.read(
fp, NewPermanentTessCallback(read_spacing_info), swap);
}
this->fontset_table_.read(File, NewPermanentTessCallback(read_set), swap);
this->fontset_table_.read(fp, NewPermanentTessCallback(read_set), swap);
}
// Clean up.

View File

@ -86,27 +86,6 @@ bool MasterTrainer::Serialize(FILE* fp) const {
return true;
}
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
// Members are read in a fixed order: norm_mode_, unicharset_, feature_space_,
// samples_, junk_samples_, verify_samples_, master_shapes_, flat_shapes_,
// fontinfo_table_, xheights_. The first failing read stops the sequence.
bool MasterTrainer::DeSerialize(bool swap, FILE* fp) {
if (fread(&norm_mode_, sizeof(norm_mode_), 1, fp) != 1) return false;
if (swap) {
ReverseN(&norm_mode_, sizeof(norm_mode_));
}
if (!unicharset_.load_from_file(fp)) return false;
// charsetsize_ is refreshed from the unicharset that was just loaded.
charsetsize_ = unicharset_.size();
if (!feature_space_.DeSerialize(swap, fp)) return false;
// feature_map_ is re-initialised from the freshly read feature space.
feature_map_.Init(feature_space_);
if (!samples_.DeSerialize(swap, fp)) return false;
if (!junk_samples_.DeSerialize(swap, fp)) return false;
if (!verify_samples_.DeSerialize(swap, fp)) return false;
if (!master_shapes_.DeSerialize(swap, fp)) return false;
if (!flat_shapes_.DeSerialize(swap, fp)) return false;
if (!fontinfo_table_.DeSerialize(swap, fp)) return false;
if (!xheights_.DeSerialize(swap, fp)) return false;
return true;
}
// Load an initial unicharset, or set one up if the file cannot be read.
void MasterTrainer::LoadUnicharset(const char* filename) {
if (!unicharset_.load_from_file(filename)) {

View File

@ -74,9 +74,6 @@ class MasterTrainer {
// Writes to the given file. Returns false in case of error.
bool Serialize(FILE* fp) const;
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool DeSerialize(bool swap, FILE* fp);
// Loads an initial unicharset, or sets one up if the file cannot be read.
void LoadUnicharset(const char* filename);

View File

@ -242,7 +242,7 @@ namespace tesseract {
* @note Exceptions: none
* @note History: Wed Dec 19 16:38:49 1990, DSJ, Created.
*/
NORM_PROTOS *Classify::ReadNormProtos(FILE *File, inT64 end_offset) {
NORM_PROTOS *Classify::ReadNormProtos(TFile *fp) {
NORM_PROTOS *NormProtos;
int i;
char unichar[2 * UNICHAR_LEN + 1];
@ -258,26 +258,26 @@ NORM_PROTOS *Classify::ReadNormProtos(FILE *File, inT64 end_offset) {
NormProtos->Protos[i] = NIL_LIST;
/* read file header and save in data structure */
NormProtos->NumParams = ReadSampleSize (File);
NormProtos->ParamDesc = ReadParamDesc (File, NormProtos->NumParams);
NormProtos->NumParams = ReadSampleSize(fp);
NormProtos->ParamDesc = ReadParamDesc(fp, NormProtos->NumParams);
/* read protos for each class into a separate list */
while ((end_offset < 0 || ftell(File) < end_offset) &&
tfscanf(File, "%s %d", unichar, &NumProtos) == 2) {
const int kMaxLineSize = 100;
char line[kMaxLineSize];
while (fp->FGets(line, kMaxLineSize) != nullptr) {
if (sscanf(line, "%s %d", unichar, &NumProtos) != 2) continue;
if (unicharset.contains_unichar(unichar)) {
unichar_id = unicharset.unichar_to_id(unichar);
Protos = NormProtos->Protos[unichar_id];
for (i = 0; i < NumProtos; i++)
Protos =
push_last (Protos, ReadPrototype (File, NormProtos->NumParams));
Protos = push_last(Protos, ReadPrototype(fp, NormProtos->NumParams));
NormProtos->Protos[unichar_id] = Protos;
} else {
cprintf("Error: unichar %s in normproto file is not in unichar set.\n",
tprintf("Error: unichar %s in normproto file is not in unichar set.\n",
unichar);
for (i = 0; i < NumProtos; i++)
FreePrototype(ReadPrototype (File, NormProtos->NumParams));
FreePrototype(ReadPrototype(fp, NormProtos->NumParams));
}
SkipNewline(File);
}
return (NormProtos);
} /* ReadNormProtos */

View File

@ -71,10 +71,9 @@ bool UnicharAndFonts::Serialize(FILE* fp) const {
}
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
// Reads unichar_id first, then font_ids.
// NOTE(review): two revisions are interleaved in this span: the FILE* one
// (fread followed by ReverseN) and the TFile* one (FReadEndian).
bool UnicharAndFonts::DeSerialize(bool swap, FILE* fp) {
if (fread(&unichar_id, sizeof(unichar_id), 1, fp) != 1) return false;
if (swap)
ReverseN(&unichar_id, sizeof(unichar_id));
bool UnicharAndFonts::DeSerialize(bool swap, TFile* fp) {
// FReadEndian takes the swap flag, replacing the separate ReverseN step.
if (fp->FReadEndian(&unichar_id, sizeof(unichar_id), 1, swap) != 1)
return false;
if (!font_ids.DeSerialize(swap, fp)) return false;
return true;
}
@ -96,10 +95,9 @@ bool Shape::Serialize(FILE* fp) const {
}
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
// Reads the sorted flag first, then the unichars_ entries.
// NOTE(review): two revisions are interleaved in this span: the FILE* one
// (fread) and the TFile* one (FRead).
bool Shape::DeSerialize(bool swap, FILE* fp) {
bool Shape::DeSerialize(bool swap, TFile* fp) {
// The flag is stored as a uinT8; any non-zero value means sorted.
uinT8 sorted;
if (fread(&sorted, sizeof(sorted), 1, fp) != 1)
return false;
if (fp->FRead(&sorted, sizeof(sorted), 1) != 1) return false;
unichars_sorted_ = sorted != 0;
if (!unichars_.DeSerializeClasses(swap, fp)) return false;
return true;
}
}
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
// NOTE(review): both the FILE* and the TFile* signature lines appear in this
// span; the body is shared by the two revisions.
bool ShapeTable::DeSerialize(bool swap, FILE* fp) {
bool ShapeTable::DeSerialize(bool swap, TFile* fp) {
if (!shape_table_.DeSerialize(swap, fp)) return false;
// num_fonts_ is reset to 0 after a successful read.
num_fonts_ = 0;
return true;
}
View File

@ -168,7 +168,7 @@ struct UnicharAndFonts {
bool Serialize(FILE* fp) const;
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool DeSerialize(bool swap, FILE* fp);
bool DeSerialize(bool swap, TFile* fp);
// Sort function to sort a pair of UnicharAndFonts by unichar_id.
static int SortByUnicharId(const void* v1, const void* v2);
@ -191,7 +191,7 @@ class Shape {
bool Serialize(FILE* fp) const;
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool DeSerialize(bool swap, FILE* fp);
bool DeSerialize(bool swap, TFile* fp);
int destination_index() const {
return destination_index_;
@ -272,7 +272,7 @@ class ShapeTable {
bool Serialize(FILE* fp) const;
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool DeSerialize(bool swap, FILE* fp);
bool DeSerialize(bool swap, TFile* fp);
// Accessors.
int NumShapes() const {

View File

@ -174,11 +174,7 @@ bool Dawg::match_words(WERD_CHOICE *word, inT32 index,
return false;
}
void Dawg::init(DawgType type, const STRING &lang,
PermuterType perm, int unicharset_size, int debug_level) {
type_ = type;
lang_ = lang;
perm_ = perm;
void Dawg::init(int unicharset_size) {
ASSERT_HOST(unicharset_size > 0);
unicharset_size_ = unicharset_size;
// Set bit masks. We will use the value unicharset_size_ as a null char, so
@ -188,8 +184,6 @@ void Dawg::init(DawgType type, const STRING &lang,
letter_mask_ = ~(~0ull << flag_start_bit_);
next_node_mask_ = ~0ull << (flag_start_bit_ + NUM_FLAG_BITS);
flags_mask_ = ~(letter_mask_ | next_node_mask_);
debug_level_ = debug_level;
}
@ -315,44 +309,34 @@ void SquishedDawg::print_edge(EDGE_REF edge) const {
}
}
// Loads a squished dawg from the given file. The layout is a 16-bit magic
// number, a 32-bit unicharset size, a 32-bit edge count, then num_edges_ edge
// records.
// NOTE(review): two revisions are interleaved in this span. The FILE* revision
// returns void, takes type/lang/perm/debug_level as arguments and does not
// check its fread calls. The TFile* revision returns bool (false on a short
// read), uses the debug_level_, type_, lang_ and perm_ members, and lets
// FReadEndian do the byte swapping.
void SquishedDawg::read_squished_dawg(FILE *file,
DawgType type,
const STRING &lang,
PermuterType perm,
int debug_level) {
if (debug_level) tprintf("Reading squished dawg\n");
bool SquishedDawg::read_squished_dawg(TFile *file) {
if (debug_level_) tprintf("Reading squished dawg\n");
// Read the magic number and if it does not match kDawgMagicNumber
// set swap to true to indicate that we need to switch endianness.
inT16 magic;
fread(&magic, sizeof(inT16), 1, file);
if (file->FRead(&magic, sizeof(inT16), 1) != 1) return false;
bool swap = (magic != kDawgMagicNumber);
int unicharset_size;
fread(&unicharset_size, sizeof(inT32), 1, file);
fread(&num_edges_, sizeof(inT32), 1, file);
if (swap) {
ReverseN(&unicharset_size, sizeof(unicharset_size));
ReverseN(&num_edges_, sizeof(num_edges_));
}
inT32 unicharset_size;
if (file->FReadEndian(&unicharset_size, sizeof(unicharset_size), 1, swap) !=
1)
return false;
if (file->FReadEndian(&num_edges_, sizeof(num_edges_), 1, swap) != 1)
return false;
ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty
Dawg::init(type, lang, perm, unicharset_size, debug_level);
// TFile revision: init only needs the unicharset size here.
Dawg::init(unicharset_size);
// Storage for all edge records is allocated in one piece, sized by num_edges_.
edges_ = (EDGE_ARRAY) memalloc(sizeof(EDGE_RECORD) * num_edges_);
fread(&edges_[0], sizeof(EDGE_RECORD), num_edges_, file);
EDGE_REF edge;
if (swap) {
for (edge = 0; edge < num_edges_; ++edge) {
ReverseN(&edges_[edge], sizeof(edges_[edge]));
}
}
if (debug_level > 2) {
// NOTE(review): on this early return edges_ has already been allocated and
// is not released here -- confirm it is freed elsewhere.
if (file->FReadEndian(&edges_[0], sizeof(edges_[0]), num_edges_, swap) !=
num_edges_)
return false;
// At debug levels above 2 the header values and every edge are printed.
if (debug_level_ > 2) {
tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n",
type_, lang_.string(), perm_, unicharset_size_, num_edges_);
for (edge = 0; edge < num_edges_; ++edge)
print_edge(edge);
for (EDGE_REF edge = 0; edge < num_edges_; ++edge) print_edge(edge);
}
return true;
}
NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const {

View File

@ -201,7 +201,12 @@ class Dawg {
}
protected:
Dawg() {}
Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
: type_(type),
lang_(lang),
perm_(perm),
unicharset_size_(0),
debug_level_(debug_level) {}
/// Returns the next node visited by following this edge.
inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const {
@ -274,10 +279,9 @@ class Dawg {
(!word_end || (word_end == other_word_end)));
}
/// Sets type_, lang_, perm_, unicharset_size_.
/// Sets unicharset_size_.
/// Initializes the values of various masks from unicharset_size_.
void init(DawgType type, const STRING &lang,
PermuterType perm, int unicharset_size, int debug_level);
void init(int unicharset_size);
/// Matches all of the words that are represented by this string.
/// If wildcard is set to something other than INVALID_UNICHAR_ID,
@ -407,32 +411,36 @@ class DawgPositionVector : public GenericVector<DawgPosition> {
//
class SquishedDawg : public Dawg {
public:
SquishedDawg(FILE *file, DawgType type, const STRING &lang,
PermuterType perm, int debug_level) {
read_squished_dawg(file, type, lang, perm, debug_level);
// Constructs an empty SquishedDawg that holds no edges yet; call Load() to
// fill it from a TFile.
// This constructor reads nothing, so the edge members are given explicit
// empty values instead of being left indeterminate. If Load() fails before
// edges_ is allocated and the object is then destroyed, the destructor
// (defined elsewhere) sees a null edges_ rather than a garbage pointer.
SquishedDawg(DawgType type, const STRING &lang, PermuterType perm,
             int debug_level)
    : Dawg(type, lang, perm, debug_level),
      edges_(nullptr),
      num_edges_(0),
      num_forward_edges_in_node0(0) {}
// Constructs a SquishedDawg by reading it from the file named by filename.
// type, lang, perm and debug_level are forwarded to the Dawg base.
// Both opening the file and reading the dawg are wrapped in ASSERT_HOST, so
// a false result from either step is treated as an assertion failure rather
// than being reported to the caller.
SquishedDawg(const char *filename, DawgType type, const STRING &lang,
PermuterType perm, int debug_level)
: Dawg(type, lang, perm, debug_level) {
TFile file;
ASSERT_HOST(file.Open(filename, nullptr));
ASSERT_HOST(read_squished_dawg(&file));
// Record the forward-edge count of node 0 once the edges have been read.
num_forward_edges_in_node0 = num_forward_edges(0);
}
SquishedDawg(const char* filename, DawgType type,
const STRING &lang, PermuterType perm, int debug_level) {
FILE *file = fopen(filename, "rb");
if (file == NULL) {
tprintf("Failed to open dawg file %s\n", filename);
exit(1);
}
read_squished_dawg(file, type, lang, perm, debug_level);
num_forward_edges_in_node0 = num_forward_edges(0);
fclose(file);
}
SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type,
const STRING &lang, PermuterType perm,
int unicharset_size, int debug_level) :
edges_(edges), num_edges_(num_edges) {
init(type, lang, perm, unicharset_size, debug_level);
const STRING &lang, PermuterType perm, int unicharset_size,
int debug_level)
: Dawg(type, lang, perm, debug_level),
edges_(edges),
num_edges_(num_edges) {
init(unicharset_size);
num_forward_edges_in_node0 = num_forward_edges(0);
if (debug_level > 3) print_all("SquishedDawg:");
}
~SquishedDawg();
// Reads the dawg from the given TFile.
// Returns true when the read succeeded, in which case the forward-edge
// count of node 0 has also been refreshed; returns false on failure and
// leaves that count untouched.
bool Load(TFile *fp) {
  const bool loaded = read_squished_dawg(fp);
  if (loaded) {
    num_forward_edges_in_node0 = num_forward_edges(0);
  }
  return loaded;
}
int NumEdges() { return num_edges_; }
/// Returns the edge that corresponds to the letter out of this node.
@ -529,8 +537,7 @@ class SquishedDawg : public Dawg {
inT32 num_forward_edges(NODE_REF node) const;
/// Reads SquishedDawg from a file.
void read_squished_dawg(FILE *file, DawgType type, const STRING &lang,
PermuterType perm, int debug_level);
bool read_squished_dawg(TFile *file);
/// Prints the contents of an edge indicated by the given EDGE_REF.
void print_edge(EDGE_REF edge) const;
@ -547,7 +554,7 @@ class SquishedDawg : public Dawg {
// Member variables.
EDGE_ARRAY edges_;
int num_edges_;
inT32 num_edges_;
int num_forward_edges_in_node0;
};

View File

@ -27,44 +27,33 @@
namespace tesseract {
struct DawgLoader {
DawgLoader(const STRING &lang,
const char *data_file_name,
TessdataType tessdata_dawg_type,
int dawg_debug_level)
DawgLoader(const STRING &lang, TessdataType tessdata_dawg_type,
int dawg_debug_level, TessdataManager *data_file)
: lang_(lang),
data_file_name_(data_file_name),
data_file_(data_file),
tessdata_dawg_type_(tessdata_dawg_type),
dawg_debug_level_(dawg_debug_level) {}
Dawg *Load();
STRING lang_;
const char *data_file_name_;
TessdataManager *data_file_;
TessdataType tessdata_dawg_type_;
int dawg_debug_level_;
};
Dawg *DawgCache::GetSquishedDawg(
const STRING &lang,
const char *data_file_name,
TessdataType tessdata_dawg_type,
int debug_level) {
STRING data_id = data_file_name;
Dawg *DawgCache::GetSquishedDawg(const STRING &lang,
TessdataType tessdata_dawg_type,
int debug_level, TessdataManager *data_file) {
STRING data_id = data_file->GetDataFileName();
data_id += kTessdataFileSuffixes[tessdata_dawg_type];
DawgLoader loader(lang, data_file_name, tessdata_dawg_type, debug_level);
DawgLoader loader(lang, tessdata_dawg_type, debug_level, data_file);
return dawgs_.Get(data_id, NewTessCallback(&loader, &DawgLoader::Load));
}
Dawg *DawgLoader::Load() {
TessdataManager data_loader;
if (!data_loader.Init(data_file_name_, dawg_debug_level_)) {
return NULL;
}
if (!data_loader.SeekToStart(tessdata_dawg_type_)) {
data_loader.End();
return NULL;
}
FILE *fp = data_loader.GetDataFilePtr();
TFile fp;
if (!data_file_->GetComponent(tessdata_dawg_type_, &fp)) return nullptr;
DawgType dawg_type;
PermuterType perm_type;
switch (tessdata_dawg_type_) {
@ -96,13 +85,13 @@ Dawg *DawgLoader::Load() {
perm_type = FREQ_DAWG_PERM;
break;
default:
data_loader.End();
return NULL;
return nullptr;
}
SquishedDawg *retval =
new SquishedDawg(fp, dawg_type, lang_, perm_type, dawg_debug_level_);
data_loader.End();
return retval;
new SquishedDawg(dawg_type, lang_, perm_type, dawg_debug_level_);
if (retval->Load(&fp)) return retval;
delete retval;
return nullptr;
}
} // namespace tesseract

View File

@ -29,11 +29,8 @@ namespace tesseract {
class DawgCache {
public:
Dawg *GetSquishedDawg(
const STRING &lang,
const char *data_file_name,
TessdataType tessdata_dawg_type,
int debug_level);
Dawg *GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type,
int debug_level, TessdataManager *data_file);
// If we manage the given dawg, decrement its count,
// and possibly delete it if the count reaches zero.

View File

@ -221,35 +221,35 @@ void Dict::SetupForLoad(DawgCache *dawg_cache) {
}
// Loads the dawgs needed by Tesseract. Call FinishLoad() after.
void Dict::Load(const char *data_file_name, const STRING &lang) {
void Dict::Load(const STRING &lang, TessdataManager *data_file) {
// Load dawgs_.
if (load_punc_dawg) {
punc_dawg_ = dawg_cache_->GetSquishedDawg(
lang, data_file_name, TESSDATA_PUNC_DAWG, dawg_debug_level);
punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG,
dawg_debug_level, data_file);
if (punc_dawg_) dawgs_ += punc_dawg_;
}
if (load_system_dawg) {
Dawg *system_dawg = dawg_cache_->GetSquishedDawg(
lang, data_file_name, TESSDATA_SYSTEM_DAWG, dawg_debug_level);
lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
if (system_dawg) dawgs_ += system_dawg;
}
if (load_number_dawg) {
Dawg *number_dawg = dawg_cache_->GetSquishedDawg(
lang, data_file_name, TESSDATA_NUMBER_DAWG, dawg_debug_level);
lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
if (number_dawg) dawgs_ += number_dawg;
}
if (load_bigram_dawg) {
bigram_dawg_ = dawg_cache_->GetSquishedDawg(
lang, data_file_name, TESSDATA_BIGRAM_DAWG, dawg_debug_level);
bigram_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG,
dawg_debug_level, data_file);
}
if (load_freq_dawg) {
freq_dawg_ = dawg_cache_->GetSquishedDawg(
lang, data_file_name, TESSDATA_FREQ_DAWG, dawg_debug_level);
freq_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG,
dawg_debug_level, data_file);
if (freq_dawg_) { dawgs_ += freq_dawg_; }
}
if (load_unambig_dawg) {
unambig_dawg_ = dawg_cache_->GetSquishedDawg(
lang, data_file_name, TESSDATA_UNAMBIG_DAWG, dawg_debug_level);
unambig_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG,
dawg_debug_level, data_file);
if (unambig_dawg_) dawgs_ += unambig_dawg_;
}
@ -302,21 +302,21 @@ void Dict::Load(const char *data_file_name, const STRING &lang) {
}
// Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
void Dict::LoadLSTM(const char *data_file_name, const STRING &lang) {
void Dict::LoadLSTM(const STRING &lang, TessdataManager *data_file) {
// Load dawgs_.
if (load_punc_dawg) {
punc_dawg_ = dawg_cache_->GetSquishedDawg(
lang, data_file_name, TESSDATA_LSTM_PUNC_DAWG, dawg_debug_level);
punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG,
dawg_debug_level, data_file);
if (punc_dawg_) dawgs_ += punc_dawg_;
}
if (load_system_dawg) {
Dawg *system_dawg = dawg_cache_->GetSquishedDawg(
lang, data_file_name, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level);
lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
if (system_dawg) dawgs_ += system_dawg;
}
if (load_number_dawg) {
Dawg *number_dawg = dawg_cache_->GetSquishedDawg(
lang, data_file_name, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level);
lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
if (number_dawg) dawgs_ += number_dawg;
}
}

View File

@ -298,9 +298,9 @@ class Dict {
// Sets up ready for a Load or LoadLSTM.
void SetupForLoad(DawgCache *dawg_cache);
// Loads the dawgs needed by Tesseract. Call FinishLoad() after.
void Load(const char *data_file_name, const STRING &lang);
void Load(const STRING &lang, TessdataManager *data_file);
// Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
void LoadLSTM(const char *data_file_name, const STRING &lang);
void LoadLSTM(const STRING &lang, TessdataManager *data_file);
// Completes the loading process after Load() and/or LoadLSTM().
// Returns false if no dictionaries were loaded.
bool FinishLoad();

View File

@ -87,8 +87,9 @@ class Trie : public Dawg {
// contain more edges than max_num_edges, all the edges are cleared
// so that new inserts can proceed).
Trie(DawgType type, const STRING &lang, PermuterType perm,
int unicharset_size, int debug_level) {
init(type, lang, perm, unicharset_size, debug_level);
int unicharset_size, int debug_level)
: Dawg(type, lang, perm, debug_level) {
init(unicharset_size);
num_edges_ = 0;
deref_node_index_mask_ = ~letter_mask_;
new_dawg_node(); // need to allocate node 0

View File

@ -127,12 +127,11 @@ bool LSTMRecognizer::DeSerialize(bool swap, TFile* fp) {
// on the unicharset matching. This enables training to deserialize a model
// from checkpoint or restore without having to go back and reload the
// dictionary.
bool LSTMRecognizer::LoadDictionary(const char* data_file_name,
const char* lang) {
bool LSTMRecognizer::LoadDictionary(const char* lang, TessdataManager* mgr) {
delete dict_;
dict_ = new Dict(&ccutil_);
dict_->SetupForLoad(Dict::GlobalDawgCache());
dict_->LoadLSTM(data_file_name, lang);
dict_->LoadLSTM(lang, mgr);
if (dict_->FinishLoad()) return true; // Success.
tprintf("Failed to load any lstm-specific dictionaries for lang %s!!\n",
lang);

View File

@ -167,7 +167,7 @@ class LSTMRecognizer {
// on the unicharset matching. This enables training to deserialize a model
// from checkpoint or restore without having to go back and reload the
// dictionary.
bool LoadDictionary(const char* data_file_name, const char* lang);
bool LoadDictionary(const char* lang, TessdataManager* mgr);
// Recognizes the line image, contained within image_data, returning the
// ratings matrix and matching box_word for each WERD_RES in the output.

View File

@ -1223,7 +1223,7 @@ double LSTMTrainer::ComputeWordError(STRING* truth_str, STRING* ocr_str) {
std::string truth_word(truth_words[i].string());
StrMap::iterator it = word_counts.find(truth_word);
if (it == word_counts.end())
word_counts.insert(make_pair(truth_word, 1));
word_counts.insert(std::make_pair(truth_word, 1));
else
++it->second;
}
@ -1231,7 +1231,7 @@ double LSTMTrainer::ComputeWordError(STRING* truth_str, STRING* ocr_str) {
std::string ocr_word(ocr_words[i].string());
StrMap::iterator it = word_counts.find(ocr_word);
if (it == word_counts.end())
word_counts.insert(make_pair(ocr_word, -1));
word_counts.insert(std::make_pair(ocr_word, -1));
else
--it->second;
}

View File

@ -31,7 +31,6 @@ STRING_PARAM_FLAG(classifier, "", "Classifier to test");
STRING_PARAM_FLAG(lang, "eng", "Language to test");
STRING_PARAM_FLAG(tessdata_dir, "", "Directory of traineddata files");
DECLARE_INT_PARAM_FLAG(debug_level);
DECLARE_STRING_PARAM_FLAG(T);
enum ClassifierName {
CN_PRUNER,
@ -79,13 +78,6 @@ static tesseract::ShapeClassifier* InitializeClassifier(
}
tesseract::ShapeClassifier* shape_classifier = nullptr;
if (!FLAGS_T.empty()) {
const char* config_name;
while ((config_name = GetNextFilename(argc, argv)) != nullptr) {
tprintf("Reading config file %s ...\n", config_name);
(*api)->ReadConfigFile(config_name);
}
}
if (classifier == CN_PRUNER) {
shape_classifier = new tesseract::TessClassifier(true, classify);
} else if (classifier == CN_FULL) {

View File

@ -65,6 +65,7 @@
//
int main(int argc, char **argv) {
int i;
tesseract::TessdataManager tm;
if (argc == 2) {
printf("Combining tessdata files\n");
STRING lang = argv[1];
@ -73,8 +74,7 @@ int main(int argc, char **argv) {
lang += '.';
STRING output_file = lang;
output_file += kTrainedDataSuffix;
if (!tesseract::TessdataManager::CombineDataFiles(
lang.string(), output_file.string())) {
if (!tm.CombineDataFiles(lang.string(), output_file.string())) {
printf("Error combining tessdata files into %s\n",
output_file.string());
} else {
@ -83,8 +83,7 @@ int main(int argc, char **argv) {
} else if (argc >= 4 && (strcmp(argv[1], "-e") == 0 ||
strcmp(argv[1], "-u") == 0)) {
// Initialize TessdataManager with the data in the given traineddata file.
tesseract::TessdataManager tm;
tm.Init(argv[2], 0);
tm.Init(argv[2]);
printf("Extracting tessdata components from %s\n", argv[2]);
if (strcmp(argv[1], "-e") == 0) {
for (i = 3; i < argc; ++i) {
@ -107,7 +106,6 @@ int main(int argc, char **argv) {
}
}
}
tm.End();
} else if (argc >= 4 && strcmp(argv[1], "-o") == 0) {
// Rename the current traineddata file to a temporary name.
const char *new_traineddata_filename = argv[2];
@ -120,12 +118,10 @@ int main(int argc, char **argv) {
}
// Initialize TessdataManager with the data in the given traineddata file.
tesseract::TessdataManager tm;
tm.Init(traineddata_filename.string(), 0);
tm.Init(traineddata_filename.string());
// Write the updated traineddata file.
tm.OverwriteComponents(new_traineddata_filename, argv+3, argc-3);
tm.End();
} else {
printf("Usage for combining tessdata components:\n"
" %s language_data_path_prefix\n"
@ -143,4 +139,5 @@ int main(int argc, char **argv) {
" (e.g. %s -u eng.traineddata tmp/eng.)\n", argv[0], argv[0]);
return 1;
}
tm.Directory();
}

View File

@ -59,7 +59,6 @@ STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
STRING_PARAM_FLAG(X, "", "File listing font xheights");
STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
STRING_PARAM_FLAG(O, "", "File to write unicharset to");
STRING_PARAM_FLAG(T, "", "File to load trainer from");
STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
DOUBLE_PARAM_FLAG(clusterconfig_min_samples_fraction, Config.MinSamples,
@ -118,10 +117,10 @@ ShapeTable* LoadShapeTable(const STRING& file_prefix) {
ShapeTable* shape_table = nullptr;
STRING shape_table_file = file_prefix;
shape_table_file += kShapeTableFileSuffix;
FILE* shape_fp = fopen(shape_table_file.string(), "rb");
if (shape_fp != nullptr) {
TFile shape_fp;
if (shape_fp.Open(shape_table_file.string(), nullptr)) {
shape_table = new ShapeTable;
if (!shape_table->DeSerialize(false, shape_fp)) {
if (!shape_table->DeSerialize(false, &shape_fp)) {
delete shape_table;
shape_table = nullptr;
tprintf("Error: Failed to read shape table %s\n",
@ -131,7 +130,6 @@ ShapeTable* LoadShapeTable(const STRING& file_prefix) {
tprintf("Read shape table %s of %d shapes\n",
shape_table_file.string(), num_shapes);
}
fclose(shape_fp);
} else {
tprintf("Warning: No shape table file present: %s\n",
shape_table_file.string());
@ -199,75 +197,55 @@ MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
FLAGS_debug_level);
IntFeatureSpace fs;
fs.Init(kBoostXYBuckets, kBoostXYBuckets, kBoostDirBuckets);
if (FLAGS_T.empty()) {
trainer->LoadUnicharset(FLAGS_U.c_str());
// Get basic font information from font_properties.
if (!FLAGS_F.empty()) {
if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
delete trainer;
return nullptr;
}
}
if (!FLAGS_X.empty()) {
if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
delete trainer;
return nullptr;
}
}
trainer->SetFeatureSpace(fs);
const char* page_name;
// Load training data from .tr files on the command line.
while ((page_name = GetNextFilename(argc, argv)) != nullptr) {
tprintf("Reading %s ...\n", page_name);
trainer->ReadTrainingSamples(page_name, feature_defs, false);
// If there is a file with [lang].[fontname].exp[num].fontinfo present,
// read font spacing information in to fontinfo_table.
int pagename_len = strlen(page_name);
char *fontinfo_file_name = new char[pagename_len + 7];
strncpy(fontinfo_file_name, page_name, pagename_len - 2); // remove "tr"
strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo"); // +"fontinfo"
trainer->AddSpacingInfo(fontinfo_file_name);
delete[] fontinfo_file_name;
// Load the images into memory if required by the classifier.
if (FLAGS_load_images) {
STRING image_name = page_name;
// Chop off the tr and replace with tif. Extension must be tif!
image_name.truncate_at(image_name.length() - 2);
image_name += "tif";
trainer->LoadPageImages(image_name.string());
}
}
trainer->PostLoadCleanup();
// Write the master trainer if required.
if (!FLAGS_output_trainer.empty()) {
FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
if (fp == nullptr) {
tprintf("Can't create saved trainer data!\n");
} else {
trainer->Serialize(fp);
fclose(fp);
}
}
} else {
bool success = false;
tprintf("Loading master trainer from file:%s\n",
FLAGS_T.c_str());
FILE* fp = fopen(FLAGS_T.c_str(), "rb");
if (fp == nullptr) {
tprintf("Can't read file %s to initialize master trainer\n",
FLAGS_T.c_str());
} else {
success = trainer->DeSerialize(false, fp);
fclose(fp);
}
if (!success) {
tprintf("Deserialize of master trainer failed!\n");
trainer->LoadUnicharset(FLAGS_U.c_str());
// Get basic font information from font_properties.
if (!FLAGS_F.empty()) {
if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
delete trainer;
return nullptr;
}
trainer->SetFeatureSpace(fs);
}
if (!FLAGS_X.empty()) {
if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
delete trainer;
return nullptr;
}
}
trainer->SetFeatureSpace(fs);
const char* page_name;
// Load training data from .tr files on the command line.
while ((page_name = GetNextFilename(argc, argv)) != nullptr) {
tprintf("Reading %s ...\n", page_name);
trainer->ReadTrainingSamples(page_name, feature_defs, false);
// If there is a file with [lang].[fontname].exp[num].fontinfo present,
// read font spacing information in to fontinfo_table.
int pagename_len = strlen(page_name);
char* fontinfo_file_name = new char[pagename_len + 7];
strncpy(fontinfo_file_name, page_name, pagename_len - 2); // remove "tr"
strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo"); // +"fontinfo"
trainer->AddSpacingInfo(fontinfo_file_name);
delete[] fontinfo_file_name;
// Load the images into memory if required by the classifier.
if (FLAGS_load_images) {
STRING image_name = page_name;
// Chop off the tr and replace with tif. Extension must be tif!
image_name.truncate_at(image_name.length() - 2);
image_name += "tif";
trainer->LoadPageImages(image_name.string());
}
}
trainer->PostLoadCleanup();
// Write the master trainer if required.
if (!FLAGS_output_trainer.empty()) {
FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
if (fp == nullptr) {
tprintf("Can't create saved trainer data!\n");
} else {
trainer->Serialize(fp);
fclose(fp);
}
}
trainer->PreTrainingSetup();
if (!FLAGS_O.empty() &&

View File

@ -19,6 +19,7 @@
#include "dawg.h"
#include "host.h"
#include "serialis.h"
#include "tesscallback.h"
#include "trie.h"
#include "unicharset.h"
@ -28,17 +29,20 @@ const int kDictDebugLevel = 1;
tesseract::Dawg *LoadSquishedDawg(const UNICHARSET &unicharset,
const char *filename) {
const int kDictDebugLevel = 1;
FILE *dawg_file = fopen(filename, "rb");
if (dawg_file == nullptr) {
tesseract::TFile dawg_file;
if (!dawg_file.Open(filename, nullptr)) {
tprintf("Could not open %s for reading.\n", filename);
return nullptr;
}
tprintf("Loading word list from %s\n", filename);
tesseract::Dawg *retval = new tesseract::SquishedDawg(
dawg_file, tesseract::DAWG_TYPE_WORD, "eng", SYSTEM_DAWG_PERM,
kDictDebugLevel);
tesseract::SquishedDawg *retval = new tesseract::SquishedDawg(
tesseract::DAWG_TYPE_WORD, "eng", SYSTEM_DAWG_PERM, kDictDebugLevel);
if (!retval->Load(&dawg_file)) {
tprintf("Could not read %s\n", filename);
delete retval;
return nullptr;
}
tprintf("Word list loaded.\n");
fclose(dawg_file);
return retval;
}

View File

@ -100,17 +100,15 @@ bool ParamsModel::Equivalent(const ParamsModel &that) const {
bool ParamsModel::LoadFromFile(
const char *lang,
const char *full_path) {
FILE *fp = fopen(full_path, "rb");
if (!fp) {
TFile fp;
if (!fp.Open(full_path, nullptr)) {
tprintf("Error opening file %s\n", full_path);
return false;
}
bool result = LoadFromFp(lang, fp, -1);
fclose(fp);
return result;
return LoadFromFp(lang, &fp);
}
bool ParamsModel::LoadFromFp(const char *lang, FILE *fp, inT64 end_offset) {
bool ParamsModel::LoadFromFp(const char *lang, TFile *fp) {
const int kMaxLineSize = 100;
char line[kMaxLineSize];
BitVector present;
@ -120,9 +118,8 @@ bool ParamsModel::LoadFromFp(const char *lang, FILE *fp, inT64 end_offset) {
GenericVector<float> &weights = weights_vec_[pass_];
weights.init_to_size(PTRAIN_NUM_FEATURE_TYPES, 0.0);
while ((end_offset < 0 || ftell(fp) < end_offset) &&
fgets(line, kMaxLineSize, fp)) {
char *key = NULL;
while (fp->FGets(line, kMaxLineSize) != nullptr) {
char *key = nullptr;
float value;
if (!ParseLine(line, &key, &value))
continue;

View File

@ -61,7 +61,7 @@ class ParamsModel {
// Returns true on success.
bool LoadFromFile(const char *lang, const char *full_path);
bool LoadFromFp(const char *lang, FILE *fp, inT64 end_offset);
bool LoadFromFp(const char *lang, TFile *fp);
const GenericVector<float>& weights() const {
return weights_vec_[pass_];

View File

@ -44,14 +44,14 @@ namespace tesseract {
* and Dawg models.
*/
void Wordrec::program_editup(const char *textbase,
bool init_classifier,
bool init_dict) {
TessdataManager *init_classifier,
TessdataManager *init_dict) {
if (textbase != NULL) imagefile = textbase;
InitFeatureDefs(&feature_defs_);
InitAdaptiveClassifier(init_classifier);
if (init_dict) {
getDict().SetupForLoad(Dict::GlobalDawgCache());
getDict().Load(tessdata_manager.GetDataFileName().string(), lang);
getDict().Load(lang, init_dict);
getDict().FinishLoad();
}
pass2_ok_split = chop_ok_split;

View File

@ -200,9 +200,8 @@ class Wordrec : public Classify {
}
// tface.cpp
void program_editup(const char *textbase,
bool init_classifier,
bool init_permute);
void program_editup(const char *textbase, TessdataManager *init_classifier,
TessdataManager *init_dict);
void cc_recog(WERD_RES *word);
void program_editdown(inT32 elasped_time);
void set_pass1();