Mirror of https://github.com/tesseract-ocr/tesseract.git (synced 2025-01-18 06:30:14 +08:00)
Added an extra Init that takes a memory buffer or a file-reader function pointer, so traineddata can be read from memory or from foreign file systems. Updated the existing readers to use the TFile API instead of FILE. This does not yet add big-endian capability to LSTM, but that is now very easy to do from here.
This commit is contained in:
parent 10e04ffe99 · commit 1cc511188d
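
A minimal usage sketch of the new in-memory Init overload (editor's illustration, not part of this commit's diff; the calling function, the pre-loaded buffer, and the error handling are assumptions):

    #include "baseapi.h"
    #include "strngs.h"

    // OCRs an image with a traineddata blob the caller already holds in memory.
    // Passing data_size != 0 selects the in-memory path added by this commit.
    bool OcrFromMemory(const char* traineddata, int traineddata_size,
                       Pix* image, STRING* out_text) {
      tesseract::TessBaseAPI api;
      if (api.Init(traineddata, traineddata_size, "eng",
                   tesseract::OEM_TESSERACT_ONLY, nullptr, 0, nullptr, nullptr,
                   false, nullptr) != 0) {
        return false;
      }
      api.SetImage(image);
      char* text = api.GetUTF8Text();
      *out_text = text;
      delete[] text;
      api.End();
      return true;
    }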
api/baseapi.cpp (104 changed lines)
@@ -108,26 +108,30 @@ const int kMinCredibleResolution = 70;
const int kMaxCredibleResolution = 2400;

TessBaseAPI::TessBaseAPI()
: tesseract_(NULL),
osd_tesseract_(NULL),
equ_detect_(NULL),
// Thresholder is initialized to NULL here, but will be set before use by:
// A constructor of a derived API, SetThresholder(), or
// created implicitly when used in InternalSetImage.
thresholder_(NULL),
paragraph_models_(NULL),
block_list_(NULL),
page_res_(NULL),
input_file_(NULL),
output_file_(NULL),
datapath_(NULL),
language_(NULL),
last_oem_requested_(OEM_DEFAULT),
recognition_done_(false),
truth_cb_(NULL),
rect_left_(0), rect_top_(0), rect_width_(0), rect_height_(0),
image_width_(0), image_height_(0) {
}
: tesseract_(nullptr),
osd_tesseract_(nullptr),
equ_detect_(nullptr),
reader_(nullptr),
// Thresholder is initialized to NULL here, but will be set before use by:
// A constructor of a derived API, SetThresholder(), or
// created implicitly when used in InternalSetImage.
thresholder_(nullptr),
paragraph_models_(nullptr),
block_list_(nullptr),
page_res_(nullptr),
input_file_(nullptr),
output_file_(nullptr),
datapath_(nullptr),
language_(nullptr),
last_oem_requested_(OEM_DEFAULT),
recognition_done_(false),
truth_cb_(NULL),
rect_left_(0),
rect_top_(0),
rect_width_(0),
rect_height_(0),
image_width_(0),
image_height_(0) {}

TessBaseAPI::~TessBaseAPI() {
End();

@@ -275,20 +279,33 @@ int TessBaseAPI::Init(const char* datapath, const char* language,
const GenericVector<STRING> *vars_vec,
const GenericVector<STRING> *vars_values,
bool set_only_non_debug_params) {
return Init(datapath, 0, language, oem, configs, configs_size, vars_vec,
vars_values, set_only_non_debug_params, nullptr);
}

// In-memory version reads the traineddata file directly from the given
// data[data_size] array. Also implements the version with a datapath in data,
// flagged by data_size = 0.
int TessBaseAPI::Init(const char* data, int data_size, const char* language,
OcrEngineMode oem, char** configs, int configs_size,
const GenericVector<STRING>* vars_vec,
const GenericVector<STRING>* vars_values,
bool set_only_non_debug_params, FileReader reader) {
PERF_COUNT_START("TessBaseAPI::Init")
// Default language is "eng".
if (language == NULL) language = "eng";
if (language == nullptr) language = "eng";
STRING datapath = data_size == 0 ? data : language;
// If the datapath, OcrEngineMode or the language have changed - start again.
// Note that the language_ field stores the last requested language that was
// initialized successfully, while tesseract_->lang stores the language
// actually used. They differ only if the requested language was NULL, in
// which case tesseract_->lang is set to the Tesseract default ("eng").
if (tesseract_ != NULL &&
(datapath_ == NULL || language_ == NULL ||
*datapath_ != datapath || last_oem_requested_ != oem ||
if (tesseract_ != nullptr &&
(datapath_ == nullptr || language_ == nullptr || *datapath_ != datapath ||
last_oem_requested_ != oem ||
(*language_ != language && tesseract_->lang != language))) {
delete tesseract_;
tesseract_ = NULL;
tesseract_ = nullptr;
}
// PERF_COUNT_SUB("delete tesseract_")
#ifdef USE_OPENCL

@@ -297,19 +314,25 @@ int TessBaseAPI::Init(const char* datapath, const char* language,
#endif
PERF_COUNT_SUB("OD::InitEnv()")
bool reset_classifier = true;
if (tesseract_ == NULL) {
if (tesseract_ == nullptr) {
reset_classifier = false;
tesseract_ = new Tesseract;
if (reader != nullptr) reader_ = reader;
TessdataManager mgr(reader_);
if (data_size != 0) {
mgr.LoadMemBuffer(language, data, data_size);
}
if (tesseract_->init_tesseract(
datapath, output_file_ != NULL ? output_file_->string() : NULL,
language, oem, configs, configs_size, vars_vec, vars_values,
set_only_non_debug_params) != 0) {
datapath.string(),
output_file_ != nullptr ? output_file_->string() : nullptr,
language, oem, configs, configs_size, vars_vec, vars_values,
set_only_non_debug_params, &mgr) != 0) {
return -1;
}
}
PERF_COUNT_SUB("update tesseract_")
// Update datapath and language requested for the last valid initialization.
if (datapath_ == NULL)
if (datapath_ == nullptr)
datapath_ = new STRING(datapath);
else
*datapath_ = datapath;

@@ -317,7 +340,7 @@ int TessBaseAPI::Init(const char* datapath, const char* language,
(strcmp(tesseract_->datadir.string(), "") != 0))
*datapath_ = tesseract_->datadir;

if (language_ == NULL)
if (language_ == nullptr)
language_ = new STRING(language);
else
*language_ = language;

@@ -421,7 +444,8 @@ int TessBaseAPI::InitLangMod(const char* datapath, const char* language) {
tesseract_ = new Tesseract;
else
ParamUtils::ResetToDefaults(tesseract_->params());
return tesseract_->init_tesseract_lm(datapath, NULL, language);
TessdataManager mgr;
return tesseract_->init_tesseract_lm(datapath, NULL, language, &mgr);
}

/**

@@ -431,7 +455,7 @@ int TessBaseAPI::InitLangMod(const char* datapath, const char* language) {
void TessBaseAPI::InitForAnalysePage() {
if (tesseract_ == NULL) {
tesseract_ = new Tesseract;
tesseract_->InitAdaptiveClassifier(false);
tesseract_->InitAdaptiveClassifier(nullptr);
}
}

@@ -2239,7 +2263,7 @@ int TessBaseAPI::FindLines() {
}
if (tesseract_ == NULL) {
tesseract_ = new Tesseract;
tesseract_->InitAdaptiveClassifier(false);
tesseract_->InitAdaptiveClassifier(nullptr);
}
if (tesseract_->pix_binary() == NULL)
Threshold(tesseract_->mutable_pix_binary());

@@ -2261,14 +2285,16 @@ int TessBaseAPI::FindLines() {

Tesseract* osd_tess = osd_tesseract_;
OSResults osr;
if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) && osd_tess == NULL) {
if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) &&
osd_tess == nullptr) {
if (strcmp(language_->string(), "osd") == 0) {
osd_tess = tesseract_;
} else {
osd_tesseract_ = new Tesseract;
if (osd_tesseract_->init_tesseract(
datapath_->string(), NULL, "osd", OEM_TESSERACT_ONLY,
NULL, 0, NULL, NULL, false) == 0) {
TessdataManager mgr(reader_);
if (osd_tesseract_->init_tesseract(datapath_->string(), nullptr, "osd",
OEM_TESSERACT_ONLY, nullptr, 0,
nullptr, nullptr, false, &mgr) == 0) {
osd_tess = osd_tesseract_;
osd_tesseract_->set_source_resolution(
thresholder_->GetSourceYResolution());

@@ -2276,7 +2302,7 @@ int TessBaseAPI::FindLines() {
tprintf("Warning: Auto orientation and script detection requested,"
" but osd language failed to load\n");
delete osd_tesseract_;
osd_tesseract_ = NULL;
osd_tesseract_ = nullptr;
}
}
}
@@ -29,14 +29,15 @@
// To avoid collision with other typenames include the ABSOLUTE MINIMUM
// complexity of includes here. Use forward declarations wherever possible
// and hide includes of complex types in baseapi.cpp.
#include "platform.h"
#include "apitypes.h"
#include "pageiterator.h"
#include "platform.h"
#include "publictypes.h"
#include "resultiterator.h"
#include "serialis.h"
#include "tesscallback.h"
#include "thresholder.h"
#include "unichar.h"
#include "tesscallback.h"
#include "publictypes.h"
#include "pageiterator.h"
#include "resultiterator.h"

template <typename T> class GenericVector;
class PAGE_RES;

@@ -237,6 +238,13 @@ class TESS_API TessBaseAPI {
int Init(const char* datapath, const char* language) {
return Init(datapath, language, OEM_DEFAULT, NULL, 0, NULL, NULL, false);
}
// In-memory version reads the traineddata file directly from the given
// data[data_size] array, and/or reads data via a FileReader.
int Init(const char* data, int data_size, const char* language,
OcrEngineMode mode, char** configs, int configs_size,
const GenericVector<STRING>* vars_vec,
const GenericVector<STRING>* vars_values,
bool set_only_non_debug_params, FileReader reader);

/**
* Returns the languages string used in the last valid initialization.

@@ -859,6 +867,7 @@ class TESS_API TessBaseAPI {
Tesseract* tesseract_; ///< The underlying data object.
Tesseract* osd_tesseract_; ///< For orientation & script detection.
EquationDetect* equ_detect_; ///<The equation detector.
FileReader reader_; ///< Reads files from any filesystem.
ImageThresholder* thresholder_; ///< Image thresholding module.
GenericVector<ParagraphModel *>* paragraph_models_;
BLOCK_LIST* block_list_; ///< The page layout.
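
The FileReader parameter above is the hook for reading traineddata from foreign file systems. A sketch of a custom reader passed to the new Init (editor's illustration, not part of this diff; the embedded-buffer globals are assumed to be supplied by the application, and the FileReader signature — bool(const STRING&, GenericVector<char>*) — is the one declared in serialis.h):

    #include <cstring>
    #include "baseapi.h"
    #include "genericvector.h"
    #include "strngs.h"

    extern const char* g_eng_traineddata;   // assumed: embedded model bytes
    extern int g_eng_traineddata_size;

    // Serves every traineddata request from the embedded buffer; a real reader
    // would dispatch on `filename`.
    static bool ReadEmbedded(const STRING& filename, GenericVector<char>* data) {
      data->resize_no_init(g_eng_traineddata_size);
      memcpy(&(*data)[0], g_eng_traineddata, g_eng_traineddata_size);
      return true;
    }

    // data_size = 0 keeps the datapath behaviour; `reader` replaces direct
    // FILE access when the traineddata is looked up.
    int InitWithReader(tesseract::TessBaseAPI* api) {
      return api->Init("tessdata", 0, "eng", tesseract::OEM_DEFAULT, nullptr, 0,
                       nullptr, nullptr, false, ReadEmbedded);
    }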
@ -92,8 +92,8 @@ bool Tesseract::init_tesseract_lang_data(
|
||||
const char *arg0, const char *textbase, const char *language,
|
||||
OcrEngineMode oem, char **configs, int configs_size,
|
||||
const GenericVector<STRING> *vars_vec,
|
||||
const GenericVector<STRING> *vars_values,
|
||||
bool set_only_non_debug_params) {
|
||||
const GenericVector<STRING> *vars_values, bool set_only_non_debug_params,
|
||||
TessdataManager *mgr) {
|
||||
// Set the basename, compute the data directory.
|
||||
main_setup(arg0, textbase);
|
||||
|
||||
@ -105,16 +105,28 @@ bool Tesseract::init_tesseract_lang_data(
|
||||
|
||||
// Initialize TessdataManager.
|
||||
STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
|
||||
if (!tessdata_manager.Init(tessdata_path.string(),
|
||||
tessdata_manager_debug_level)) {
|
||||
return false;
|
||||
if (!mgr->is_loaded() && !mgr->Init(tessdata_path.string())) {
|
||||
// Try without tessdata.
|
||||
m_data_sub_dir.set_value("");
|
||||
main_setup(arg0, textbase);
|
||||
language_data_path_prefix = datadir;
|
||||
language_data_path_prefix += lang;
|
||||
language_data_path_prefix += ".";
|
||||
tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
|
||||
if (!mgr->Init(tessdata_path.string())) {
|
||||
tprintf("Error opening data file %s\n", tessdata_path.string());
|
||||
tprintf(
|
||||
"Please make sure the TESSDATA_PREFIX environment variable is set"
|
||||
" to your \"tessdata\" directory.\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (oem == OEM_DEFAULT) {
|
||||
// Set the engine mode from availability, which can then be overidden by
|
||||
// the config file when we read it below.
|
||||
if (!tessdata_manager.IsLSTMAvailable()) {
|
||||
if (!mgr->IsLSTMAvailable()) {
|
||||
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
|
||||
} else if (!tessdata_manager.IsBaseAvailable()) {
|
||||
} else if (!mgr->IsBaseAvailable()) {
|
||||
tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
|
||||
} else {
|
||||
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED);
|
||||
@ -122,14 +134,10 @@ bool Tesseract::init_tesseract_lang_data(
|
||||
}
|
||||
|
||||
// If a language specific config file (lang.config) exists, load it in.
|
||||
if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) {
|
||||
ParamUtils::ReadParamsFromFp(
|
||||
tessdata_manager.GetDataFilePtr(),
|
||||
tessdata_manager.GetEndOffset(TESSDATA_LANG_CONFIG),
|
||||
SET_PARAM_CONSTRAINT_NONE, this->params());
|
||||
if (tessdata_manager_debug_level) {
|
||||
tprintf("Loaded language config file\n");
|
||||
}
|
||||
TFile fp;
|
||||
if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
|
||||
ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp,
|
||||
this->params());
|
||||
}
|
||||
|
||||
SetParamConstraint set_params_constraint = set_only_non_debug_params ?
|
||||
@ -159,10 +167,6 @@ bool Tesseract::init_tesseract_lang_data(
|
||||
if (params_file != NULL) {
|
||||
ParamUtils::PrintParams(params_file, this->params());
|
||||
fclose(params_file);
|
||||
if (tessdata_manager_debug_level > 0) {
|
||||
tprintf("Wrote parameters to %s\n",
|
||||
tessedit_write_params_to_file.string());
|
||||
}
|
||||
} else {
|
||||
tprintf("Failed to open %s for writing params.\n",
|
||||
tessedit_write_params_to_file.string());
|
||||
@ -171,17 +175,10 @@ bool Tesseract::init_tesseract_lang_data(
|
||||
|
||||
// Determine which ocr engine(s) should be loaded and used for recognition.
|
||||
if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
|
||||
if (tessdata_manager_debug_level) {
|
||||
tprintf("Loading Tesseract/LSTM with tessedit_ocr_engine_mode %d\n",
|
||||
static_cast<int>(tessedit_ocr_engine_mode));
|
||||
}
|
||||
|
||||
// If we are only loading the config file (and so not planning on doing any
|
||||
// recognition) then there's nothing else do here.
|
||||
if (tessedit_init_config_only) {
|
||||
if (tessdata_manager_debug_level) {
|
||||
tprintf("Returning after loading config file\n");
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -191,17 +188,14 @@ bool Tesseract::init_tesseract_lang_data(
|
||||
#ifndef ANDROID_BUILD
|
||||
if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
|
||||
tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
|
||||
if (tessdata_manager.swap()) {
|
||||
if (mgr->swap()) {
|
||||
tprintf("Error: LSTM requested on big-endian hardware!!\n");
|
||||
tprintf("Big-endian not yet supported! Loading tesseract.\n");
|
||||
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
|
||||
} else if (tessdata_manager.SeekToStart(TESSDATA_LSTM)) {
|
||||
} else if (mgr->GetComponent(TESSDATA_LSTM, &fp)) {
|
||||
lstm_recognizer_ = new LSTMRecognizer;
|
||||
TFile fp;
|
||||
fp.Open(tessdata_manager.GetDataFilePtr(), -1);
|
||||
ASSERT_HOST(lstm_recognizer_->DeSerialize(tessdata_manager.swap(), &fp));
|
||||
if (lstm_use_matrix)
|
||||
lstm_recognizer_->LoadDictionary(tessdata_path.string(), language);
|
||||
ASSERT_HOST(lstm_recognizer_->DeSerialize(mgr->swap(), &fp));
|
||||
if (lstm_use_matrix) lstm_recognizer_->LoadDictionary(language, mgr);
|
||||
} else {
|
||||
tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
|
||||
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
|
||||
@ -215,15 +209,14 @@ bool Tesseract::init_tesseract_lang_data(
|
||||
#ifndef ANDROID_BUILD
|
||||
unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
|
||||
#endif
|
||||
} else if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) ||
|
||||
!unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) {
|
||||
} else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) ||
|
||||
!unicharset.load_from_file(&fp, false)) {
|
||||
return false;
|
||||
}
|
||||
if (unicharset.size() > MAX_NUM_CLASSES) {
|
||||
tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
|
||||
return false;
|
||||
}
|
||||
if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n");
|
||||
right_to_left_ = unicharset.major_right_to_left();
|
||||
|
||||
// Setup initial unichar ambigs table and read universal ambigs.
|
||||
@ -232,16 +225,10 @@ bool Tesseract::init_tesseract_lang_data(
|
||||
unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
|
||||
unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
|
||||
|
||||
if (!tessedit_ambigs_training &&
|
||||
tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) {
|
||||
TFile ambigs_file;
|
||||
ambigs_file.Open(tessdata_manager.GetDataFilePtr(),
|
||||
tessdata_manager.GetEndOffset(TESSDATA_AMBIGS) + 1);
|
||||
unichar_ambigs.LoadUnicharAmbigs(
|
||||
encoder_unicharset,
|
||||
&ambigs_file,
|
||||
ambigs_debug_level, use_ambigs_for_adaption, &unicharset);
|
||||
if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
|
||||
if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {
|
||||
unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp,
|
||||
ambigs_debug_level,
|
||||
use_ambigs_for_adaption, &unicharset);
|
||||
}
|
||||
// Init ParamsModel.
|
||||
// Load pass1 and pass2 weights (for now these two sets are the same, but in
|
||||
@ -250,15 +237,12 @@ bool Tesseract::init_tesseract_lang_data(
|
||||
p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {
|
||||
language_model_->getParamsModel().SetPass(
|
||||
static_cast<ParamsModel::PassEnum>(p));
|
||||
if (tessdata_manager.SeekToStart(TESSDATA_PARAMS_MODEL)) {
|
||||
if (!language_model_->getParamsModel().LoadFromFp(
|
||||
lang.string(), tessdata_manager.GetDataFilePtr(),
|
||||
tessdata_manager.GetEndOffset(TESSDATA_PARAMS_MODEL))) {
|
||||
if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
|
||||
if (!language_model_->getParamsModel().LoadFromFp(lang.string(), &fp)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (tessdata_manager_debug_level) language_model_->getParamsModel().Print();
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -303,8 +287,6 @@ void Tesseract::ParseLanguageString(const char* lang_str,
|
||||
remains = next;
|
||||
// Check whether lang_code is already in the target vector and add.
|
||||
if (!IsStrInList(lang_code, *target)) {
|
||||
if (tessdata_manager_debug_level)
|
||||
tprintf("Adding language '%s' to list\n", lang_code.string());
|
||||
target->push_back(lang_code);
|
||||
}
|
||||
}
|
||||
@ -314,12 +296,13 @@ void Tesseract::ParseLanguageString(const char* lang_str,
|
||||
// string and recursively any additional languages required by any language
|
||||
// traineddata file (via tessedit_load_sublangs in its config) that is loaded.
|
||||
// See init_tesseract_internal for args.
|
||||
int Tesseract::init_tesseract(
|
||||
const char *arg0, const char *textbase, const char *language,
|
||||
OcrEngineMode oem, char **configs, int configs_size,
|
||||
const GenericVector<STRING> *vars_vec,
|
||||
const GenericVector<STRING> *vars_values,
|
||||
bool set_only_non_debug_params) {
|
||||
int Tesseract::init_tesseract(const char *arg0, const char *textbase,
|
||||
const char *language, OcrEngineMode oem,
|
||||
char **configs, int configs_size,
|
||||
const GenericVector<STRING> *vars_vec,
|
||||
const GenericVector<STRING> *vars_values,
|
||||
bool set_only_non_debug_params,
|
||||
TessdataManager *mgr) {
|
||||
GenericVector<STRING> langs_to_load;
|
||||
GenericVector<STRING> langs_not_to_load;
|
||||
ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
|
||||
@ -341,15 +324,15 @@ int Tesseract::init_tesseract(
|
||||
}
|
||||
|
||||
int result = tess_to_init->init_tesseract_internal(
|
||||
arg0, textbase, lang_str, oem, configs, configs_size,
|
||||
vars_vec, vars_values, set_only_non_debug_params);
|
||||
arg0, textbase, lang_str, oem, configs, configs_size, vars_vec,
|
||||
vars_values, set_only_non_debug_params, mgr);
|
||||
// Forget that language, but keep any reader we were given.
|
||||
mgr->Clear();
|
||||
|
||||
if (!loaded_primary) {
|
||||
if (result < 0) {
|
||||
tprintf("Failed loading language '%s'\n", lang_str);
|
||||
} else {
|
||||
if (tessdata_manager_debug_level)
|
||||
tprintf("Loaded language '%s' as main language\n", lang_str);
|
||||
ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
|
||||
&langs_to_load, &langs_not_to_load);
|
||||
loaded_primary = true;
|
||||
@ -359,8 +342,6 @@ int Tesseract::init_tesseract(
|
||||
tprintf("Failed loading language '%s'\n", lang_str);
|
||||
delete tess_to_init;
|
||||
} else {
|
||||
if (tessdata_manager_debug_level)
|
||||
tprintf("Loaded language '%s' as secondary language\n", lang_str);
|
||||
sub_langs_.push_back(tess_to_init);
|
||||
// Add any languages that this language requires
|
||||
ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
|
||||
@ -385,16 +366,11 @@ int Tesseract::init_tesseract(
|
||||
this->language_model_->getParamsModel());
|
||||
}
|
||||
tprintf("Using params model of the primary language\n");
|
||||
if (tessdata_manager_debug_level) {
|
||||
this->language_model_->getParamsModel().Print();
|
||||
}
|
||||
} else {
|
||||
this->language_model_->getParamsModel().Clear();
|
||||
for (int s = 0; s < sub_langs_.size(); ++s) {
|
||||
sub_langs_[s]->language_model_->getParamsModel().Clear();
|
||||
}
|
||||
if (tessdata_manager_debug_level)
|
||||
tprintf("Using default language params\n");
|
||||
}
|
||||
}
|
||||
|
||||
@ -418,26 +394,26 @@ int Tesseract::init_tesseract(
|
||||
// in vars_vec.
|
||||
// If set_only_init_params is true, then only the initialization variables
|
||||
// will be set.
|
||||
int Tesseract::init_tesseract_internal(
|
||||
const char *arg0, const char *textbase, const char *language,
|
||||
OcrEngineMode oem, char **configs, int configs_size,
|
||||
const GenericVector<STRING> *vars_vec,
|
||||
const GenericVector<STRING> *vars_values,
|
||||
bool set_only_non_debug_params) {
|
||||
int Tesseract::init_tesseract_internal(const char *arg0, const char *textbase,
|
||||
const char *language, OcrEngineMode oem,
|
||||
char **configs, int configs_size,
|
||||
const GenericVector<STRING> *vars_vec,
|
||||
const GenericVector<STRING> *vars_values,
|
||||
bool set_only_non_debug_params,
|
||||
TessdataManager *mgr) {
|
||||
if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
|
||||
configs_size, vars_vec, vars_values,
|
||||
set_only_non_debug_params)) {
|
||||
set_only_non_debug_params, mgr)) {
|
||||
return -1;
|
||||
}
|
||||
if (tessedit_init_config_only) {
|
||||
tessdata_manager.End();
|
||||
return 0;
|
||||
}
|
||||
// If only LSTM will be used, skip loading Tesseract classifier's
|
||||
// pre-trained templates and dictionary.
|
||||
bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY;
|
||||
program_editup(textbase, init_tesseract, init_tesseract);
|
||||
tessdata_manager.End();
|
||||
program_editup(textbase, init_tesseract ? mgr : nullptr,
|
||||
init_tesseract ? mgr : nullptr);
|
||||
return 0; //Normal exit
|
||||
}
|
||||
|
||||
@ -482,16 +458,14 @@ void Tesseract::SetupUniversalFontIds() {
|
||||
}
|
||||
|
||||
// init the LM component
|
||||
int Tesseract::init_tesseract_lm(const char *arg0,
|
||||
const char *textbase,
|
||||
const char *language) {
|
||||
int Tesseract::init_tesseract_lm(const char *arg0, const char *textbase,
|
||||
const char *language, TessdataManager *mgr) {
|
||||
if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
|
||||
NULL, 0, NULL, NULL, false))
|
||||
NULL, 0, NULL, NULL, false, mgr))
|
||||
return -1;
|
||||
getDict().SetupForLoad(Dict::GlobalDawgCache());
|
||||
getDict().Load(tessdata_manager.GetDataFileName().string(), lang);
|
||||
getDict().Load(lang, mgr);
|
||||
getDict().FinishLoad();
|
||||
tessdata_manager.End();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@@ -466,10 +466,6 @@ Tesseract::Tesseract()
STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()),
BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word",
this->params()),
INT_MEMBER(tessdata_manager_debug_level, 0,
"Debug level for"
" TessdataManager functions.",
this->params()),
STRING_MEMBER(tessedit_load_sublangs, "",
"List of languages to load with this one", this->params()),
BOOL_MEMBER(tessedit_use_primary_params_model, false,
@@ -496,20 +496,17 @@ class Tesseract : public Wordrec {
// string and recursively any additional languages required by any language
// traineddata file (via tessedit_load_sublangs in its config) that is loaded.
// See init_tesseract_internal for args.
int init_tesseract(const char *arg0,
const char *textbase,
const char *language,
OcrEngineMode oem,
char **configs,
int configs_size,
const GenericVector<STRING> *vars_vec,
const GenericVector<STRING> *vars_values,
bool set_only_init_params);
int init_tesseract(const char* arg0, const char* textbase,
const char* language, OcrEngineMode oem, char** configs,
int configs_size, const GenericVector<STRING>* vars_vec,
const GenericVector<STRING>* vars_values,
bool set_only_init_params, TessdataManager* mgr);
int init_tesseract(const char *datapath,
const char *language,
OcrEngineMode oem) {
return init_tesseract(datapath, NULL, language, oem,
NULL, 0, NULL, NULL, false);
TessdataManager mgr;
return init_tesseract(datapath, NULL, language, oem, NULL, 0, NULL, NULL,
false, &mgr);
}
// Common initialization for a single language.
// arg0 is the datapath for the tessdata directory, which could be the

@@ -527,36 +524,30 @@ class Tesseract : public Wordrec {
// in vars_vec.
// If set_only_init_params is true, then only the initialization variables
// will be set.
int init_tesseract_internal(const char *arg0,
const char *textbase,
const char *language,
OcrEngineMode oem,
char **configs,
int configs_size,
const GenericVector<STRING> *vars_vec,
const GenericVector<STRING> *vars_values,
bool set_only_init_params);
int init_tesseract_internal(const char* arg0, const char* textbase,
const char* language, OcrEngineMode oem,
char** configs, int configs_size,
const GenericVector<STRING>* vars_vec,
const GenericVector<STRING>* vars_values,
bool set_only_init_params, TessdataManager* mgr);

// Set the universal_id member of each font to be unique among all
// instances of the same font loaded.
void SetupUniversalFontIds();

int init_tesseract_lm(const char *arg0,
const char *textbase,
const char *language);
int init_tesseract_lm(const char* arg0, const char* textbase,
const char* language, TessdataManager* mgr);

void recognize_page(STRING& image_name);
void end_tesseract();

bool init_tesseract_lang_data(const char *arg0,
const char *textbase,
const char *language,
OcrEngineMode oem,
char **configs,
int configs_size,
const GenericVector<STRING> *vars_vec,
const GenericVector<STRING> *vars_values,
bool set_only_init_params);
bool init_tesseract_lang_data(const char* arg0, const char* textbase,
const char* language, OcrEngineMode oem,
char** configs, int configs_size,
const GenericVector<STRING>* vars_vec,
const GenericVector<STRING>* vars_values,
bool set_only_init_params,
TessdataManager* mgr);

void ParseLanguageString(const char* lang_str,
GenericVector<STRING>* to_load,

@@ -1074,8 +1065,6 @@ class Tesseract : public Wordrec {
BOOL_VAR_H(interactive_display_mode, false, "Run interactively?");
STRING_VAR_H(file_type, ".tif", "Filename extension");
BOOL_VAR_H(tessedit_override_permuter, true, "According to dict_word");
INT_VAR_H(tessdata_manager_debug_level, 0,
"Debug level for TessdataManager functions.");
STRING_VAR_H(tessedit_load_sublangs, "",
"List of languages to load with this one");
BOOL_VAR_H(tessedit_use_primary_params_model, false,
@@ -31,7 +31,7 @@ bool FontInfo::Serialize(FILE* fp) const {
}
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool FontInfo::DeSerialize(bool swap, FILE* fp) {
bool FontInfo::DeSerialize(bool swap, TFile* fp) {
if (!read_info(fp, this, swap)) return false;
if (!read_spacing_info(fp, this, swap)) return false;
return true;

@@ -51,7 +51,7 @@ bool FontInfoTable::Serialize(FILE* fp) const {
}
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool FontInfoTable::DeSerialize(bool swap, FILE* fp) {
bool FontInfoTable::DeSerialize(bool swap, TFile* fp) {
truncate(0);
return this->DeSerializeClasses(swap, fp);
}

@@ -149,19 +149,15 @@ void FontSetDeleteCallback(FontSet fs) {

/*---------------------------------------------------------------------------*/
// Callbacks used by UnicityTable to read/write FontInfo/FontSet structures.
bool read_info(FILE* f, FontInfo* fi, bool swap) {
bool read_info(TFile* f, FontInfo* fi, bool swap) {
inT32 size;
if (fread(&size, sizeof(size), 1, f) != 1) return false;
if (swap)
Reverse32(&size);
if (f->FReadEndian(&size, sizeof(size), 1, swap) != 1) return false;
char* font_name = new char[size + 1];
fi->name = font_name;
if (static_cast<int>(fread(font_name, sizeof(*font_name), size, f)) != size)
return false;
if (f->FRead(font_name, sizeof(*font_name), size) != size) return false;
font_name[size] = '\0';
if (fread(&fi->properties, sizeof(fi->properties), 1, f) != 1) return false;
if (swap)
Reverse32(&fi->properties);
if (f->FReadEndian(&fi->properties, sizeof(fi->properties), 1, swap) != 1)
return false;
return true;
}

@@ -174,26 +170,22 @@ bool write_info(FILE* f, const FontInfo& fi) {
return true;
}

bool read_spacing_info(FILE *f, FontInfo* fi, bool swap) {
bool read_spacing_info(TFile* f, FontInfo* fi, bool swap) {
inT32 vec_size, kern_size;
if (fread(&vec_size, sizeof(vec_size), 1, f) != 1) return false;
if (swap) Reverse32(&vec_size);
if (f->FReadEndian(&vec_size, sizeof(vec_size), 1, swap) != 1) return false;
ASSERT_HOST(vec_size >= 0);
if (vec_size == 0) return true;
fi->init_spacing(vec_size);
for (int i = 0; i < vec_size; ++i) {
FontSpacingInfo *fs = new FontSpacingInfo();
if (fread(&fs->x_gap_before, sizeof(fs->x_gap_before), 1, f) != 1 ||
fread(&fs->x_gap_after, sizeof(fs->x_gap_after), 1, f) != 1 ||
fread(&kern_size, sizeof(kern_size), 1, f) != 1) {
if (f->FReadEndian(&fs->x_gap_before, sizeof(fs->x_gap_before), 1, swap) !=
1 ||
f->FReadEndian(&fs->x_gap_after, sizeof(fs->x_gap_after), 1, swap) !=
1 ||
f->FReadEndian(&kern_size, sizeof(kern_size), 1, swap) != 1) {
delete fs;
return false;
}
if (swap) {
ReverseN(&(fs->x_gap_before), sizeof(fs->x_gap_before));
ReverseN(&(fs->x_gap_after), sizeof(fs->x_gap_after));
Reverse32(&kern_size);
}
if (kern_size < 0) { // indication of a NULL entry in fi->spacing_vec
delete fs;
continue;

@@ -237,16 +229,12 @@ bool write_spacing_info(FILE* f, const FontInfo& fi) {
return true;
}

bool read_set(FILE* f, FontSet* fs, bool swap) {
if (fread(&fs->size, sizeof(fs->size), 1, f) != 1) return false;
if (swap)
Reverse32(&fs->size);
bool read_set(TFile* f, FontSet* fs, bool swap) {
if (f->FReadEndian(&fs->size, sizeof(fs->size), 1, swap) != 1) return false;
fs->configs = new int[fs->size];
for (int i = 0; i < fs->size; ++i) {
if (fread(&fs->configs[i], sizeof(fs->configs[i]), 1, f) != 1) return false;
if (swap)
Reverse32(&fs->configs[i]);
}
if (f->FReadEndian(fs->configs, sizeof(fs->configs[0]), fs->size, swap) !=
fs->size)
return false;
return true;
}
@@ -67,7 +67,7 @@ struct FontInfo {
bool Serialize(FILE* fp) const;
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool DeSerialize(bool swap, FILE* fp);
bool DeSerialize(bool swap, TFile* fp);

// Reserves unicharset_size spots in spacing_vec.
void init_spacing(int unicharset_size) {

@@ -152,7 +152,7 @@ class FontInfoTable : public GenericVector<FontInfo> {
bool Serialize(FILE* fp) const;
// Reads from the given file. Returns false in case of error.
// If swap is true, assumes a big/little-endian swap is needed.
bool DeSerialize(bool swap, FILE* fp);
bool DeSerialize(bool swap, TFile* fp);

// Returns true if the given set of fonts includes one with the same
// properties as font_id.

@@ -177,11 +177,11 @@ void FontInfoDeleteCallback(FontInfo f);
void FontSetDeleteCallback(FontSet fs);

// Callbacks used by UnicityTable to read/write FontInfo/FontSet structures.
bool read_info(FILE* f, FontInfo* fi, bool swap);
bool read_info(TFile* f, FontInfo* fi, bool swap);
bool write_info(FILE* f, const FontInfo& fi);
bool read_spacing_info(FILE *f, FontInfo* fi, bool swap);
bool read_spacing_info(TFile* f, FontInfo* fi, bool swap);
bool write_spacing_info(FILE* f, const FontInfo& fi);
bool read_set(FILE* f, FontSet* fs, bool swap);
bool read_set(TFile* f, FontSet* fs, bool swap);
bool write_set(FILE* f, const FontSet& fs);

} // namespace tesseract.
@@ -66,7 +66,6 @@ class CCUtil {
STRING imagebasename; // name of image
STRING lang;
STRING language_data_path_prefix;
TessdataManager tessdata_manager;
UNICHARSET unicharset;
UnicharAmbigs unichar_ambigs;
STRING imagefile; // image file name
@@ -162,7 +162,9 @@ class GenericVector {
// Returns false on error or if the callback returns false.
// DEPRECATED. Use [De]Serialize[Classes] instead.
bool write(FILE* f, TessResultCallback2<bool, FILE*, T const &>* cb) const;
bool read(FILE* f, TessResultCallback3<bool, FILE*, T*, bool>* cb, bool swap);
bool read(tesseract::TFile* f,
TessResultCallback3<bool, tesseract::TFile*, T*, bool>* cb,
bool swap);
// Writes a vector of simple types to the given file. Assumes that bitwise
// read/write of T will work. Returns false in case of error.
// TODO(rays) Change all callers to use TFile and remove deprecated methods.

@@ -885,15 +887,14 @@ bool GenericVector<T>::write(
}

template <typename T>
bool GenericVector<T>::read(FILE* f,
TessResultCallback3<bool, FILE*, T*, bool>* cb,
bool swap) {
bool GenericVector<T>::read(
tesseract::TFile* f,
TessResultCallback3<bool, tesseract::TFile*, T*, bool>* cb, bool swap) {
inT32 reserved;
if (fread(&reserved, sizeof(reserved), 1, f) != 1) return false;
if (swap) Reverse32(&reserved);
if (f->FReadEndian(&reserved, sizeof(reserved), 1, swap) != 1) return false;
reserve(reserved);
if (fread(&size_used_, sizeof(size_used_), 1, f) != 1) return false;
if (swap) Reverse32(&size_used_);
if (f->FReadEndian(&size_used_, sizeof(size_used_), 1, swap) != 1)
return false;
if (cb != NULL) {
for (int i = 0; i < size_used_; ++i) {
if (!cb->Run(f, data_ + i, swap)) {

@@ -903,11 +904,8 @@ bool GenericVector<T>::read(FILE* f,
}
delete cb;
} else {
if (fread(data_, sizeof(T), size_used_, f) != size_used_) return false;
if (swap) {
for (int i = 0; i < size_used_; ++i)
ReverseN(&data_[i], sizeof(T));
}
if (f->FReadEndian(data_, sizeof(T), size_used_, swap) != size_used_)
return false;
}
return true;
}
@@ -55,7 +55,7 @@ void CCUtil::main_setup(const char *argv0, const char *basename) {

char *tessdata_prefix = getenv("TESSDATA_PREFIX");

if (argv0 != NULL) {
if (argv0 != NULL && *argv0 != '\0') {
/* Use tessdata prefix from the command line. */
datadir = argv0;
} else if (tessdata_prefix) {
@@ -41,8 +41,6 @@ bool ParamUtils::ReadParamsFile(const char *file,
SetParamConstraint constraint,
ParamsVectors *member_params) {
inT16 nameoffset; // offset for real name
FILE *fp; // file pointer
// iterators

if (*file == PLUS) {
nameoffset = 1;

@@ -52,26 +50,22 @@ bool ParamUtils::ReadParamsFile(const char *file,
nameoffset = 0;
}

fp = fopen(file + nameoffset, "rb");
if (fp == NULL) {
TFile fp;
if (!fp.Open(file + nameoffset, nullptr)) {
tprintf("read_params_file: Can't open %s\n", file + nameoffset);
return true;
}
const bool anyerr = ReadParamsFromFp(fp, -1, constraint, member_params);
fclose(fp);
return anyerr;
return ReadParamsFromFp(constraint, &fp, member_params);
}

bool ParamUtils::ReadParamsFromFp(FILE *fp, inT64 end_offset,
SetParamConstraint constraint,
bool ParamUtils::ReadParamsFromFp(SetParamConstraint constraint, TFile *fp,
ParamsVectors *member_params) {
char line[MAX_PATH]; // input line
bool anyerr = false; // true if any error
bool foundit; // found parameter
char *valptr; // value field

while ((end_offset < 0 || ftell(fp) < end_offset) &&
fgets(line, MAX_PATH, fp)) {
while (fp->FGets(line, MAX_PATH) != nullptr) {
if (line[0] != '\r' && line[0] != '\n' && line[0] != '#') {
chomp_string(line); // remove newline
for (valptr = line; *valptr && *valptr != ' ' && *valptr != '\t';
@@ -60,9 +60,8 @@ class ParamUtils {
SetParamConstraint constraint,
ParamsVectors *member_params);

// Read parameters from the given file pointer (stop at end_offset).
static bool ReadParamsFromFp(FILE *fp, inT64 end_offset,
SetParamConstraint constraint,
// Read parameters from the given file pointer.
static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp,
ParamsVectors *member_params);

// Set a parameters to have the given value.
@@ -88,6 +88,17 @@ char* TFile::FGets(char* buffer, int buffer_size) {
return size > 0 ? buffer : NULL;
}

int TFile::FReadEndian(void* buffer, int size, int count, bool swap) {
int num_read = FRead(buffer, size, count);
if (swap) {
char* char_buffer = reinterpret_cast<char*>(buffer);
for (int i = 0; i < num_read; ++i, char_buffer += size) {
ReverseN(char_buffer, size);
}
}
return num_read;
}

int TFile::FRead(void* buffer, int size, int count) {
ASSERT_HOST(!is_writing_);
int required_size = size * count;
@@ -67,6 +67,10 @@ class TFile {
// the line is longer. Does nothing if buffer_size <= 0.
// To use fscanf use FGets and sscanf.
char* FGets(char* buffer, int buffer_size);
// Replicates fread, followed by a swap of the bytes if needed, returning the
// number of items read. If swap is true then the count items will each have
// size bytes reversed.
int FReadEndian(void* buffer, int size, int count, bool swap);
// Replicates fread, returning the number of items read.
int FRead(void* buffer, int size, int count);
// Resets the TFile as if it has been Opened, but nothing read.
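
For reference, a tiny sketch of how callers use the new FReadEndian (editor's illustration; the ReadVersionField helper and its field are hypothetical):

    #include "host.h"      // inT32
    #include "serialis.h"

    // Reads one 4-byte field, reversing its bytes when the file was written on
    // hardware of the opposite endianness (swap as reported by the manager).
    bool ReadVersionField(tesseract::TFile* fp, bool swap, inT32* version) {
      return fp->FReadEndian(version, sizeof(*version), 1, swap) == 1;
    }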
@ -33,206 +33,192 @@
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
bool TessdataManager::Init(const char *data_file_name, int debug_level) {
|
||||
int i;
|
||||
debug_level_ = debug_level;
|
||||
// Lazily loads from the the given filename. Won't actually read the file
|
||||
// until it needs it.
|
||||
void TessdataManager::LoadFileLater(const char *data_file_name) {
|
||||
Clear();
|
||||
data_file_name_ = data_file_name;
|
||||
data_file_ = fopen(data_file_name, "rb");
|
||||
if (data_file_ == NULL) {
|
||||
tprintf("Error opening data file %s\n", data_file_name);
|
||||
tprintf("Please make sure the TESSDATA_PREFIX environment variable is set "
|
||||
"to the parent directory of your \"tessdata\" directory.\n");
|
||||
}
|
||||
|
||||
bool TessdataManager::Init(const char *data_file_name) {
|
||||
GenericVector<char> data;
|
||||
bool result = true;
|
||||
if (reader_ == nullptr) {
|
||||
if (!LoadDataFromFile(data_file_name, &data)) return false;
|
||||
} else {
|
||||
if (!(*reader_)(data_file_name, &data)) return false;
|
||||
}
|
||||
return LoadMemBuffer(data_file_name, &data[0], data.size());
|
||||
}
|
||||
|
||||
// Loads from the given memory buffer as if a file.
|
||||
bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
|
||||
int size) {
|
||||
data_file_name_ = name;
|
||||
TFile fp;
|
||||
fp.Open(data, size);
|
||||
inT32 num_entries = TESSDATA_NUM_ENTRIES;
|
||||
if (fp.FRead(&num_entries, sizeof(num_entries), 1) != 1) return false;
|
||||
swap_ = num_entries > kMaxNumTessdataEntries || num_entries < 0;
|
||||
if (swap_) ReverseN(&num_entries, sizeof(num_entries));
|
||||
GenericVector<inT64> offset_table;
|
||||
offset_table.init_to_size(num_entries, -1);
|
||||
if (fp.FReadEndian(&offset_table[0], sizeof(offset_table[0]), num_entries,
|
||||
swap_) != num_entries)
|
||||
return false;
|
||||
}
|
||||
fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_);
|
||||
swap_ = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
|
||||
if (swap_) {
|
||||
ReverseN(&actual_tessdata_num_entries_,
|
||||
sizeof(actual_tessdata_num_entries_));
|
||||
}
|
||||
if (actual_tessdata_num_entries_ > TESSDATA_NUM_ENTRIES) {
|
||||
// For forward compatibility, truncate to the number we can handle.
|
||||
actual_tessdata_num_entries_ = TESSDATA_NUM_ENTRIES;
|
||||
}
|
||||
fread(offset_table_, sizeof(inT64),
|
||||
actual_tessdata_num_entries_, data_file_);
|
||||
if (swap_) {
|
||||
for (i = 0 ; i < actual_tessdata_num_entries_; ++i) {
|
||||
ReverseN(&offset_table_[i], sizeof(offset_table_[i]));
|
||||
}
|
||||
}
|
||||
if (debug_level_) {
|
||||
tprintf("TessdataManager loaded %d types of tesseract data files.\n",
|
||||
actual_tessdata_num_entries_);
|
||||
for (i = 0; i < actual_tessdata_num_entries_; ++i) {
|
||||
tprintf("Offset for type %d is %lld\n", i, offset_table_[i]);
|
||||
for (int i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
if (offset_table[i] >= 0) {
|
||||
inT64 entry_size = size - offset_table[i];
|
||||
int j = i + 1;
|
||||
while (j < num_entries && offset_table[j] == -1) ++j;
|
||||
if (j < num_entries) entry_size = offset_table[j] - offset_table[i];
|
||||
entries_[i].init_to_size(entry_size, 0);
|
||||
if (fp.FRead(&entries_[i][0], 1, entry_size) != entry_size) return false;
|
||||
}
|
||||
}
|
||||
is_loaded_ = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
void TessdataManager::CopyFile(FILE *input_file, FILE *output_file,
|
||||
bool newline_end, inT64 num_bytes_to_copy) {
|
||||
if (num_bytes_to_copy == 0) return;
|
||||
int buffer_size = 1024;
|
||||
if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) {
|
||||
buffer_size = num_bytes_to_copy;
|
||||
}
|
||||
inT64 num_bytes_copied = 0;
|
||||
char *chunk = new char[buffer_size];
|
||||
int bytes_read;
|
||||
char last_char = 0x0;
|
||||
while ((bytes_read = fread(chunk, sizeof(char),
|
||||
buffer_size, input_file))) {
|
||||
fwrite(chunk, sizeof(char), bytes_read, output_file);
|
||||
last_char = chunk[bytes_read-1];
|
||||
if (num_bytes_to_copy > 0) {
|
||||
num_bytes_copied += bytes_read;
|
||||
if (num_bytes_copied == num_bytes_to_copy) break;
|
||||
if (num_bytes_copied + buffer_size > num_bytes_to_copy) {
|
||||
buffer_size = num_bytes_to_copy - num_bytes_copied;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (newline_end) ASSERT_HOST(last_char == '\n');
|
||||
delete[] chunk;
|
||||
// Overwrites a single entry of the given type.
|
||||
void TessdataManager::OverwriteEntry(TessdataType type, const char *data,
|
||||
int size) {
|
||||
is_loaded_ = true;
|
||||
entries_[type].init_to_size(size, 0);
|
||||
memcpy(&entries_[type][0], data, size);
|
||||
}
|
||||
|
||||
bool TessdataManager::WriteMetadata(inT64 *offset_table,
|
||||
const char * language_data_path_prefix,
|
||||
FILE *output_file) {
|
||||
inT32 num_entries = TESSDATA_NUM_ENTRIES;
|
||||
bool result = true;
|
||||
if (fseek(output_file, 0, SEEK_SET) != 0 ||
|
||||
fwrite(&num_entries, sizeof(inT32), 1, output_file) != 1 ||
|
||||
fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES,
|
||||
output_file) != TESSDATA_NUM_ENTRIES) {
|
||||
fclose(output_file);
|
||||
result = false;
|
||||
tprintf("WriteMetadata failed in TessdataManager!\n");
|
||||
} else if (fclose(output_file)) {
|
||||
result = false;
|
||||
tprintf("WriteMetadata failed to close file!\n");
|
||||
} else {
|
||||
tprintf("TessdataManager combined tesseract data files.\n");
|
||||
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
tprintf("Offset for type %2d (%s%-22s) is %lld\n", i,
|
||||
language_data_path_prefix, kTessdataFileSuffixes[i],
|
||||
offset_table[i]);
|
||||
// Saves to the given filename.
|
||||
bool TessdataManager::SaveFile(const STRING &filename,
|
||||
FileWriter writer) const {
|
||||
ASSERT_HOST(is_loaded_);
|
||||
GenericVector<char> data;
|
||||
Serialize(&data);
|
||||
if (writer == nullptr)
|
||||
return SaveDataToFile(data, filename);
|
||||
else
|
||||
return (*writer)(data, filename);
|
||||
}
|
||||
|
||||
// Serializes to the given vector.
|
||||
void TessdataManager::Serialize(GenericVector<char> *data) const {
|
||||
ASSERT_HOST(is_loaded_);
|
||||
// Compute the offset_table and total size.
|
||||
inT64 offset_table[TESSDATA_NUM_ENTRIES];
|
||||
inT64 offset = sizeof(inT32) + sizeof(offset_table);
|
||||
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
if (entries_[i].empty()) {
|
||||
offset_table[i] = -1;
|
||||
} else {
|
||||
offset_table[i] = offset;
|
||||
offset += entries_[i].size();
|
||||
}
|
||||
}
|
||||
return result;
|
||||
data->init_to_size(offset, 0);
|
||||
inT32 num_entries = TESSDATA_NUM_ENTRIES;
|
||||
TFile fp;
|
||||
fp.OpenWrite(data);
|
||||
fp.FWrite(&num_entries, sizeof(num_entries), 1);
|
||||
fp.FWrite(offset_table, sizeof(offset_table), 1);
|
||||
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
if (!entries_[i].empty()) {
|
||||
fp.FWrite(&entries_[i][0], entries_[i].size(), 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Resets to the initial state, keeping the reader.
|
||||
void TessdataManager::Clear() {
|
||||
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
entries_[i].clear();
|
||||
}
|
||||
is_loaded_ = false;
|
||||
}
|
||||
|
||||
// Prints a directory of contents.
|
||||
void TessdataManager::Directory() const {
|
||||
int offset = TESSDATA_NUM_ENTRIES * sizeof(inT64);
|
||||
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
if (!entries_[i].empty()) {
|
||||
tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i],
|
||||
entries_[i].size(), offset);
|
||||
offset += entries_[i].size();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Opens the given TFile pointer to the given component type.
|
||||
// Returns false in case of failure.
|
||||
bool TessdataManager::GetComponent(TessdataType type, TFile *fp) {
|
||||
if (!is_loaded_ && !Init(data_file_name_.string())) return false;
|
||||
if (entries_[type].empty()) return false;
|
||||
fp->Open(&entries_[type][0], entries_[type].size());
|
||||
return true;
|
||||
}
|
||||
|
||||
bool TessdataManager::CombineDataFiles(
|
||||
const char *language_data_path_prefix,
|
||||
const char *output_filename) {
|
||||
int i;
|
||||
inT64 offset_table[TESSDATA_NUM_ENTRIES];
|
||||
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1;
|
||||
FILE *output_file = fopen(output_filename, "wb");
|
||||
if (output_file == NULL) {
|
||||
tprintf("Error opening %s for writing\n", output_filename);
|
||||
return false;
|
||||
}
|
||||
// Leave some space for recording the offset_table.
|
||||
if (fseek(output_file,
|
||||
sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) {
|
||||
tprintf("Error seeking %s\n", output_filename);
|
||||
fclose(output_file);
|
||||
return false;
|
||||
}
|
||||
|
||||
TessdataType type = TESSDATA_NUM_ENTRIES;
|
||||
bool text_file = false;
|
||||
FILE *file_ptr[TESSDATA_NUM_ENTRIES];
|
||||
|
||||
// Load individual tessdata components from files.
|
||||
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
ASSERT_HOST(TessdataTypeFromFileSuffix(
|
||||
kTessdataFileSuffixes[i], &type, &text_file));
|
||||
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
TessdataType type;
|
||||
ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type));
|
||||
STRING filename = language_data_path_prefix;
|
||||
filename += kTessdataFileSuffixes[i];
|
||||
file_ptr[i] = fopen(filename.string(), "rb");
|
||||
if (file_ptr[i] != NULL) {
|
||||
offset_table[type] = ftell(output_file);
|
||||
CopyFile(file_ptr[i], output_file, text_file, -1);
|
||||
fclose(file_ptr[i]);
|
||||
FILE *fp = fopen(filename.string(), "rb");
|
||||
if (fp != nullptr) {
|
||||
fclose(fp);
|
||||
if (!LoadDataFromFile(filename, &entries_[type])) {
|
||||
tprintf("Load of file %s failed!\n", filename.string());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
is_loaded_ = true;
|
||||
|
||||
// Make sure that the required components are present.
|
||||
if (!IncludesBaseComponents(offset_table) &&
|
||||
!IncludesLSTMComponents(offset_table)) {
|
||||
if (!IsBaseAvailable() && !IsLSTMAvailable()) {
|
||||
tprintf(
|
||||
"Error: traineddata file must contain at least (a unicharset file"
|
||||
"and inttemp) OR an lstm file.\n");
|
||||
fclose(output_file);
|
||||
return false;
|
||||
}
|
||||
return WriteMetadata(offset_table, language_data_path_prefix, output_file);
|
||||
// Write updated data to the output traineddata file.
|
||||
return SaveFile(output_filename, nullptr);
|
||||
}
|
||||
|
||||
bool TessdataManager::OverwriteComponents(
|
||||
const char *new_traineddata_filename,
|
||||
char **component_filenames,
|
||||
int num_new_components) {
|
||||
int i;
|
||||
inT64 offset_table[TESSDATA_NUM_ENTRIES];
|
||||
TessdataType type = TESSDATA_NUM_ENTRIES;
|
||||
bool text_file = false;
|
||||
FILE *file_ptr[TESSDATA_NUM_ENTRIES];
|
||||
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
offset_table[i] = -1;
|
||||
file_ptr[i] = NULL;
|
||||
}
|
||||
FILE *output_file = fopen(new_traineddata_filename, "wb");
|
||||
if (output_file == NULL) {
|
||||
tprintf("Error opening %s for writing\n", new_traineddata_filename);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Leave some space for recording the offset_table.
|
||||
if (fseek(output_file,
|
||||
sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) {
|
||||
fclose(output_file);
|
||||
tprintf("Error seeking %s\n", new_traineddata_filename);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Open the files with the new components.
|
||||
for (i = 0; i < num_new_components; ++i) {
|
||||
if (TessdataTypeFromFileName(component_filenames[i], &type, &text_file))
|
||||
file_ptr[type] = fopen(component_filenames[i], "rb");
|
||||
}
|
||||
|
||||
// Write updated data to the output traineddata file.
|
||||
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
if (file_ptr[i] != NULL) {
|
||||
// Get the data from the opened component file.
|
||||
offset_table[i] = ftell(output_file);
|
||||
CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1);
|
||||
fclose(file_ptr[i]);
|
||||
} else {
|
||||
// Get this data component from the loaded data file.
|
||||
if (SeekToStart(static_cast<TessdataType>(i))) {
|
||||
offset_table[i] = ftell(output_file);
|
||||
CopyFile(data_file_, output_file, kTessdataFileIsText[i],
|
||||
GetEndOffset(static_cast<TessdataType>(i)) -
|
||||
ftell(data_file_) + 1);
|
||||
for (int i = 0; i < num_new_components; ++i) {
|
||||
TessdataType type;
|
||||
if (TessdataTypeFromFileName(component_filenames[i], &type)) {
|
||||
if (!LoadDataFromFile(component_filenames[i], &entries_[type])) {
|
||||
tprintf("Failed to read component file:%s\n", component_filenames[i]);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
const char *language_data_path_prefix = strchr(new_traineddata_filename, '.');
|
||||
return WriteMetadata(offset_table, language_data_path_prefix, output_file);
|
||||
|
||||
// Write updated data to the output traineddata file.
|
||||
return SaveFile(new_traineddata_filename, nullptr);
|
||||
}
|
||||
|
||||
bool TessdataManager::TessdataTypeFromFileSuffix(
|
||||
const char *suffix, TessdataType *type, bool *text_file) {
|
||||
bool TessdataManager::ExtractToFile(const char *filename) {
|
||||
TessdataType type = TESSDATA_NUM_ENTRIES;
|
||||
ASSERT_HOST(
|
||||
tesseract::TessdataManager::TessdataTypeFromFileName(filename, &type));
|
||||
if (entries_[type].empty()) return false;
|
||||
return SaveDataToFile(entries_[type], filename);
|
||||
}
|
||||
|
||||
bool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix,
|
||||
TessdataType *type) {
|
||||
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
|
||||
*type = static_cast<TessdataType>(i);
|
||||
*text_file = kTessdataFileIsText[i];
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@ -241,46 +227,12 @@ bool TessdataManager::TessdataTypeFromFileSuffix(
|
||||
return false;
|
||||
}
|
||||
|
||||
bool TessdataManager::TessdataTypeFromFileName(
|
||||
const char *filename, TessdataType *type, bool *text_file) {
|
||||
bool TessdataManager::TessdataTypeFromFileName(const char *filename,
|
||||
TessdataType *type) {
|
||||
// Get the file suffix (extension)
|
||||
const char *suffix = strrchr(filename, '.');
|
||||
if (suffix == NULL || *(++suffix) == '\0') return false;
|
||||
return TessdataTypeFromFileSuffix(suffix, type, text_file);
|
||||
}
|
||||
|
||||
// Returns true if the base Tesseract components are present.
|
||||
/* static */
|
||||
bool TessdataManager::IncludesBaseComponents(const inT64 *offset_table) {
|
||||
return offset_table[TESSDATA_UNICHARSET] >= 0 &&
|
||||
offset_table[TESSDATA_INTTEMP] >= 0;
|
||||
}
|
||||
|
||||
// Returns true if the LSTM components are present.
|
||||
/* static */
|
||||
bool TessdataManager::IncludesLSTMComponents(const inT64 *offset_table) {
|
||||
return offset_table[TESSDATA_LSTM] >= 0;
|
||||
}
|
||||
|
||||
bool TessdataManager::ExtractToFile(const char *filename) {
|
||||
TessdataType type = TESSDATA_NUM_ENTRIES;
|
||||
bool text_file = false;
|
||||
ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName(
|
||||
filename, &type, &text_file));
|
||||
if (!SeekToStart(type)) return false;
|
||||
|
||||
FILE *output_file = fopen(filename, "wb");
|
||||
if (output_file == NULL) {
|
||||
tprintf("Error opening %s\n", filename);
|
||||
exit(1);
|
||||
}
|
||||
inT64 begin_offset = ftell(GetDataFilePtr());
|
||||
inT64 end_offset = GetEndOffset(type);
|
||||
tesseract::TessdataManager::CopyFile(
|
||||
GetDataFilePtr(), output_file, text_file,
|
||||
end_offset - begin_offset + 1);
|
||||
fclose(output_file);
|
||||
return true;
|
||||
if (suffix == nullptr || *(++suffix) == '\0') return false;
|
||||
return TessdataTypeFromFileSuffix(suffix, type);
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
||||
|
@ -108,34 +108,6 @@ static const char *const kTessdataFileSuffixes[] = {
|
||||
kLSTMNumberDawgFileSuffix, // 20
|
||||
};
|
||||
|
||||
/**
|
||||
* If kTessdataFileIsText[i] is true - the tessdata component
|
||||
* of type i (from TessdataType enum) is text, and is binary otherwise.
|
||||
*/
|
||||
static const bool kTessdataFileIsText[] = {
|
||||
true, // 0
|
||||
true, // 1
|
||||
true, // 2
|
||||
false, // 3
|
||||
true, // 4
|
||||
true, // 5
|
||||
false, // 6
|
||||
false, // 7
|
||||
false, // 8
|
||||
false, // 9
|
||||
false, // 10 // deprecated
|
||||
true, // 11 // deprecated
|
||||
false, // 12 // deprecated
|
||||
false, // 13
|
||||
false, // 14
|
||||
false, // 15
|
||||
true, // 16
|
||||
false, // 17
|
||||
false, // 18
|
||||
false, // 19
|
||||
false, // 20
|
||||
};
|
||||
|
||||
/**
|
||||
* TessdataType could be updated to contain more entries, however
|
||||
* we do not expect that number to be astronomically high.
|
||||
@ -148,93 +120,61 @@ static const int kMaxNumTessdataEntries = 1000;
|
||||
|
||||
class TessdataManager {
|
||||
public:
|
||||
TessdataManager() {
|
||||
data_file_ = NULL;
|
||||
actual_tessdata_num_entries_ = 0;
|
||||
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
offset_table_[i] = -1;
|
||||
}
|
||||
}
|
||||
TessdataManager() : reader_(nullptr), is_loaded_(false), swap_(false) {}
|
||||
explicit TessdataManager(FileReader reader)
|
||||
: reader_(reader), is_loaded_(false), swap_(false) {}
|
||||
~TessdataManager() {}
|
||||
int DebugLevel() { return debug_level_; }
|
||||
|
||||
bool swap() const { return swap_; }
|
||||
bool is_loaded() const { return is_loaded_; }
|
||||
|
||||
// Lazily loads from the the given filename. Won't actually read the file
|
||||
// until it needs it.
|
||||
void LoadFileLater(const char *data_file_name);
|
||||
/**
|
||||
* Opens the given data file and reads the offset table.
|
||||
* Opens and reads the given data file right now.
|
||||
* @return true on success.
|
||||
*/
|
||||
bool Init(const char *data_file_name, int debug_level);
|
||||
bool Init(const char *data_file_name);
|
||||
// Loads from the given memory buffer as if a file, remembering name as some
|
||||
// arbitrary source id for caching.
|
||||
bool LoadMemBuffer(const char *name, const char *data, int size);
|
||||
// Overwrites a single entry of the given type.
|
||||
void OverwriteEntry(TessdataType type, const char *data, int size);
|
||||
|
||||
// Saves to the given filename.
|
||||
bool SaveFile(const STRING &filename, FileWriter writer) const;
|
||||
// Serializes to the given vector.
|
||||
void Serialize(GenericVector<char> *data) const;
|
||||
// Resets to the initial state, keeping the reader.
|
||||
void Clear();
|
||||
|
||||
// Prints a directory of contents.
|
||||
void Directory() const;
|
||||
|
||||
// Opens the given TFile pointer to the given component type.
|
||||
// Returns false in case of failure.
|
||||
bool GetComponent(TessdataType type, TFile *fp);
|
||||
|
||||
// Returns true if the base Tesseract components are present.
|
||||
bool IsBaseAvailable() const { return IncludesBaseComponents(offset_table_); }
|
||||
bool IsBaseAvailable() const {
|
||||
return !entries_[TESSDATA_UNICHARSET].empty() &&
|
||||
!entries_[TESSDATA_INTTEMP].empty();
|
||||
}
|
||||
|
||||
// Returns true if the LSTM components are present.
|
||||
bool IsLSTMAvailable() const { return IncludesLSTMComponents(offset_table_); }
|
||||
bool IsLSTMAvailable() const { return !entries_[TESSDATA_LSTM].empty(); }
|
||||
|
||||
// Return the name of the underlying data file.
|
||||
const STRING &GetDataFileName() const { return data_file_name_; }
|
||||
|
||||
/** Returns data file pointer. */
|
||||
inline FILE *GetDataFilePtr() const { return data_file_; }
|
||||
|
||||
/**
|
||||
* Returns false if there is no data of the given type.
|
||||
* Otherwise does a seek on the data_file_ to position the pointer
|
||||
* at the start of the data of the given type.
|
||||
*/
|
||||
inline bool SeekToStart(TessdataType tessdata_type) {
|
||||
if (debug_level_) {
|
||||
tprintf("TessdataManager: seek to offset %lld - start of tessdata"
|
||||
"type %d (%s))\n", offset_table_[tessdata_type],
|
||||
tessdata_type, kTessdataFileSuffixes[tessdata_type]);
|
||||
}
|
||||
if (offset_table_[tessdata_type] < 0) {
|
||||
return false;
|
||||
} else {
|
||||
ASSERT_HOST(fseek(data_file_,
|
||||
static_cast<size_t>(offset_table_[tessdata_type]),
|
||||
SEEK_SET) == 0);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
/** Returns the end offset for the given tesseract data file type. */
|
||||
inline inT64 GetEndOffset(TessdataType tessdata_type) const {
|
||||
int index = tessdata_type + 1;
|
||||
while (index < actual_tessdata_num_entries_ && offset_table_[index] == -1) {
|
||||
++index; // skip tessdata types not present in the combined file
|
||||
}
|
||||
if (debug_level_) {
|
||||
tprintf("TessdataManager: end offset for type %d is %lld\n",
|
||||
tessdata_type,
|
||||
(index == actual_tessdata_num_entries_) ? -1
|
||||
: offset_table_[index]);
|
||||
}
|
||||
return (index == actual_tessdata_num_entries_) ? -1 : offset_table_[index] - 1;
|
||||
}
|
||||
/** Closes data_file_ (if it was opened by Init()). */
|
||||
inline void End() {
|
||||
if (data_file_ != NULL) {
|
||||
fclose(data_file_);
|
||||
data_file_ = NULL;
|
||||
}
|
||||
}
|
||||
bool swap() const {
|
||||
return swap_;
|
||||
}
|
||||
|
||||
/** Writes the number of entries and the given offset table to output_file.
|
||||
* Returns false on error.
|
||||
*/
|
||||
static bool WriteMetadata(inT64 *offset_table,
|
||||
const char *language_data_path_prefix,
|
||||
FILE *output_file);
|
||||
|
||||
/**
|
||||
* Reads all the standard tesseract config and data files for a language
|
||||
* at the given path and bundles them up into one binary data file.
|
||||
* Returns true if the combined traineddata file was successfully written.
|
||||
*/
|
||||
static bool CombineDataFiles(const char *language_data_path_prefix,
|
||||
const char *output_filename);
|
||||
bool CombineDataFiles(const char *language_data_path_prefix,
|
||||
const char *output_filename);
|
||||
|
||||
/**
|
||||
* Gets the individual components from the data_file_ with which the class was
|
||||
@ -257,69 +197,35 @@ class TessdataManager {
|
||||
*/
|
||||
bool ExtractToFile(const char *filename);
|
||||
|
||||
/**
|
||||
* Copies data from the given input file to the output_file provided.
|
||||
* If num_bytes_to_copy is >= 0, only num_bytes_to_copy is copied from
|
||||
* the input file, otherwise all the data in the input file is copied.
|
||||
*/
|
||||
static void CopyFile(FILE *input_file, FILE *output_file,
|
||||
bool newline_end, inT64 num_bytes_to_copy);
|
||||
|
||||
/**
|
||||
* Fills type with TessdataType of the tessdata component represented by the
|
||||
* given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET.
|
||||
* Sets *text_file to true if the component is in text format (e.g.
|
||||
* unicharset, unichar ambigs, config, etc).
|
||||
* @return true if the tessdata component type could be determined
|
||||
* from the given file name.
|
||||
*/
|
||||
static bool TessdataTypeFromFileSuffix(const char *suffix,
|
||||
TessdataType *type,
|
||||
bool *text_file);
|
||||
TessdataType *type);
|
||||
|
||||
/**
|
||||
* Tries to determine tessdata component file suffix from filename,
|
||||
* returns true on success.
|
||||
*/
|
||||
static bool TessdataTypeFromFileName(const char *filename,
|
||||
TessdataType *type,
|
||||
bool *text_file);
|
||||
TessdataType *type);
|
||||
|
||||
private:
|
||||
// Returns true if the base Tesseract components are present.
|
||||
static bool IncludesBaseComponents(const inT64 *offset_table);
|
||||
// Returns true if the LSTM components are present.
|
||||
static bool IncludesLSTMComponents(const inT64 *offset_table);
|
||||
|
||||
/**
|
||||
* Opens the file whose name is a concatenation of language_data_path_prefix
|
||||
* and file_suffix. Returns a file pointer to the opened file.
|
||||
*/
|
||||
static FILE *GetFilePtr(const char *language_data_path_prefix,
|
||||
const char *file_suffix, bool text_file);
|
||||
|
||||
/**
|
||||
* Each offset_table_[i] contains a file offset in the combined data file
|
||||
* where the data of TessdataFileType i is stored.
|
||||
*/
|
||||
inT64 offset_table_[TESSDATA_NUM_ENTRIES];
|
||||
/**
|
||||
* Actual number of entries in the tessdata table. This value can only be
|
||||
* same or smaller than TESSDATA_NUM_ENTRIES, but can never be larger,
|
||||
* since then it would be impossible to interpret the type of tessdata at
|
||||
* indices same and higher than TESSDATA_NUM_ENTRIES.
|
||||
* This parameter is used to allow for backward compatibility
|
||||
* when new tessdata types are introduced.
|
||||
*/
|
||||
inT32 actual_tessdata_num_entries_;
|
||||
STRING data_file_name_; // name of the data file.
|
||||
FILE *data_file_; ///< pointer to the data file.
|
||||
int debug_level_;
|
||||
// Name of file it came from.
|
||||
STRING data_file_name_;
|
||||
// Function to load the file when we need it.
|
||||
FileReader reader_;
|
||||
// True if the file has been loaded.
|
||||
bool is_loaded_;
|
||||
// True if the bytes need swapping.
|
||||
bool swap_;
|
||||
// Contents of each element of the traineddata file.
|
||||
GenericVector<char> entries_[TESSDATA_NUM_ENTRIES];
|
||||
};
|
||||
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_
|
||||
|
@ -87,7 +87,9 @@ class UnicityTable {
|
||||
/// Returns false on read/write error.
|
||||
bool write(FILE* f, TessResultCallback2<bool, FILE*, T const &>* cb) const;
|
||||
/// swap is used to switch the endianness.
|
||||
bool read(FILE* f, TessResultCallback3<bool, FILE*, T*, bool>* cb, bool swap);
|
||||
bool read(tesseract::TFile* f,
|
||||
TessResultCallback3<bool, tesseract::TFile*, T*, bool>* cb,
|
||||
bool swap);
|
||||
|
||||
private:
|
||||
GenericVector<T> table_;
|
||||
@ -194,7 +196,8 @@ bool UnicityTable<T>::write(
|
||||
|
||||
template <typename T>
|
||||
bool UnicityTable<T>::read(
|
||||
FILE* f, TessResultCallback3<bool, FILE*, T*, bool>* cb, bool swap) {
|
||||
tesseract::TFile* f,
|
||||
TessResultCallback3<bool, tesseract::TFile*, T*, bool>* cb, bool swap) {
|
||||
return table_.read(f, cb, swap);
|
||||
}
|
||||
|
||||
|
@ -30,6 +30,8 @@
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
|
||||
using tesseract::TFile;
|
||||
|
||||
/*----------------------------------------------------------------------------
|
||||
Public Code
|
||||
----------------------------------------------------------------------------*/
|
||||
@ -310,7 +312,7 @@ void Classify::PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates) {
|
||||
* @note Exceptions: none
|
||||
* @note History: Tue Mar 19 14:11:01 1991, DSJ, Created.
|
||||
*/
|
||||
ADAPT_CLASS ReadAdaptedClass(FILE *File) {
|
||||
ADAPT_CLASS ReadAdaptedClass(TFile *fp) {
|
||||
int NumTempProtos;
|
||||
int NumConfigs;
|
||||
int i;
|
||||
@ -319,34 +321,34 @@ ADAPT_CLASS ReadAdaptedClass(FILE *File) {
|
||||
|
||||
/* first read high level adapted class structure */
|
||||
Class = (ADAPT_CLASS) Emalloc (sizeof (ADAPT_CLASS_STRUCT));
|
||||
fread ((char *) Class, sizeof (ADAPT_CLASS_STRUCT), 1, File);
|
||||
fp->FRead(Class, sizeof(ADAPT_CLASS_STRUCT), 1);
|
||||
|
||||
/* then read in the definitions of the permanent protos and configs */
|
||||
Class->PermProtos = NewBitVector (MAX_NUM_PROTOS);
|
||||
Class->PermConfigs = NewBitVector (MAX_NUM_CONFIGS);
|
||||
fread ((char *) Class->PermProtos, sizeof (uinT32),
|
||||
WordsInVectorOfSize (MAX_NUM_PROTOS), File);
|
||||
fread ((char *) Class->PermConfigs, sizeof (uinT32),
|
||||
WordsInVectorOfSize (MAX_NUM_CONFIGS), File);
|
||||
fp->FRead(Class->PermProtos, sizeof(uinT32),
|
||||
WordsInVectorOfSize(MAX_NUM_PROTOS));
|
||||
fp->FRead(Class->PermConfigs, sizeof(uinT32),
|
||||
WordsInVectorOfSize(MAX_NUM_CONFIGS));
|
||||
|
||||
/* then read in the list of temporary protos */
|
||||
fread ((char *) &NumTempProtos, sizeof (int), 1, File);
|
||||
fp->FRead(&NumTempProtos, sizeof(int), 1);
|
||||
Class->TempProtos = NIL_LIST;
|
||||
for (i = 0; i < NumTempProtos; i++) {
|
||||
TempProto =
|
||||
(TEMP_PROTO) alloc_struct (sizeof (TEMP_PROTO_STRUCT),
|
||||
"TEMP_PROTO_STRUCT");
|
||||
fread ((char *) TempProto, sizeof (TEMP_PROTO_STRUCT), 1, File);
|
||||
fp->FRead(TempProto, sizeof(TEMP_PROTO_STRUCT), 1);
|
||||
Class->TempProtos = push_last (Class->TempProtos, TempProto);
|
||||
}
|
||||
|
||||
/* then read in the adapted configs */
|
||||
fread ((char *) &NumConfigs, sizeof (int), 1, File);
|
||||
fp->FRead(&NumConfigs, sizeof(int), 1);
|
||||
for (i = 0; i < NumConfigs; i++)
|
||||
if (test_bit (Class->PermConfigs, i))
|
||||
Class->Config[i].Perm = ReadPermConfig (File);
|
||||
Class->Config[i].Perm = ReadPermConfig(fp);
|
||||
else
|
||||
Class->Config[i].Temp = ReadTempConfig (File);
|
||||
Class->Config[i].Temp = ReadTempConfig(fp);
|
||||
|
||||
return (Class);
|
||||
|
||||
@ -366,20 +368,20 @@ namespace tesseract {
|
||||
* @note Exceptions: none
|
||||
* @note History: Mon Mar 18 15:18:10 1991, DSJ, Created.
|
||||
*/
|
||||
ADAPT_TEMPLATES Classify::ReadAdaptedTemplates(FILE *File) {
|
||||
ADAPT_TEMPLATES Classify::ReadAdaptedTemplates(TFile *fp) {
|
||||
int i;
|
||||
ADAPT_TEMPLATES Templates;
|
||||
|
||||
/* first read the high level adaptive template struct */
|
||||
Templates = (ADAPT_TEMPLATES) Emalloc (sizeof (ADAPT_TEMPLATES_STRUCT));
|
||||
fread ((char *) Templates, sizeof (ADAPT_TEMPLATES_STRUCT), 1, File);
|
||||
fp->FRead(Templates, sizeof(ADAPT_TEMPLATES_STRUCT), 1);
|
||||
|
||||
/* then read in the basic integer templates */
|
||||
Templates->Templates = ReadIntTemplates (File);
|
||||
Templates->Templates = ReadIntTemplates(false, fp);
|
||||
|
||||
/* then read in the adaptive info for each class */
|
||||
for (i = 0; i < (Templates->Templates)->NumClasses; i++) {
|
||||
Templates->Class[i] = ReadAdaptedClass (File);
|
||||
Templates->Class[i] = ReadAdaptedClass(fp);
|
||||
}
|
||||
return (Templates);
|
||||
|
||||
@ -399,15 +401,15 @@ ADAPT_TEMPLATES Classify::ReadAdaptedTemplates(FILE *File) {
|
||||
* @note Exceptions: none
|
||||
* @note History: Tue Mar 19 14:25:26 1991, DSJ, Created.
|
||||
*/
|
||||
PERM_CONFIG ReadPermConfig(FILE *File) {
|
||||
PERM_CONFIG ReadPermConfig(TFile *fp) {
|
||||
PERM_CONFIG Config = (PERM_CONFIG) alloc_struct(sizeof(PERM_CONFIG_STRUCT),
|
||||
"PERM_CONFIG_STRUCT");
|
||||
uinT8 NumAmbigs;
|
||||
fread ((char *) &NumAmbigs, sizeof(uinT8), 1, File);
|
||||
fp->FRead(&NumAmbigs, sizeof(uinT8), 1);
|
||||
Config->Ambigs = new UNICHAR_ID[NumAmbigs + 1];
|
||||
fread(Config->Ambigs, sizeof(UNICHAR_ID), NumAmbigs, File);
|
||||
fp->FRead(Config->Ambigs, sizeof(UNICHAR_ID), NumAmbigs);
|
||||
Config->Ambigs[NumAmbigs] = -1;
|
||||
fread(&(Config->FontinfoId), sizeof(int), 1, File);
|
||||
fp->FRead(&(Config->FontinfoId), sizeof(int), 1);
|
||||
|
||||
return (Config);
|
||||
|
||||
@ -426,17 +428,16 @@ PERM_CONFIG ReadPermConfig(FILE *File) {
|
||||
* @note Exceptions: none
|
||||
* @note History: Tue Mar 19 14:29:59 1991, DSJ, Created.
|
||||
*/
|
||||
TEMP_CONFIG ReadTempConfig(FILE *File) {
|
||||
TEMP_CONFIG ReadTempConfig(TFile *fp) {
|
||||
TEMP_CONFIG Config;
|
||||
|
||||
Config =
|
||||
(TEMP_CONFIG) alloc_struct (sizeof (TEMP_CONFIG_STRUCT),
|
||||
"TEMP_CONFIG_STRUCT");
|
||||
fread ((char *) Config, sizeof (TEMP_CONFIG_STRUCT), 1, File);
|
||||
fp->FRead(Config, sizeof(TEMP_CONFIG_STRUCT), 1);
|
||||
|
||||
Config->Protos = NewBitVector (Config->ProtoVectorSize * BITSINLONG);
|
||||
fread ((char *) Config->Protos, sizeof (uinT32),
|
||||
Config->ProtoVectorSize, File);
|
||||
fp->FRead(Config->Protos, sizeof(uinT32), Config->ProtoVectorSize);
|
||||
|
||||
return (Config);
|
||||
|
||||
|
@ -126,11 +126,11 @@ TEMP_CONFIG NewTempConfig(int MaxProtoId, int FontinfoId);
|
||||
|
||||
TEMP_PROTO NewTempProto();
|
||||
|
||||
ADAPT_CLASS ReadAdaptedClass(FILE *File);
|
||||
ADAPT_CLASS ReadAdaptedClass(tesseract::TFile *File);
|
||||
|
||||
PERM_CONFIG ReadPermConfig(FILE *File);
|
||||
PERM_CONFIG ReadPermConfig(tesseract::TFile *File);
|
||||
|
||||
TEMP_CONFIG ReadTempConfig(FILE *File);
|
||||
TEMP_CONFIG ReadTempConfig(tesseract::TFile *File);
|
||||
|
||||
void WriteAdaptedClass(FILE *File, ADAPT_CLASS Class, int NumConfigs);
|
||||
|
||||
|
@ -524,7 +524,7 @@ void Classify::EndAdaptiveClassifier() {
|
||||
* enables use of pre-adapted templates
|
||||
* @note History: Mon Mar 11 12:49:34 1991, DSJ, Created.
|
||||
*/
|
||||
void Classify::InitAdaptiveClassifier(bool load_pre_trained_templates) {
|
||||
void Classify::InitAdaptiveClassifier(TessdataManager* mgr) {
|
||||
if (!classify_enable_adaptive_matcher)
|
||||
return;
|
||||
if (AllProtosOn != NULL)
|
||||
@ -532,37 +532,25 @@ void Classify::InitAdaptiveClassifier(bool load_pre_trained_templates) {
|
||||
|
||||
// If there is no language_data_path_prefix, the classifier will be
|
||||
// adaptive only.
|
||||
if (language_data_path_prefix.length() > 0 &&
|
||||
load_pre_trained_templates) {
|
||||
ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_INTTEMP));
|
||||
PreTrainedTemplates =
|
||||
ReadIntTemplates(tessdata_manager.GetDataFilePtr());
|
||||
if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded inttemp\n");
|
||||
if (language_data_path_prefix.length() > 0 && mgr != nullptr) {
|
||||
TFile fp;
|
||||
ASSERT_HOST(mgr->GetComponent(TESSDATA_INTTEMP, &fp));
|
||||
PreTrainedTemplates = ReadIntTemplates(mgr->swap(), &fp);
|
||||
|
||||
if (tessdata_manager.SeekToStart(TESSDATA_SHAPE_TABLE)) {
|
||||
if (mgr->GetComponent(TESSDATA_SHAPE_TABLE, &fp)) {
|
||||
shape_table_ = new ShapeTable(unicharset);
|
||||
if (!shape_table_->DeSerialize(tessdata_manager.swap(),
|
||||
tessdata_manager.GetDataFilePtr())) {
|
||||
if (!shape_table_->DeSerialize(mgr->swap(), &fp)) {
|
||||
tprintf("Error loading shape table!\n");
|
||||
delete shape_table_;
|
||||
shape_table_ = NULL;
|
||||
} else if (tessdata_manager.DebugLevel() > 0) {
|
||||
tprintf("Successfully loaded shape table!\n");
|
||||
}
|
||||
}
|
||||
|
||||
ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_PFFMTABLE));
|
||||
ReadNewCutoffs(tessdata_manager.GetDataFilePtr(),
|
||||
tessdata_manager.swap(),
|
||||
tessdata_manager.GetEndOffset(TESSDATA_PFFMTABLE),
|
||||
CharNormCutoffs);
|
||||
if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded pffmtable\n");
|
||||
ASSERT_HOST(mgr->GetComponent(TESSDATA_PFFMTABLE, &fp));
|
||||
ReadNewCutoffs(&fp, mgr->swap(), CharNormCutoffs);
|
||||
|
||||
ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_NORMPROTO));
|
||||
NormProtos =
|
||||
ReadNormProtos(tessdata_manager.GetDataFilePtr(),
|
||||
tessdata_manager.GetEndOffset(TESSDATA_NORMPROTO));
|
||||
if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded normproto\n");
|
||||
ASSERT_HOST(mgr->GetComponent(TESSDATA_NORMPROTO, &fp));
|
||||
NormProtos = ReadNormProtos(&fp);
|
||||
static_classifier_ = new TessClassifier(false, this);
|
||||
}
|
||||
|
||||
@ -582,21 +570,19 @@ void Classify::InitAdaptiveClassifier(bool load_pre_trained_templates) {
|
||||
}
|
||||
|
||||
if (classify_use_pre_adapted_templates) {
|
||||
FILE *File;
|
||||
TFile fp;
|
||||
STRING Filename;
|
||||
|
||||
Filename = imagefile;
|
||||
Filename += ADAPT_TEMPLATE_SUFFIX;
|
||||
File = fopen(Filename.string(), "rb");
|
||||
if (File == NULL) {
|
||||
if (!fp.Open(Filename.string(), nullptr)) {
|
||||
AdaptedTemplates = NewAdaptedTemplates(true);
|
||||
} else {
|
||||
cprintf("\nReading pre-adapted templates from %s ...\n",
|
||||
Filename.string());
|
||||
fflush(stdout);
|
||||
AdaptedTemplates = ReadAdaptedTemplates(File);
|
||||
AdaptedTemplates = ReadAdaptedTemplates(&fp);
|
||||
cprintf("\n");
|
||||
fclose(File);
|
||||
PrintAdaptedTemplates(stdout, AdaptedTemplates);
|
||||
|
||||
for (int i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) {
|
||||
|
@ -103,16 +103,15 @@ class Classify : public CCStruct {
|
||||
const uinT8* normalization_factors,
|
||||
const uinT16* expected_num_features,
|
||||
GenericVector<CP_RESULT_STRUCT>* results);
|
||||
void ReadNewCutoffs(FILE *CutoffFile, bool swap, inT64 end_offset,
|
||||
CLASS_CUTOFF_ARRAY Cutoffs);
|
||||
void ReadNewCutoffs(TFile* fp, bool swap, CLASS_CUTOFF_ARRAY Cutoffs);
|
||||
void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
|
||||
void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
|
||||
ADAPT_TEMPLATES ReadAdaptedTemplates(FILE *File);
|
||||
ADAPT_TEMPLATES ReadAdaptedTemplates(TFile* File);
|
||||
/* normmatch.cpp ************************************************************/
|
||||
FLOAT32 ComputeNormMatch(CLASS_ID ClassId,
|
||||
const FEATURE_STRUCT& feature, BOOL8 DebugMatch);
|
||||
void FreeNormProtos();
|
||||
NORM_PROTOS *ReadNormProtos(FILE *File, inT64 end_offset);
|
||||
NORM_PROTOS* ReadNormProtos(TFile* fp);
|
||||
/* protos.cpp ***************************************************************/
|
||||
void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class);
|
||||
INT_TEMPLATES CreateIntTemplates(CLASSES FloatProtos,
|
||||
@ -138,7 +137,7 @@ class Classify : public CCStruct {
|
||||
void LearnPieces(const char* fontname, int start, int length, float threshold,
|
||||
CharSegmentationType segmentation, const char* correct_text,
|
||||
WERD_RES* word);
|
||||
void InitAdaptiveClassifier(bool load_pre_trained_templates);
|
||||
void InitAdaptiveClassifier(TessdataManager* mgr);
|
||||
void InitAdaptedClass(TBLOB *Blob,
|
||||
CLASS_ID ClassId,
|
||||
int FontinfoId,
|
||||
@ -335,7 +334,7 @@ class Classify : public CCStruct {
|
||||
uinT8* char_norm_array);
|
||||
void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures);
|
||||
/* intproto.cpp *************************************************************/
|
||||
INT_TEMPLATES ReadIntTemplates(FILE *File);
|
||||
INT_TEMPLATES ReadIntTemplates(bool swap, TFile* fp);
|
||||
void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
|
||||
const UNICHARSET& target_unicharset);
|
||||
CLASS_ID GetClassToDebug(const char *Prompt, bool* adaptive_on,
|
||||
|
@ -25,8 +25,11 @@
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
|
||||
using tesseract::TFile;
|
||||
|
||||
//---------------Global Data Definitions and Declarations--------------------
|
||||
#define TOKENSIZE 80 //< max size of tokens read from an input file
|
||||
#define QUOTED_TOKENSIZE "79"
|
||||
#define MAXSAMPLESIZE 65535 //< max num of dimensions in feature space
|
||||
//#define MAXBLOCKSIZE 65535 //< max num of samples in a character (block
|
||||
// size)
|
||||
@ -41,11 +44,14 @@
|
||||
* @note Exceptions: ILLEGALSAMPLESIZE illegal format or range
|
||||
* @note History: 6/6/89, DSJ, Created.
|
||||
*/
|
||||
uinT16 ReadSampleSize(FILE *File) {
|
||||
int SampleSize;
|
||||
uinT16 ReadSampleSize(TFile *fp) {
|
||||
int SampleSize = 0;
|
||||
|
||||
if ((tfscanf(File, "%d", &SampleSize) != 1) ||
|
||||
(SampleSize < 0) || (SampleSize > MAXSAMPLESIZE))
|
||||
const int kMaxLineSize = 100;
|
||||
char line[kMaxLineSize];
|
||||
if (fp->FGets(line, kMaxLineSize) == nullptr ||
|
||||
sscanf(line, "%d", &SampleSize) != 1 || (SampleSize < 0) ||
|
||||
(SampleSize > MAXSAMPLESIZE))
|
||||
DoError (ILLEGALSAMPLESIZE, "Illegal sample size");
|
||||
return (SampleSize);
|
||||
}
|
||||
@ -64,30 +70,28 @@ uinT16 ReadSampleSize(FILE *File) {
|
||||
* @note Globals: None
|
||||
* @note History: 6/6/89, DSJ, Created.
|
||||
*/
|
||||
PARAM_DESC *ReadParamDesc(FILE *File, uinT16 N) {
|
||||
int i;
|
||||
PARAM_DESC *ReadParamDesc(TFile *fp, uinT16 N) {
|
||||
PARAM_DESC *ParamDesc;
|
||||
char Token[TOKENSIZE];
|
||||
char linear_token[TOKENSIZE], essential_token[TOKENSIZE];
|
||||
|
||||
ParamDesc = (PARAM_DESC *) Emalloc (N * sizeof (PARAM_DESC));
|
||||
for (i = 0; i < N; i++) {
|
||||
if (tfscanf(File, "%s", Token) != 1)
|
||||
DoError (ILLEGALCIRCULARSPEC,
|
||||
"Illegal circular/linear specification");
|
||||
if (Token[0] == 'c')
|
||||
for (int i = 0; i < N; i++) {
|
||||
const int kMaxLineSize = TOKENSIZE * 4;
|
||||
char line[kMaxLineSize];
|
||||
if (fp->FGets(line, kMaxLineSize) == nullptr ||
|
||||
sscanf(line, "%" QUOTED_TOKENSIZE "s %" QUOTED_TOKENSIZE "s %f %f",
|
||||
linear_token, essential_token, &ParamDesc[i].Min,
|
||||
&ParamDesc[i].Max) != 4)
|
||||
DoError(ILLEGALCIRCULARSPEC, "Illegal Parameter specification");
|
||||
if (linear_token[0] == 'c')
|
||||
ParamDesc[i].Circular = TRUE;
|
||||
else
|
||||
ParamDesc[i].Circular = FALSE;
|
||||
|
||||
if (tfscanf(File, "%s", Token) != 1)
|
||||
DoError (ILLEGALESSENTIALSPEC,
|
||||
"Illegal essential/non-essential spec");
|
||||
if (Token[0] == 'e')
|
||||
if (linear_token[0] == 'e')
|
||||
ParamDesc[i].NonEssential = FALSE;
|
||||
else
|
||||
ParamDesc[i].NonEssential = TRUE;
|
||||
if (tfscanf(File, "%f%f", &(ParamDesc[i].Min), &(ParamDesc[i].Max)) != 2)
|
||||
DoError (ILLEGALMINMAXSPEC, "Illegal min or max specification");
|
||||
ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
|
||||
ParamDesc[i].HalfRange = ParamDesc[i].Range / 2;
|
||||
ParamDesc[i].MidRange = (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
|
||||
@ -111,123 +115,68 @@ PARAM_DESC *ReadParamDesc(FILE *File, uinT16 N) {
|
||||
* @note Globals: None
|
||||
* @note History: 6/6/89, DSJ, Created.
|
||||
*/
|
||||
PROTOTYPE *ReadPrototype(FILE *File, uinT16 N) {
|
||||
char Token[TOKENSIZE];
|
||||
int Status;
|
||||
PROTOTYPE *ReadPrototype(TFile *fp, uinT16 N) {
|
||||
char sig_token[TOKENSIZE], shape_token[TOKENSIZE];
|
||||
PROTOTYPE *Proto;
|
||||
int SampleCount;
|
||||
int i;
|
||||
|
||||
if ((Status = tfscanf(File, "%s", Token)) == 1) {
|
||||
Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE));
|
||||
Proto->Cluster = NULL;
|
||||
if (Token[0] == 's')
|
||||
Proto->Significant = TRUE;
|
||||
else
|
||||
Proto->Significant = FALSE;
|
||||
|
||||
Proto->Style = ReadProtoStyle (File);
|
||||
|
||||
if ((tfscanf(File, "%d", &SampleCount) != 1) || (SampleCount < 0))
|
||||
DoError (ILLEGALSAMPLECOUNT, "Illegal sample count");
|
||||
Proto->NumSamples = SampleCount;
|
||||
|
||||
Proto->Mean = ReadNFloats (File, N, NULL);
|
||||
if (Proto->Mean == NULL)
|
||||
DoError (ILLEGALMEANSPEC, "Illegal prototype mean");
|
||||
|
||||
switch (Proto->Style) {
|
||||
case spherical:
|
||||
if (ReadNFloats (File, 1, &(Proto->Variance.Spherical)) == NULL)
|
||||
DoError (ILLEGALVARIANCESPEC, "Illegal prototype variance");
|
||||
Proto->Magnitude.Spherical =
|
||||
1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical));
|
||||
Proto->TotalMagnitude =
|
||||
pow (Proto->Magnitude.Spherical, (float) N);
|
||||
Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
|
||||
Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
|
||||
Proto->Distrib = NULL;
|
||||
break;
|
||||
case elliptical:
|
||||
Proto->Variance.Elliptical = ReadNFloats (File, N, NULL);
|
||||
if (Proto->Variance.Elliptical == NULL)
|
||||
DoError (ILLEGALVARIANCESPEC, "Illegal prototype variance");
|
||||
Proto->Magnitude.Elliptical =
|
||||
(FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
|
||||
Proto->Weight.Elliptical =
|
||||
(FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
|
||||
Proto->TotalMagnitude = 1.0;
|
||||
for (i = 0; i < N; i++) {
|
||||
Proto->Magnitude.Elliptical[i] =
|
||||
1.0 /
|
||||
sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
|
||||
Proto->Weight.Elliptical[i] =
|
||||
1.0 / Proto->Variance.Elliptical[i];
|
||||
Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
|
||||
}
|
||||
Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
|
||||
Proto->Distrib = NULL;
|
||||
break;
|
||||
case mixed:
|
||||
Proto->Distrib =
|
||||
(DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION));
|
||||
for (i = 0; i < N; i++) {
|
||||
if (tfscanf(File, "%s", Token) != 1)
|
||||
DoError (ILLEGALDISTRIBUTION,
|
||||
"Illegal prototype distribution");
|
||||
switch (Token[0]) {
|
||||
case 'n':
|
||||
Proto->Distrib[i] = normal;
|
||||
break;
|
||||
case 'u':
|
||||
Proto->Distrib[i] = uniform;
|
||||
break;
|
||||
case 'r':
|
||||
Proto->Distrib[i] = D_random;
|
||||
break;
|
||||
default:
|
||||
DoError (ILLEGALDISTRIBUTION,
|
||||
"Illegal prototype distribution");
|
||||
}
|
||||
}
|
||||
Proto->Variance.Elliptical = ReadNFloats (File, N, NULL);
|
||||
if (Proto->Variance.Elliptical == NULL)
|
||||
DoError (ILLEGALVARIANCESPEC, "Illegal prototype variance");
|
||||
Proto->Magnitude.Elliptical =
|
||||
(FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
|
||||
Proto->Weight.Elliptical =
|
||||
(FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
|
||||
Proto->TotalMagnitude = 1.0;
|
||||
for (i = 0; i < N; i++) {
|
||||
switch (Proto->Distrib[i]) {
|
||||
case normal:
|
||||
Proto->Magnitude.Elliptical[i] = 1.0 /
|
||||
sqrt ((double)
|
||||
(2.0 * PI * Proto->Variance.Elliptical[i]));
|
||||
Proto->Weight.Elliptical[i] =
|
||||
1.0 / Proto->Variance.Elliptical[i];
|
||||
break;
|
||||
case uniform:
|
||||
case D_random:
|
||||
Proto->Magnitude.Elliptical[i] = 1.0 /
|
||||
(2.0 * Proto->Variance.Elliptical[i]);
|
||||
break;
|
||||
case DISTRIBUTION_COUNT:
|
||||
ASSERT_HOST(!"Distribution count not allowed!");
|
||||
}
|
||||
Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
|
||||
}
|
||||
Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
|
||||
break;
|
||||
}
|
||||
return (Proto);
|
||||
const int kMaxLineSize = TOKENSIZE * 4;
|
||||
char line[kMaxLineSize];
|
||||
if (fp->FGets(line, kMaxLineSize) == nullptr ||
|
||||
sscanf(line, "%" QUOTED_TOKENSIZE "s %" QUOTED_TOKENSIZE "s %d",
|
||||
sig_token, shape_token, &SampleCount) != 3) {
|
||||
tprintf("Invalid prototype: %s\n", line);
|
||||
return nullptr;
|
||||
}
|
||||
else if (Status == EOF)
|
||||
return (NULL);
|
||||
else {
|
||||
DoError (ILLEGALSIGNIFICANCESPEC, "Illegal significance specification");
|
||||
return (NULL);
|
||||
Proto = (PROTOTYPE *)Emalloc(sizeof(PROTOTYPE));
|
||||
Proto->Cluster = NULL;
|
||||
if (sig_token[0] == 's')
|
||||
Proto->Significant = TRUE;
|
||||
else
|
||||
Proto->Significant = FALSE;
|
||||
|
||||
Proto->Style = ReadProtoStyle(shape_token);
|
||||
|
||||
if (SampleCount < 0) DoError(ILLEGALSAMPLECOUNT, "Illegal sample count");
|
||||
Proto->NumSamples = SampleCount;
|
||||
|
||||
Proto->Mean = ReadNFloats(fp, N, NULL);
|
||||
if (Proto->Mean == NULL) DoError(ILLEGALMEANSPEC, "Illegal prototype mean");
|
||||
|
||||
switch (Proto->Style) {
|
||||
case spherical:
|
||||
if (ReadNFloats(fp, 1, &(Proto->Variance.Spherical)) == NULL)
|
||||
DoError(ILLEGALVARIANCESPEC, "Illegal prototype variance");
|
||||
Proto->Magnitude.Spherical =
|
||||
1.0 / sqrt((double)(2.0 * PI * Proto->Variance.Spherical));
|
||||
Proto->TotalMagnitude = pow(Proto->Magnitude.Spherical, (float)N);
|
||||
Proto->LogMagnitude = log((double)Proto->TotalMagnitude);
|
||||
Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
|
||||
Proto->Distrib = NULL;
|
||||
break;
|
||||
case elliptical:
|
||||
Proto->Variance.Elliptical = ReadNFloats(fp, N, NULL);
|
||||
if (Proto->Variance.Elliptical == NULL)
|
||||
DoError(ILLEGALVARIANCESPEC, "Illegal prototype variance");
|
||||
Proto->Magnitude.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
|
||||
Proto->Weight.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
|
||||
Proto->TotalMagnitude = 1.0;
|
||||
for (i = 0; i < N; i++) {
|
||||
Proto->Magnitude.Elliptical[i] =
|
||||
1.0 / sqrt((double)(2.0 * PI * Proto->Variance.Elliptical[i]));
|
||||
Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
|
||||
Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
|
||||
}
|
||||
Proto->LogMagnitude = log((double)Proto->TotalMagnitude);
|
||||
Proto->Distrib = NULL;
|
||||
break;
|
||||
default:
|
||||
Efree(Proto);
|
||||
tprintf("Invalid prototype style\n");
|
||||
return nullptr;
|
||||
}
|
||||
return Proto;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -239,30 +188,19 @@ PROTOTYPE *ReadPrototype(FILE *File, uinT16 N) {
|
||||
* @note Exceptions: ILLEGALSTYLESPEC illegal prototype style specification
|
||||
* @note History: 6/8/89, DSJ, Created.
|
||||
*/
|
||||
PROTOSTYLE ReadProtoStyle(FILE *File) {
|
||||
char Token[TOKENSIZE];
|
||||
PROTOSTYLE Style;
|
||||
|
||||
if (tfscanf(File, "%s", Token) != 1)
|
||||
DoError (ILLEGALSTYLESPEC, "Illegal prototype style specification");
|
||||
switch (Token[0]) {
|
||||
PROTOSTYLE ReadProtoStyle(const char *shape) {
|
||||
switch (shape[0]) {
|
||||
case 's':
|
||||
Style = spherical;
|
||||
break;
|
||||
return spherical;
|
||||
case 'e':
|
||||
Style = elliptical;
|
||||
break;
|
||||
case 'm':
|
||||
Style = mixed;
|
||||
break;
|
||||
return elliptical;
|
||||
case 'a':
|
||||
Style = automatic;
|
||||
break;
|
||||
return automatic;
|
||||
default:
|
||||
Style = elliptical;
|
||||
DoError (ILLEGALSTYLESPEC, "Illegal prototype style specification");
|
||||
break;
|
||||
}
|
||||
return (Style);
|
||||
tprintf("Invalid prototype style specification:%s\n", shape);
|
||||
return elliptical;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -279,28 +217,30 @@ PROTOSTYLE ReadProtoStyle(FILE *File) {
|
||||
* @note Exceptions: ILLEGALFLOAT
|
||||
* @note History: 6/6/89, DSJ, Created.
|
||||
*/
|
||||
FLOAT32* ReadNFloats(FILE * File, uinT16 N, FLOAT32 Buffer[]) {
|
||||
FLOAT32 *ReadNFloats(TFile *fp, uinT16 N, FLOAT32 Buffer[]) {
|
||||
const int kMaxLineSize = 1024;
|
||||
char line[kMaxLineSize];
|
||||
if (fp->FGets(line, kMaxLineSize) == nullptr) {
|
||||
tprintf("Hit EOF in ReadNFloats!\n");
|
||||
return nullptr;
|
||||
}
|
||||
bool needs_free = false;
|
||||
int i;
|
||||
int NumFloatsRead;
|
||||
|
||||
if (Buffer == NULL) {
|
||||
Buffer = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
|
||||
needs_free = true;
|
||||
}
|
||||
|
||||
for (i = 0; i < N; i++) {
|
||||
NumFloatsRead = tfscanf(File, "%f", &(Buffer[i]));
|
||||
if (NumFloatsRead != 1) {
|
||||
if ((NumFloatsRead == EOF) && (i == 0)) {
|
||||
if (needs_free) {
|
||||
Efree(Buffer);
|
||||
}
|
||||
return NULL;
|
||||
} else {
|
||||
DoError(ILLEGALFLOAT, "Illegal float specification");
|
||||
}
|
||||
char *startptr = line;
|
||||
for (int i = 0; i < N; i++) {
|
||||
char *endptr;
|
||||
Buffer[i] = strtof(startptr, &endptr);
|
||||
if (endptr == startptr) {
|
||||
tprintf("Read of %d floats failed!\n", N);
|
||||
if (needs_free) Efree(Buffer);
|
||||
return nullptr;
|
||||
}
|
||||
startptr = endptr;
|
||||
}
|
||||
return Buffer;
|
||||
}
|
||||
|
@ -20,22 +20,23 @@
|
||||
#define TESSERACT_CLASSIFY_CLUSTTOOL_H_
|
||||
|
||||
//--------------------------Include Files---------------------------------------
|
||||
#include "host.h"
|
||||
#include "cluster.h"
|
||||
#include <stdio.h>
|
||||
#include "cluster.h"
|
||||
#include "host.h"
|
||||
#include "serialis.h"
|
||||
|
||||
/*-------------------------------------------------------------------------
|
||||
Public Function Prototype
|
||||
--------------------------------------------------------------------------*/
|
||||
uinT16 ReadSampleSize(FILE *File);
|
||||
uinT16 ReadSampleSize(tesseract::TFile *fp);
|
||||
|
||||
PARAM_DESC *ReadParamDesc(FILE *File, uinT16 N);
|
||||
PARAM_DESC *ReadParamDesc(tesseract::TFile *fp, uinT16 N);
|
||||
|
||||
PROTOTYPE *ReadPrototype(FILE *File, uinT16 N);
|
||||
PROTOTYPE *ReadPrototype(tesseract::TFile *fp, uinT16 N);
|
||||
|
||||
PROTOSTYLE ReadProtoStyle(FILE *File);
|
||||
PROTOSTYLE ReadProtoStyle(const char *style);
|
||||
|
||||
FLOAT32 *ReadNFloats (FILE * File, uinT16 N, FLOAT32 Buffer[]);
|
||||
FLOAT32 *ReadNFloats(tesseract::TFile *fp, uinT16 N, FLOAT32 Buffer[]);
|
||||
|
||||
void WriteParamDesc(FILE *File, uinT16 N, const PARAM_DESC ParamDesc[]);
|
||||
|
||||
|
@ -49,7 +49,7 @@ namespace tesseract {
|
||||
* @note Exceptions: none
|
||||
* @note History: Wed Feb 20 09:38:26 1991, DSJ, Created.
|
||||
*/
|
||||
void Classify::ReadNewCutoffs(FILE *CutoffFile, bool swap, inT64 end_offset,
|
||||
void Classify::ReadNewCutoffs(TFile* fp, bool swap,
|
||||
CLASS_CUTOFF_ARRAY Cutoffs) {
|
||||
char Class[UNICHAR_LEN + 1];
|
||||
CLASS_ID ClassId;
|
||||
@ -57,23 +57,24 @@ void Classify::ReadNewCutoffs(FILE *CutoffFile, bool swap, inT64 end_offset,
|
||||
int i;
|
||||
|
||||
if (shape_table_ != NULL) {
|
||||
if (!shapetable_cutoffs_.DeSerialize(swap, CutoffFile)) {
|
||||
if (!shapetable_cutoffs_.DeSerialize(swap, fp)) {
|
||||
tprintf("Error during read of shapetable pffmtable!\n");
|
||||
}
|
||||
}
|
||||
for (i = 0; i < MAX_NUM_CLASSES; i++)
|
||||
Cutoffs[i] = MAX_CUTOFF;
|
||||
|
||||
while ((end_offset < 0 || ftell(CutoffFile) < end_offset) &&
|
||||
tfscanf(CutoffFile, "%" REALLY_QUOTE_IT(UNICHAR_LEN) "s %d",
|
||||
Class, &Cutoff) == 2) {
|
||||
const int kMaxLineSize = 100;
|
||||
char line[kMaxLineSize];
|
||||
while (fp->FGets(line, kMaxLineSize) != nullptr &&
|
||||
sscanf(line, "%" REALLY_QUOTE_IT(UNICHAR_LEN) "s %d", Class,
|
||||
&Cutoff) == 2) {
|
||||
if (strcmp(Class, "NULL") == 0) {
|
||||
ClassId = unicharset.unichar_to_id(" ");
|
||||
} else {
|
||||
ClassId = unicharset.unichar_to_id(Class);
|
||||
}
|
||||
Cutoffs[ClassId] = Cutoff;
|
||||
SkipNewline(CutoffFile);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -758,9 +758,8 @@ namespace tesseract {
|
||||
* @note Exceptions: none
|
||||
* @note History: Wed Feb 27 11:48:46 1991, DSJ, Created.
|
||||
*/
|
||||
INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
|
||||
INT_TEMPLATES Classify::ReadIntTemplates(bool swap, TFile *fp) {
|
||||
int i, j, w, x, y, z;
|
||||
BOOL8 swap;
|
||||
int nread;
|
||||
int unicharset_size;
|
||||
int version_id = 0;
|
||||
@ -786,29 +785,19 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
|
||||
/* first read the high level template struct */
|
||||
Templates = NewIntTemplates();
|
||||
// Read Templates in parts for 64 bit compatibility.
|
||||
if (fread(&unicharset_size, sizeof(int), 1, File) != 1)
|
||||
cprintf("Bad read of inttemp!\n");
|
||||
if (fread(&Templates->NumClasses,
|
||||
sizeof(Templates->NumClasses), 1, File) != 1 ||
|
||||
fread(&Templates->NumClassPruners,
|
||||
sizeof(Templates->NumClassPruners), 1, File) != 1)
|
||||
cprintf("Bad read of inttemp!\n");
|
||||
// Swap status is determined automatically.
|
||||
swap = Templates->NumClassPruners < 0 ||
|
||||
Templates->NumClassPruners > MAX_NUM_CLASS_PRUNERS;
|
||||
if (swap) {
|
||||
Reverse32(&Templates->NumClassPruners);
|
||||
Reverse32(&Templates->NumClasses);
|
||||
Reverse32(&unicharset_size);
|
||||
}
|
||||
if (fp->FReadEndian(&unicharset_size, sizeof(unicharset_size), 1, swap) != 1)
|
||||
tprintf("Bad read of inttemp!\n");
|
||||
if (fp->FReadEndian(&Templates->NumClasses, sizeof(Templates->NumClasses), 1,
|
||||
swap) != 1 ||
|
||||
fp->FReadEndian(&Templates->NumClassPruners,
|
||||
sizeof(Templates->NumClassPruners), 1, swap) != 1)
|
||||
tprintf("Bad read of inttemp!\n");
|
||||
if (Templates->NumClasses < 0) {
|
||||
// This file has a version id!
|
||||
version_id = -Templates->NumClasses;
|
||||
if (fread(&Templates->NumClasses, sizeof(Templates->NumClasses),
|
||||
1, File) != 1)
|
||||
cprintf("Bad read of inttemp!\n");
|
||||
if (swap)
|
||||
Reverse32(&Templates->NumClasses);
|
||||
if (fp->FReadEndian(&Templates->NumClasses, sizeof(Templates->NumClasses),
|
||||
1, swap) != 1)
|
||||
tprintf("Bad read of inttemp!\n");
|
||||
}
|
||||
|
||||
if (version_id < 3) {
|
||||
@ -817,39 +806,24 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
|
||||
}
|
||||
|
||||
if (version_id < 2) {
|
||||
for (i = 0; i < unicharset_size; ++i) {
|
||||
if (fread(&IndexFor[i], sizeof(inT16), 1, File) != 1)
|
||||
cprintf("Bad read of inttemp!\n");
|
||||
if (fp->FReadEndian(IndexFor, sizeof(IndexFor[0]), unicharset_size, swap) !=
|
||||
unicharset_size) {
|
||||
tprintf("Bad read of inttemp!\n");
|
||||
}
|
||||
for (i = 0; i < Templates->NumClasses; ++i) {
|
||||
if (fread(&ClassIdFor[i], sizeof(CLASS_ID), 1, File) != 1)
|
||||
cprintf("Bad read of inttemp!\n");
|
||||
}
|
||||
if (swap) {
|
||||
for (i = 0; i < Templates->NumClasses; i++)
|
||||
Reverse16(&IndexFor[i]);
|
||||
for (i = 0; i < Templates->NumClasses; i++)
|
||||
Reverse32(&ClassIdFor[i]);
|
||||
if (fp->FReadEndian(ClassIdFor, sizeof(ClassIdFor[0]),
|
||||
Templates->NumClasses, swap) != Templates->NumClasses) {
|
||||
tprintf("Bad read of inttemp!\n");
|
||||
}
|
||||
}
|
||||
|
||||
/* then read in the class pruners */
|
||||
const int kNumBuckets =
|
||||
NUM_CP_BUCKETS * NUM_CP_BUCKETS * NUM_CP_BUCKETS * WERDS_PER_CP_VECTOR;
|
||||
for (i = 0; i < Templates->NumClassPruners; i++) {
|
||||
Pruner = new CLASS_PRUNER_STRUCT;
|
||||
if ((nread =
|
||||
fread(Pruner, 1, sizeof(CLASS_PRUNER_STRUCT),
|
||||
File)) != sizeof(CLASS_PRUNER_STRUCT))
|
||||
cprintf("Bad read of inttemp!\n");
|
||||
if (swap) {
|
||||
for (x = 0; x < NUM_CP_BUCKETS; x++) {
|
||||
for (y = 0; y < NUM_CP_BUCKETS; y++) {
|
||||
for (z = 0; z < NUM_CP_BUCKETS; z++) {
|
||||
for (w = 0; w < WERDS_PER_CP_VECTOR; w++) {
|
||||
Reverse32(&Pruner->p[x][y][z][w]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (fp->FReadEndian(Pruner, sizeof(Pruner->p[0][0][0][0]), kNumBuckets,
|
||||
swap) != kNumBuckets) {
|
||||
tprintf("Bad read of inttemp!\n");
|
||||
}
|
||||
if (version_id < 2) {
|
||||
TempClassPruner[i] = Pruner;
|
||||
@ -914,39 +888,24 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
|
||||
for (i = 0; i < Templates->NumClasses; i++) {
|
||||
/* first read in the high level struct for the class */
|
||||
Class = (INT_CLASS) Emalloc (sizeof (INT_CLASS_STRUCT));
|
||||
if (fread(&Class->NumProtos, sizeof(Class->NumProtos), 1, File) != 1 ||
|
||||
fread(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File) != 1 ||
|
||||
fread(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File) != 1)
|
||||
cprintf ("Bad read of inttemp!\n");
|
||||
if (fp->FReadEndian(&Class->NumProtos, sizeof(Class->NumProtos), 1, swap) !=
|
||||
1 ||
|
||||
fp->FRead(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1) != 1 ||
|
||||
fp->FRead(&Class->NumConfigs, sizeof(Class->NumConfigs), 1) != 1)
|
||||
tprintf("Bad read of inttemp!\n");
|
||||
if (version_id == 0) {
|
||||
// Only version 0 writes 5 pointless pointers to the file.
|
||||
for (j = 0; j < 5; ++j) {
|
||||
int junk;
|
||||
if (fread(&junk, sizeof(junk), 1, File) != 1)
|
||||
cprintf ("Bad read of inttemp!\n");
|
||||
inT32 junk;
|
||||
if (fp->FRead(&junk, sizeof(junk), 1) != 1)
|
||||
tprintf("Bad read of inttemp!\n");
|
||||
}
|
||||
}
|
||||
if (version_id < 4) {
|
||||
for (j = 0; j < MaxNumConfigs; ++j) {
|
||||
if (fread(&Class->ConfigLengths[j], sizeof(uinT16), 1, File) != 1)
|
||||
cprintf ("Bad read of inttemp!\n");
|
||||
}
|
||||
if (swap) {
|
||||
Reverse16(&Class->NumProtos);
|
||||
for (j = 0; j < MaxNumConfigs; j++)
|
||||
Reverse16(&Class->ConfigLengths[j]);
|
||||
}
|
||||
} else {
|
||||
ASSERT_HOST(Class->NumConfigs < MaxNumConfigs);
|
||||
for (j = 0; j < Class->NumConfigs; ++j) {
|
||||
if (fread(&Class->ConfigLengths[j], sizeof(uinT16), 1, File) != 1)
|
||||
cprintf ("Bad read of inttemp!\n");
|
||||
}
|
||||
if (swap) {
|
||||
Reverse16(&Class->NumProtos);
|
||||
for (j = 0; j < MaxNumConfigs; j++)
|
||||
Reverse16(&Class->ConfigLengths[j]);
|
||||
}
|
||||
int num_configs = version_id < 4 ? MaxNumConfigs : Class->NumConfigs;
|
||||
ASSERT_HOST(num_configs <= MaxNumConfigs);
|
||||
if (fp->FReadEndian(Class->ConfigLengths, sizeof(uinT16), num_configs,
|
||||
swap) != num_configs) {
|
||||
tprintf("Bad read of inttemp!\n");
|
||||
}
|
||||
if (version_id < 2) {
|
||||
ClassForClassId (Templates, ClassIdFor[i]) = Class;
|
||||
@ -958,59 +917,41 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
|
||||
Lengths = NULL;
|
||||
if (MaxNumIntProtosIn (Class) > 0) {
|
||||
Lengths = (uinT8 *)Emalloc(sizeof(uinT8) * MaxNumIntProtosIn(Class));
|
||||
if ((nread =
|
||||
fread((char *)Lengths, sizeof(uinT8),
|
||||
MaxNumIntProtosIn(Class), File)) != MaxNumIntProtosIn (Class))
|
||||
cprintf ("Bad read of inttemp!\n");
|
||||
if (fp->FRead(Lengths, sizeof(uinT8), MaxNumIntProtosIn(Class)) !=
|
||||
MaxNumIntProtosIn(Class))
|
||||
tprintf("Bad read of inttemp!\n");
|
||||
}
|
||||
Class->ProtoLengths = Lengths;
|
||||
|
||||
/* then read in the proto sets */
|
||||
for (j = 0; j < Class->NumProtoSets; j++) {
|
||||
ProtoSet = (PROTO_SET)Emalloc(sizeof(PROTO_SET_STRUCT));
|
||||
if (version_id < 3) {
|
||||
if ((nread =
|
||||
fread((char *) &ProtoSet->ProtoPruner, 1,
|
||||
sizeof(PROTO_PRUNER), File)) != sizeof(PROTO_PRUNER))
|
||||
int num_buckets = NUM_PP_PARAMS * NUM_PP_BUCKETS * WERDS_PER_PP_VECTOR;
|
||||
if (fp->FReadEndian(&ProtoSet->ProtoPruner,
|
||||
sizeof(ProtoSet->ProtoPruner[0][0][0]), num_buckets,
|
||||
swap) != num_buckets)
|
||||
tprintf("Bad read of inttemp!\n");
|
||||
for (x = 0; x < PROTOS_PER_PROTO_SET; x++) {
|
||||
if (fp->FRead(&ProtoSet->Protos[x].A, sizeof(ProtoSet->Protos[x].A),
|
||||
1) != 1 ||
|
||||
fp->FRead(&ProtoSet->Protos[x].B, sizeof(ProtoSet->Protos[x].B),
|
||||
1) != 1 ||
|
||||
fp->FRead(&ProtoSet->Protos[x].C, sizeof(ProtoSet->Protos[x].C),
|
||||
1) != 1 ||
|
||||
fp->FRead(&ProtoSet->Protos[x].Angle,
|
||||
sizeof(ProtoSet->Protos[x].Angle), 1) != 1)
|
||||
tprintf("Bad read of inttemp!\n");
|
||||
if (fp->FReadEndian(&ProtoSet->Protos[x].Configs,
|
||||
sizeof(ProtoSet->Protos[x].Configs[0]),
|
||||
WerdsPerConfigVec, swap) != WerdsPerConfigVec)
|
||||
cprintf("Bad read of inttemp!\n");
|
||||
for (x = 0; x < PROTOS_PER_PROTO_SET; x++) {
|
||||
if ((nread = fread((char *) &ProtoSet->Protos[x].A, 1,
|
||||
sizeof(inT8), File)) != sizeof(inT8) ||
|
||||
(nread = fread((char *) &ProtoSet->Protos[x].B, 1,
|
||||
sizeof(uinT8), File)) != sizeof(uinT8) ||
|
||||
(nread = fread((char *) &ProtoSet->Protos[x].C, 1,
|
||||
sizeof(inT8), File)) != sizeof(inT8) ||
|
||||
(nread = fread((char *) &ProtoSet->Protos[x].Angle, 1,
|
||||
sizeof(uinT8), File)) != sizeof(uinT8))
|
||||
cprintf("Bad read of inttemp!\n");
|
||||
for (y = 0; y < WerdsPerConfigVec; y++)
|
||||
if ((nread = fread((char *) &ProtoSet->Protos[x].Configs[y], 1,
|
||||
sizeof(uinT32), File)) != sizeof(uinT32))
|
||||
cprintf("Bad read of inttemp!\n");
|
||||
}
|
||||
} else {
|
||||
if ((nread =
|
||||
fread((char *) ProtoSet, 1, sizeof(PROTO_SET_STRUCT),
|
||||
File)) != sizeof(PROTO_SET_STRUCT))
|
||||
cprintf("Bad read of inttemp!\n");
|
||||
}
|
||||
if (swap) {
|
||||
for (x = 0; x < NUM_PP_PARAMS; x++)
|
||||
for (y = 0; y < NUM_PP_BUCKETS; y++)
|
||||
for (z = 0; z < WERDS_PER_PP_VECTOR; z++)
|
||||
Reverse32(&ProtoSet->ProtoPruner[x][y][z]);
|
||||
for (x = 0; x < PROTOS_PER_PROTO_SET; x++)
|
||||
for (y = 0; y < WerdsPerConfigVec; y++)
|
||||
Reverse32(&ProtoSet->Protos[x].Configs[y]);
|
||||
}
|
||||
Class->ProtoSets[j] = ProtoSet;
|
||||
}
|
||||
if (version_id < 4)
|
||||
if (version_id < 4) {
|
||||
Class->font_set_id = -1;
|
||||
else {
|
||||
fread(&Class->font_set_id, sizeof(int), 1, File);
|
||||
if (swap)
|
||||
Reverse32(&Class->font_set_id);
|
||||
} else {
|
||||
fp->FReadEndian(&Class->font_set_id, sizeof(Class->font_set_id), 1, swap);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1037,13 +978,12 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) {
|
||||
}
|
||||
}
|
||||
if (version_id >= 4) {
|
||||
this->fontinfo_table_.read(File, NewPermanentTessCallback(read_info), swap);
|
||||
this->fontinfo_table_.read(fp, NewPermanentTessCallback(read_info), swap);
|
||||
if (version_id >= 5) {
|
||||
this->fontinfo_table_.read(File,
|
||||
NewPermanentTessCallback(read_spacing_info),
|
||||
swap);
|
||||
this->fontinfo_table_.read(
|
||||
fp, NewPermanentTessCallback(read_spacing_info), swap);
|
||||
}
|
||||
this->fontset_table_.read(File, NewPermanentTessCallback(read_set), swap);
|
||||
this->fontset_table_.read(fp, NewPermanentTessCallback(read_set), swap);
|
||||
}
|
||||
|
||||
// Clean up.
|
||||
|
@ -86,27 +86,6 @@ bool MasterTrainer::Serialize(FILE* fp) const {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool MasterTrainer::DeSerialize(bool swap, FILE* fp) {
|
||||
if (fread(&norm_mode_, sizeof(norm_mode_), 1, fp) != 1) return false;
|
||||
if (swap) {
|
||||
ReverseN(&norm_mode_, sizeof(norm_mode_));
|
||||
}
|
||||
if (!unicharset_.load_from_file(fp)) return false;
|
||||
charsetsize_ = unicharset_.size();
|
||||
if (!feature_space_.DeSerialize(swap, fp)) return false;
|
||||
feature_map_.Init(feature_space_);
|
||||
if (!samples_.DeSerialize(swap, fp)) return false;
|
||||
if (!junk_samples_.DeSerialize(swap, fp)) return false;
|
||||
if (!verify_samples_.DeSerialize(swap, fp)) return false;
|
||||
if (!master_shapes_.DeSerialize(swap, fp)) return false;
|
||||
if (!flat_shapes_.DeSerialize(swap, fp)) return false;
|
||||
if (!fontinfo_table_.DeSerialize(swap, fp)) return false;
|
||||
if (!xheights_.DeSerialize(swap, fp)) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Load an initial unicharset, or set one up if the file cannot be read.
|
||||
void MasterTrainer::LoadUnicharset(const char* filename) {
|
||||
if (!unicharset_.load_from_file(filename)) {
|
||||
|
@ -74,9 +74,6 @@ class MasterTrainer {
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
bool Serialize(FILE* fp) const;
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool DeSerialize(bool swap, FILE* fp);
|
||||
|
||||
// Loads an initial unicharset, or sets one up if the file cannot be read.
|
||||
void LoadUnicharset(const char* filename);
|
||||
|
@ -242,7 +242,7 @@ namespace tesseract {
|
||||
* @note Exceptions: none
|
||||
* @note History: Wed Dec 19 16:38:49 1990, DSJ, Created.
|
||||
*/
|
||||
NORM_PROTOS *Classify::ReadNormProtos(FILE *File, inT64 end_offset) {
|
||||
NORM_PROTOS *Classify::ReadNormProtos(TFile *fp) {
|
||||
NORM_PROTOS *NormProtos;
|
||||
int i;
|
||||
char unichar[2 * UNICHAR_LEN + 1];
|
||||
@ -258,26 +258,26 @@ NORM_PROTOS *Classify::ReadNormProtos(FILE *File, inT64 end_offset) {
|
||||
NormProtos->Protos[i] = NIL_LIST;
|
||||
|
||||
/* read file header and save in data structure */
|
||||
NormProtos->NumParams = ReadSampleSize (File);
|
||||
NormProtos->ParamDesc = ReadParamDesc (File, NormProtos->NumParams);
|
||||
NormProtos->NumParams = ReadSampleSize(fp);
|
||||
NormProtos->ParamDesc = ReadParamDesc(fp, NormProtos->NumParams);
|
||||
|
||||
/* read protos for each class into a separate list */
|
||||
while ((end_offset < 0 || ftell(File) < end_offset) &&
|
||||
tfscanf(File, "%s %d", unichar, &NumProtos) == 2) {
|
||||
const int kMaxLineSize = 100;
|
||||
char line[kMaxLineSize];
|
||||
while (fp->FGets(line, kMaxLineSize) != nullptr) {
|
||||
if (sscanf(line, "%s %d", unichar, &NumProtos) != 2) continue;
|
||||
if (unicharset.contains_unichar(unichar)) {
|
||||
unichar_id = unicharset.unichar_to_id(unichar);
|
||||
Protos = NormProtos->Protos[unichar_id];
|
||||
for (i = 0; i < NumProtos; i++)
|
||||
Protos =
|
||||
push_last (Protos, ReadPrototype (File, NormProtos->NumParams));
|
||||
Protos = push_last(Protos, ReadPrototype(fp, NormProtos->NumParams));
|
||||
NormProtos->Protos[unichar_id] = Protos;
|
||||
} else {
|
||||
cprintf("Error: unichar %s in normproto file is not in unichar set.\n",
|
||||
tprintf("Error: unichar %s in normproto file is not in unichar set.\n",
|
||||
unichar);
|
||||
for (i = 0; i < NumProtos; i++)
|
||||
FreePrototype(ReadPrototype (File, NormProtos->NumParams));
|
||||
FreePrototype(ReadPrototype(fp, NormProtos->NumParams));
|
||||
}
|
||||
SkipNewline(File);
|
||||
}
|
||||
return (NormProtos);
|
||||
} /* ReadNormProtos */
|
||||
|
@ -71,10 +71,9 @@ bool UnicharAndFonts::Serialize(FILE* fp) const {
|
||||
}
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool UnicharAndFonts::DeSerialize(bool swap, FILE* fp) {
|
||||
if (fread(&unichar_id, sizeof(unichar_id), 1, fp) != 1) return false;
|
||||
if (swap)
|
||||
ReverseN(&unichar_id, sizeof(unichar_id));
|
||||
bool UnicharAndFonts::DeSerialize(bool swap, TFile* fp) {
|
||||
if (fp->FReadEndian(&unichar_id, sizeof(unichar_id), 1, swap) != 1)
|
||||
return false;
|
||||
if (!font_ids.DeSerialize(swap, fp)) return false;
|
||||
return true;
|
||||
}
|
||||
@ -96,10 +95,9 @@ bool Shape::Serialize(FILE* fp) const {
|
||||
}
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool Shape::DeSerialize(bool swap, FILE* fp) {
|
||||
bool Shape::DeSerialize(bool swap, TFile* fp) {
|
||||
uinT8 sorted;
|
||||
if (fread(&sorted, sizeof(sorted), 1, fp) != 1)
|
||||
return false;
|
||||
if (fp->FRead(&sorted, sizeof(sorted), 1) != 1) return false;
|
||||
unichars_sorted_ = sorted != 0;
|
||||
if (!unichars_.DeSerializeClasses(swap, fp)) return false;
|
||||
return true;
|
||||
@ -253,7 +251,7 @@ bool ShapeTable::Serialize(FILE* fp) const {
|
||||
}
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool ShapeTable::DeSerialize(bool swap, FILE* fp) {
|
||||
bool ShapeTable::DeSerialize(bool swap, TFile* fp) {
|
||||
if (!shape_table_.DeSerialize(swap, fp)) return false;
|
||||
num_fonts_ = 0;
|
||||
return true;
|
||||
|
@ -168,7 +168,7 @@ struct UnicharAndFonts {
|
||||
bool Serialize(FILE* fp) const;
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool DeSerialize(bool swap, FILE* fp);
|
||||
bool DeSerialize(bool swap, TFile* fp);
|
||||
|
||||
// Sort function to sort a pair of UnicharAndFonts by unichar_id.
|
||||
static int SortByUnicharId(const void* v1, const void* v2);
|
||||
@ -191,7 +191,7 @@ class Shape {
|
||||
bool Serialize(FILE* fp) const;
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool DeSerialize(bool swap, FILE* fp);
|
||||
bool DeSerialize(bool swap, TFile* fp);
|
||||
|
||||
int destination_index() const {
|
||||
return destination_index_;
|
||||
@ -272,7 +272,7 @@ class ShapeTable {
|
||||
bool Serialize(FILE* fp) const;
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool DeSerialize(bool swap, FILE* fp);
|
||||
bool DeSerialize(bool swap, TFile* fp);
|
||||
|
||||
// Accessors.
|
||||
int NumShapes() const {
|
||||
|
@ -174,11 +174,7 @@ bool Dawg::match_words(WERD_CHOICE *word, inT32 index,
return false;
}

void Dawg::init(DawgType type, const STRING &lang,
PermuterType perm, int unicharset_size, int debug_level) {
type_ = type;
lang_ = lang;
perm_ = perm;
void Dawg::init(int unicharset_size) {
ASSERT_HOST(unicharset_size > 0);
unicharset_size_ = unicharset_size;
// Set bit masks. We will use the value unicharset_size_ as a null char, so
@ -188,8 +184,6 @@ void Dawg::init(DawgType type, const STRING &lang,
letter_mask_ = ~(~0ull << flag_start_bit_);
next_node_mask_ = ~0ull << (flag_start_bit_ + NUM_FLAG_BITS);
flags_mask_ = ~(letter_mask_ | next_node_mask_);

debug_level_ = debug_level;
}

@ -315,44 +309,34 @@ void SquishedDawg::print_edge(EDGE_REF edge) const {
}
}

void SquishedDawg::read_squished_dawg(FILE *file,
DawgType type,
const STRING &lang,
PermuterType perm,
int debug_level) {
if (debug_level) tprintf("Reading squished dawg\n");
bool SquishedDawg::read_squished_dawg(TFile *file) {
if (debug_level_) tprintf("Reading squished dawg\n");

// Read the magic number and if it does not match kDawgMagicNumber
// set swap to true to indicate that we need to switch endianness.
inT16 magic;
fread(&magic, sizeof(inT16), 1, file);
if (file->FRead(&magic, sizeof(inT16), 1) != 1) return false;
bool swap = (magic != kDawgMagicNumber);

int unicharset_size;
fread(&unicharset_size, sizeof(inT32), 1, file);
fread(&num_edges_, sizeof(inT32), 1, file);

if (swap) {
ReverseN(&unicharset_size, sizeof(unicharset_size));
ReverseN(&num_edges_, sizeof(num_edges_));
}
inT32 unicharset_size;
if (file->FReadEndian(&unicharset_size, sizeof(unicharset_size), 1, swap) !=
1)
return false;
if (file->FReadEndian(&num_edges_, sizeof(num_edges_), 1, swap) != 1)
return false;
ASSERT_HOST(num_edges_ > 0);  // DAWG should not be empty
Dawg::init(type, lang, perm, unicharset_size, debug_level);
Dawg::init(unicharset_size);

edges_ = (EDGE_ARRAY) memalloc(sizeof(EDGE_RECORD) * num_edges_);
fread(&edges_[0], sizeof(EDGE_RECORD), num_edges_, file);
EDGE_REF edge;
if (swap) {
for (edge = 0; edge < num_edges_; ++edge) {
ReverseN(&edges_[edge], sizeof(edges_[edge]));
}
}
if (debug_level > 2) {
if (file->FReadEndian(&edges_[0], sizeof(edges_[0]), num_edges_, swap) !=
num_edges_)
return false;
if (debug_level_ > 2) {
tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n",
type_, lang_.string(), perm_, unicharset_size_, num_edges_);
for (edge = 0; edge < num_edges_; ++edge)
print_edge(edge);
for (EDGE_REF edge = 0; edge < num_edges_; ++edge) print_edge(edge);
}
return true;
}

NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const {

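For orientation: the FReadEndian calls above fold the old fread-plus-ReverseN pattern into a single endian-aware read. A minimal sketch of that idea in plain C I/O follows; ReadEndianItems is a hypothetical name for illustration only, not part of the TFile API.

#include <cstdio>
#include <algorithm>

// Hypothetical helper: read 'count' items of 'size' bytes each and reverse the
// byte order of every item when 'swap' is true -- the same job FReadEndian
// does for TFile in the diff above.
static int ReadEndianItems(void* buffer, int size, int count, bool swap,
                           FILE* fp) {
  int num_read = static_cast<int>(fread(buffer, size, count, fp));
  if (swap && size > 1) {
    char* bytes = static_cast<char*>(buffer);
    for (int i = 0; i < num_read; ++i)
      std::reverse(bytes + i * size, bytes + (i + 1) * size);
  }
  return num_read;  // caller compares against 'count', as read_squished_dawg does
}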
57
dict/dawg.h
@ -201,7 +201,12 @@ class Dawg {
}

protected:
Dawg() {}
Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
: type_(type),
lang_(lang),
perm_(perm),
unicharset_size_(0),
debug_level_(debug_level) {}

/// Returns the next node visited by following this edge.
inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const {
@ -274,10 +279,9 @@ class Dawg {
(!word_end || (word_end == other_word_end)));
}

/// Sets type_, lang_, perm_, unicharset_size_.
/// Sets unicharset_size_.
/// Initializes the values of various masks from unicharset_size_.
void init(DawgType type, const STRING &lang,
PermuterType perm, int unicharset_size, int debug_level);
void init(int unicharset_size);

/// Matches all of the words that are represented by this string.
/// If wilcard is set to something other than INVALID_UNICHAR_ID,
@ -407,32 +411,36 @@ class DawgPositionVector : public GenericVector<DawgPosition> {
//
class SquishedDawg : public Dawg {
public:
SquishedDawg(FILE *file, DawgType type, const STRING &lang,
PermuterType perm, int debug_level) {
read_squished_dawg(file, type, lang, perm, debug_level);
SquishedDawg(DawgType type, const STRING &lang, PermuterType perm,
int debug_level)
: Dawg(type, lang, perm, debug_level) {}
SquishedDawg(const char *filename, DawgType type, const STRING &lang,
PermuterType perm, int debug_level)
: Dawg(type, lang, perm, debug_level) {
TFile file;
ASSERT_HOST(file.Open(filename, nullptr));
ASSERT_HOST(read_squished_dawg(&file));
num_forward_edges_in_node0 = num_forward_edges(0);
}
SquishedDawg(const char* filename, DawgType type,
const STRING &lang, PermuterType perm, int debug_level) {
FILE *file = fopen(filename, "rb");
if (file == NULL) {
tprintf("Failed to open dawg file %s\n", filename);
exit(1);
}
read_squished_dawg(file, type, lang, perm, debug_level);
num_forward_edges_in_node0 = num_forward_edges(0);
fclose(file);
}
SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type,
const STRING &lang, PermuterType perm,
int unicharset_size, int debug_level) :
edges_(edges), num_edges_(num_edges) {
init(type, lang, perm, unicharset_size, debug_level);
const STRING &lang, PermuterType perm, int unicharset_size,
int debug_level)
: Dawg(type, lang, perm, debug_level),
edges_(edges),
num_edges_(num_edges) {
init(unicharset_size);
num_forward_edges_in_node0 = num_forward_edges(0);
if (debug_level > 3) print_all("SquishedDawg:");
}
~SquishedDawg();

// Loads using the given TFile. Returns false on failure.
bool Load(TFile *fp) {
if (!read_squished_dawg(fp)) return false;
num_forward_edges_in_node0 = num_forward_edges(0);
return true;
}

int NumEdges() { return num_edges_; }

/// Returns the edge that corresponds to the letter out of this node.
@ -529,8 +537,7 @@ class SquishedDawg : public Dawg {
inT32 num_forward_edges(NODE_REF node) const;

/// Reads SquishedDawg from a file.
void read_squished_dawg(FILE *file, DawgType type, const STRING &lang,
PermuterType perm, int debug_level);
bool read_squished_dawg(TFile *file);

/// Prints the contents of an edge indicated by the given EDGE_REF.
void print_edge(EDGE_REF edge) const;
@ -547,7 +554,7 @@ class SquishedDawg : public Dawg {

// Member variables.
EDGE_ARRAY edges_;
int num_edges_;
inT32 num_edges_;
int num_forward_edges_in_node0;
};

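A minimal sketch of the new construct-then-Load pattern for a standalone dawg file; the helper name and path handling are illustrative, not code from this commit.

#include "dawg.h"
#include "serialis.h"

// Hypothetical helper: replaces the old FILE*-taking SquishedDawg constructor.
tesseract::SquishedDawg* LoadWordDawg(const char* path) {
  tesseract::TFile file;
  if (!file.Open(path, nullptr)) return nullptr;   // nullptr = default file reader
  tesseract::SquishedDawg* dawg = new tesseract::SquishedDawg(
      tesseract::DAWG_TYPE_WORD, "eng", SYSTEM_DAWG_PERM, 0 /*debug_level*/);
  if (!dawg->Load(&file)) {                        // Load() returns false on failure
    delete dawg;
    return nullptr;
  }
  return dawg;
}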
@ -27,44 +27,33 @@
namespace tesseract {

struct DawgLoader {
DawgLoader(const STRING &lang,
const char *data_file_name,
TessdataType tessdata_dawg_type,
int dawg_debug_level)
DawgLoader(const STRING &lang, TessdataType tessdata_dawg_type,
int dawg_debug_level, TessdataManager *data_file)
: lang_(lang),
data_file_name_(data_file_name),
data_file_(data_file),
tessdata_dawg_type_(tessdata_dawg_type),
dawg_debug_level_(dawg_debug_level) {}

Dawg *Load();

STRING lang_;
const char *data_file_name_;
TessdataManager *data_file_;
TessdataType tessdata_dawg_type_;
int dawg_debug_level_;
};

Dawg *DawgCache::GetSquishedDawg(
const STRING &lang,
const char *data_file_name,
TessdataType tessdata_dawg_type,
int debug_level) {
STRING data_id = data_file_name;
Dawg *DawgCache::GetSquishedDawg(const STRING &lang,
TessdataType tessdata_dawg_type,
int debug_level, TessdataManager *data_file) {
STRING data_id = data_file->GetDataFileName();
data_id += kTessdataFileSuffixes[tessdata_dawg_type];
DawgLoader loader(lang, tessdata_dawg_type, debug_level, data_file);
DawgLoader loader(lang, data_file_name, tessdata_dawg_type, debug_level);
return dawgs_.Get(data_id, NewTessCallback(&loader, &DawgLoader::Load));
}

Dawg *DawgLoader::Load() {
TessdataManager data_loader;
if (!data_loader.Init(data_file_name_, dawg_debug_level_)) {
return NULL;
}
if (!data_loader.SeekToStart(tessdata_dawg_type_)) {
data_loader.End();
return NULL;
}
FILE *fp = data_loader.GetDataFilePtr();
TFile fp;
if (!data_file_->GetComponent(tessdata_dawg_type_, &fp)) return nullptr;
DawgType dawg_type;
PermuterType perm_type;
switch (tessdata_dawg_type_) {
@ -96,13 +85,13 @@ Dawg *DawgLoader::Load() {
perm_type = FREQ_DAWG_PERM;
break;
default:
data_loader.End();
return NULL;
return nullptr;
}
SquishedDawg *retval =
new SquishedDawg(fp, dawg_type, lang_, perm_type, dawg_debug_level_);
data_loader.End();
return retval;
new SquishedDawg(dawg_type, lang_, perm_type, dawg_debug_level_);
if (retval->Load(&fp)) return retval;
delete retval;
return nullptr;
}

} // namespace tesseract

@ -29,11 +29,8 @@ namespace tesseract {

class DawgCache {
public:
Dawg *GetSquishedDawg(
const STRING &lang,
const char *data_file_name,
TessdataType tessdata_dawg_type,
int debug_level);
Dawg *GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type,
int debug_level, TessdataManager *data_file);

// If we manage the given dawg, decrement its count,
// and possibly delete it if the count reaches zero.

@ -221,35 +221,35 @@ void Dict::SetupForLoad(DawgCache *dawg_cache) {
}

// Loads the dawgs needed by Tesseract. Call FinishLoad() after.
void Dict::Load(const char *data_file_name, const STRING &lang) {
void Dict::Load(const STRING &lang, TessdataManager *data_file) {
// Load dawgs_.
if (load_punc_dawg) {
punc_dawg_ = dawg_cache_->GetSquishedDawg(
lang, data_file_name, TESSDATA_PUNC_DAWG, dawg_debug_level);
punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG,
dawg_debug_level, data_file);
if (punc_dawg_) dawgs_ += punc_dawg_;
}
if (load_system_dawg) {
Dawg *system_dawg = dawg_cache_->GetSquishedDawg(
lang, data_file_name, TESSDATA_SYSTEM_DAWG, dawg_debug_level);
lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
if (system_dawg) dawgs_ += system_dawg;
}
if (load_number_dawg) {
Dawg *number_dawg = dawg_cache_->GetSquishedDawg(
lang, data_file_name, TESSDATA_NUMBER_DAWG, dawg_debug_level);
lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
if (number_dawg) dawgs_ += number_dawg;
}
if (load_bigram_dawg) {
bigram_dawg_ = dawg_cache_->GetSquishedDawg(
lang, data_file_name, TESSDATA_BIGRAM_DAWG, dawg_debug_level);
bigram_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG,
dawg_debug_level, data_file);
}
if (load_freq_dawg) {
freq_dawg_ = dawg_cache_->GetSquishedDawg(
lang, data_file_name, TESSDATA_FREQ_DAWG, dawg_debug_level);
freq_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG,
dawg_debug_level, data_file);
if (freq_dawg_) { dawgs_ += freq_dawg_; }
}
if (load_unambig_dawg) {
unambig_dawg_ = dawg_cache_->GetSquishedDawg(
lang, data_file_name, TESSDATA_UNAMBIG_DAWG, dawg_debug_level);
unambig_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG,
dawg_debug_level, data_file);
if (unambig_dawg_) dawgs_ += unambig_dawg_;
}

@ -302,21 +302,21 @@ void Dict::Load(const char *data_file_name, const STRING &lang) {
}

// Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
void Dict::LoadLSTM(const char *data_file_name, const STRING &lang) {
void Dict::LoadLSTM(const STRING &lang, TessdataManager *data_file) {
// Load dawgs_.
if (load_punc_dawg) {
punc_dawg_ = dawg_cache_->GetSquishedDawg(
lang, data_file_name, TESSDATA_LSTM_PUNC_DAWG, dawg_debug_level);
punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG,
dawg_debug_level, data_file);
if (punc_dawg_) dawgs_ += punc_dawg_;
}
if (load_system_dawg) {
Dawg *system_dawg = dawg_cache_->GetSquishedDawg(
lang, data_file_name, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level);
lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
if (system_dawg) dawgs_ += system_dawg;
}
if (load_number_dawg) {
Dawg *number_dawg = dawg_cache_->GetSquishedDawg(
lang, data_file_name, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level);
lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
if (number_dawg) dawgs_ += number_dawg;
}
}

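A hedged sketch of the new Dict loading sequence driven by a TessdataManager; the helper name, language, and error handling are assumptions, not code from this commit.

#include "dict.h"
#include "tessdatamanager.h"

// Hypothetical helper: load the dictionaries for one language from a
// traineddata file via the new TessdataManager-based API.
bool LoadDictFromTraineddata(tesseract::Dict* dict,
                             const char* traineddata_path) {
  tesseract::TessdataManager mgr;
  mgr.Init(traineddata_path);                        // error handling elided
  dict->SetupForLoad(tesseract::Dict::GlobalDawgCache());
  dict->Load("eng", &mgr);                           // was: Load(data_file_name, lang)
  return dict->FinishLoad();                         // false if nothing was loaded
}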
@ -298,9 +298,9 @@ class Dict {
// Sets up ready for a Load or LoadLSTM.
void SetupForLoad(DawgCache *dawg_cache);
// Loads the dawgs needed by Tesseract. Call FinishLoad() after.
void Load(const char *data_file_name, const STRING &lang);
void Load(const STRING &lang, TessdataManager *data_file);
// Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
void LoadLSTM(const char *data_file_name, const STRING &lang);
void LoadLSTM(const STRING &lang, TessdataManager *data_file);
// Completes the loading process after Load() and/or LoadLSTM().
// Returns false if no dictionaries were loaded.
bool FinishLoad();

@ -87,8 +87,9 @@ class Trie : public Dawg {
// contain more edges than max_num_edges, all the edges are cleared
// so that new inserts can proceed).
Trie(DawgType type, const STRING &lang, PermuterType perm,
int unicharset_size, int debug_level) {
init(type, lang, perm, unicharset_size, debug_level);
int unicharset_size, int debug_level)
: Dawg(type, lang, perm, debug_level) {
init(unicharset_size);
num_edges_ = 0;
deref_node_index_mask_ = ~letter_mask_;
new_dawg_node();  // need to allocate node 0

@ -127,12 +127,11 @@ bool LSTMRecognizer::DeSerialize(bool swap, TFile* fp) {
// on the unicharset matching. This enables training to deserialize a model
// from checkpoint or restore without having to go back and reload the
// dictionary.
bool LSTMRecognizer::LoadDictionary(const char* data_file_name,
const char* lang) {
bool LSTMRecognizer::LoadDictionary(const char* lang, TessdataManager* mgr) {
delete dict_;
dict_ = new Dict(&ccutil_);
dict_->SetupForLoad(Dict::GlobalDawgCache());
dict_->LoadLSTM(data_file_name, lang);
dict_->LoadLSTM(lang, mgr);
if (dict_->FinishLoad()) return true;  // Success.
tprintf("Failed to load any lstm-specific dictionaries for lang %s!!\n",
lang);

@ -167,7 +167,7 @@ class LSTMRecognizer {
// on the unicharset matching. This enables training to deserialize a model
// from checkpoint or restore without having to go back and reload the
// dictionary.
bool LoadDictionary(const char* data_file_name, const char* lang);
bool LoadDictionary(const char* lang, TessdataManager* mgr);

// Recognizes the line image, contained within image_data, returning the
// ratings matrix and matching box_word for each WERD_RES in the output.

@ -1223,7 +1223,7 @@ double LSTMTrainer::ComputeWordError(STRING* truth_str, STRING* ocr_str) {
std::string truth_word(truth_words[i].string());
StrMap::iterator it = word_counts.find(truth_word);
if (it == word_counts.end())
word_counts.insert(make_pair(truth_word, 1));
word_counts.insert(std::make_pair(truth_word, 1));
else
++it->second;
}
@ -1231,7 +1231,7 @@ double LSTMTrainer::ComputeWordError(STRING* truth_str, STRING* ocr_str) {
std::string ocr_word(ocr_words[i].string());
StrMap::iterator it = word_counts.find(ocr_word);
if (it == word_counts.end())
word_counts.insert(make_pair(ocr_word, -1));
word_counts.insert(std::make_pair(ocr_word, -1));
else
--it->second;
}

@ -31,7 +31,6 @@ STRING_PARAM_FLAG(classifier, "", "Classifier to test");
STRING_PARAM_FLAG(lang, "eng", "Language to test");
STRING_PARAM_FLAG(tessdata_dir, "", "Directory of traineddata files");
DECLARE_INT_PARAM_FLAG(debug_level);
DECLARE_STRING_PARAM_FLAG(T);

enum ClassifierName {
CN_PRUNER,
@ -79,13 +78,6 @@ static tesseract::ShapeClassifier* InitializeClassifier(
}
tesseract::ShapeClassifier* shape_classifier = nullptr;

if (!FLAGS_T.empty()) {
const char* config_name;
while ((config_name = GetNextFilename(argc, argv)) != nullptr) {
tprintf("Reading config file %s ...\n", config_name);
(*api)->ReadConfigFile(config_name);
}
}
if (classifier == CN_PRUNER) {
shape_classifier = new tesseract::TessClassifier(true, classify);
} else if (classifier == CN_FULL) {

@ -65,6 +65,7 @@
//
int main(int argc, char **argv) {
int i;
tesseract::TessdataManager tm;
if (argc == 2) {
printf("Combining tessdata files\n");
STRING lang = argv[1];
@ -73,8 +74,7 @@ int main(int argc, char **argv) {
lang += '.';
STRING output_file = lang;
output_file += kTrainedDataSuffix;
if (!tesseract::TessdataManager::CombineDataFiles(
lang.string(), output_file.string())) {
if (!tm.CombineDataFiles(lang.string(), output_file.string())) {
printf("Error combining tessdata files into %s\n",
output_file.string());
} else {
@ -83,8 +83,7 @@ int main(int argc, char **argv) {
} else if (argc >= 4 && (strcmp(argv[1], "-e") == 0 ||
strcmp(argv[1], "-u") == 0)) {
// Initialize TessdataManager with the data in the given traineddata file.
tesseract::TessdataManager tm;
tm.Init(argv[2], 0);
tm.Init(argv[2]);
printf("Extracting tessdata components from %s\n", argv[2]);
if (strcmp(argv[1], "-e") == 0) {
for (i = 3; i < argc; ++i) {
@ -107,7 +106,6 @@ int main(int argc, char **argv) {
}
}
}
tm.End();
} else if (argc >= 4 && strcmp(argv[1], "-o") == 0) {
// Rename the current traineddata file to a temporary name.
const char *new_traineddata_filename = argv[2];
@ -120,12 +118,10 @@ int main(int argc, char **argv) {
}

// Initialize TessdataManager with the data in the given traineddata file.
tesseract::TessdataManager tm;
tm.Init(traineddata_filename.string(), 0);
tm.Init(traineddata_filename.string());

// Write the updated traineddata file.
tm.OverwriteComponents(new_traineddata_filename, argv+3, argc-3);
tm.End();
} else {
printf("Usage for combining tessdata components:\n"
" %s language_data_path_prefix\n"
@ -143,4 +139,5 @@ int main(int argc, char **argv) {
" (e.g. %s -u eng.traineddata tmp/eng.)\n", argv[0], argv[0]);
return 1;
}
tm.Directory();
}

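A small hypothetical snippet, for illustration only, showing the new single-argument TessdataManager::Init in isolation; the helper name and path are assumptions and error handling is elided.

#include "tessdatamanager.h"

// Hypothetical helper: open a traineddata file and show its contents.
void ShowTraineddataDirectory(const char* traineddata_path) {
  tesseract::TessdataManager tm;
  tm.Init(traineddata_path);   // was: tm.Init(traineddata_path, 0);
  tm.Directory();              // lists the components, as in the usage branch above
  tm.End();
}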
@ -59,7 +59,6 @@ STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
STRING_PARAM_FLAG(X, "", "File listing font xheights");
STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
STRING_PARAM_FLAG(O, "", "File to write unicharset to");
STRING_PARAM_FLAG(T, "", "File to load trainer from");
STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
DOUBLE_PARAM_FLAG(clusterconfig_min_samples_fraction, Config.MinSamples,
@ -118,10 +117,10 @@ ShapeTable* LoadShapeTable(const STRING& file_prefix) {
ShapeTable* shape_table = nullptr;
STRING shape_table_file = file_prefix;
shape_table_file += kShapeTableFileSuffix;
FILE* shape_fp = fopen(shape_table_file.string(), "rb");
if (shape_fp != nullptr) {
TFile shape_fp;
if (shape_fp.Open(shape_table_file.string(), nullptr)) {
shape_table = new ShapeTable;
if (!shape_table->DeSerialize(false, shape_fp)) {
if (!shape_table->DeSerialize(false, &shape_fp)) {
delete shape_table;
shape_table = nullptr;
tprintf("Error: Failed to read shape table %s\n",
@ -131,7 +130,6 @@ ShapeTable* LoadShapeTable(const STRING& file_prefix) {
tprintf("Read shape table %s of %d shapes\n",
shape_table_file.string(), num_shapes);
}
fclose(shape_fp);
} else {
tprintf("Warning: No shape table file present: %s\n",
shape_table_file.string());
@ -199,75 +197,55 @@ MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
FLAGS_debug_level);
IntFeatureSpace fs;
fs.Init(kBoostXYBuckets, kBoostXYBuckets, kBoostDirBuckets);
if (FLAGS_T.empty()) {
trainer->LoadUnicharset(FLAGS_U.c_str());
// Get basic font information from font_properties.
if (!FLAGS_F.empty()) {
if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
delete trainer;
return nullptr;
}
}
if (!FLAGS_X.empty()) {
if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
delete trainer;
return nullptr;
}
}
trainer->SetFeatureSpace(fs);
const char* page_name;
// Load training data from .tr files on the command line.
while ((page_name = GetNextFilename(argc, argv)) != nullptr) {
tprintf("Reading %s ...\n", page_name);
trainer->ReadTrainingSamples(page_name, feature_defs, false);

// If there is a file with [lang].[fontname].exp[num].fontinfo present,
// read font spacing information in to fontinfo_table.
int pagename_len = strlen(page_name);
char *fontinfo_file_name = new char[pagename_len + 7];
strncpy(fontinfo_file_name, page_name, pagename_len - 2);  // remove "tr"
strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo");  // +"fontinfo"
trainer->AddSpacingInfo(fontinfo_file_name);
delete[] fontinfo_file_name;

// Load the images into memory if required by the classifier.
if (FLAGS_load_images) {
STRING image_name = page_name;
// Chop off the tr and replace with tif. Extension must be tif!
image_name.truncate_at(image_name.length() - 2);
image_name += "tif";
trainer->LoadPageImages(image_name.string());
}
}
trainer->PostLoadCleanup();
// Write the master trainer if required.
if (!FLAGS_output_trainer.empty()) {
FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
if (fp == nullptr) {
tprintf("Can't create saved trainer data!\n");
} else {
trainer->Serialize(fp);
fclose(fp);
}
}
} else {
bool success = false;
tprintf("Loading master trainer from file:%s\n",
FLAGS_T.c_str());
FILE* fp = fopen(FLAGS_T.c_str(), "rb");
if (fp == nullptr) {
tprintf("Can't read file %s to initialize master trainer\n",
FLAGS_T.c_str());
} else {
success = trainer->DeSerialize(false, fp);
fclose(fp);
}
if (!success) {
tprintf("Deserialize of master trainer failed!\n");
trainer->LoadUnicharset(FLAGS_U.c_str());
// Get basic font information from font_properties.
if (!FLAGS_F.empty()) {
if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
delete trainer;
return nullptr;
}
trainer->SetFeatureSpace(fs);
}
if (!FLAGS_X.empty()) {
if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
delete trainer;
return nullptr;
}
}
trainer->SetFeatureSpace(fs);
const char* page_name;
// Load training data from .tr files on the command line.
while ((page_name = GetNextFilename(argc, argv)) != nullptr) {
tprintf("Reading %s ...\n", page_name);
trainer->ReadTrainingSamples(page_name, feature_defs, false);

// If there is a file with [lang].[fontname].exp[num].fontinfo present,
// read font spacing information in to fontinfo_table.
int pagename_len = strlen(page_name);
char* fontinfo_file_name = new char[pagename_len + 7];
strncpy(fontinfo_file_name, page_name, pagename_len - 2);  // remove "tr"
strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo");  // +"fontinfo"
trainer->AddSpacingInfo(fontinfo_file_name);
delete[] fontinfo_file_name;

// Load the images into memory if required by the classifier.
if (FLAGS_load_images) {
STRING image_name = page_name;
// Chop off the tr and replace with tif. Extension must be tif!
image_name.truncate_at(image_name.length() - 2);
image_name += "tif";
trainer->LoadPageImages(image_name.string());
}
}
trainer->PostLoadCleanup();
// Write the master trainer if required.
if (!FLAGS_output_trainer.empty()) {
FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
if (fp == nullptr) {
tprintf("Can't create saved trainer data!\n");
} else {
trainer->Serialize(fp);
fclose(fp);
}
}
trainer->PreTrainingSetup();
if (!FLAGS_O.empty() &&

@ -19,6 +19,7 @@

#include "dawg.h"
#include "host.h"
#include "serialis.h"
#include "tesscallback.h"
#include "trie.h"
#include "unicharset.h"
@ -28,17 +29,20 @@ const int kDictDebugLevel = 1;
tesseract::Dawg *LoadSquishedDawg(const UNICHARSET &unicharset,
const char *filename) {
const int kDictDebugLevel = 1;
FILE *dawg_file = fopen(filename, "rb");
if (dawg_file == nullptr) {
tesseract::TFile dawg_file;
if (!dawg_file.Open(filename, nullptr)) {
tprintf("Could not open %s for reading.\n", filename);
return nullptr;
}
tprintf("Loading word list from %s\n", filename);
tesseract::Dawg *retval = new tesseract::SquishedDawg(
dawg_file, tesseract::DAWG_TYPE_WORD, "eng", SYSTEM_DAWG_PERM,
kDictDebugLevel);
tesseract::SquishedDawg *retval = new tesseract::SquishedDawg(
tesseract::DAWG_TYPE_WORD, "eng", SYSTEM_DAWG_PERM, kDictDebugLevel);
if (!retval->Load(&dawg_file)) {
tprintf("Could not read %s\n", filename);
delete retval;
return nullptr;
}
tprintf("Word list loaded.\n");
fclose(dawg_file);
return retval;
}

@ -100,17 +100,15 @@ bool ParamsModel::Equivalent(const ParamsModel &that) const {
bool ParamsModel::LoadFromFile(
const char *lang,
const char *full_path) {
FILE *fp = fopen(full_path, "rb");
if (!fp) {
TFile fp;
if (!fp.Open(full_path, nullptr)) {
tprintf("Error opening file %s\n", full_path);
return false;
}
bool result = LoadFromFp(lang, fp, -1);
fclose(fp);
return result;
return LoadFromFp(lang, &fp);
}

bool ParamsModel::LoadFromFp(const char *lang, FILE *fp, inT64 end_offset) {
bool ParamsModel::LoadFromFp(const char *lang, TFile *fp) {
const int kMaxLineSize = 100;
char line[kMaxLineSize];
BitVector present;
@ -120,9 +118,8 @@ bool ParamsModel::LoadFromFp(const char *lang, FILE *fp, inT64 end_offset) {
GenericVector<float> &weights = weights_vec_[pass_];
weights.init_to_size(PTRAIN_NUM_FEATURE_TYPES, 0.0);

while ((end_offset < 0 || ftell(fp) < end_offset) &&
fgets(line, kMaxLineSize, fp)) {
char *key = NULL;
while (fp->FGets(line, kMaxLineSize) != nullptr) {
char *key = nullptr;
float value;
if (!ParseLine(line, &key, &value))
continue;

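A hypothetical sketch of line-oriented reading through TFile, mirroring the FGets loop above; the helper name, path, and buffer size are assumptions, not part of this commit.

#include "serialis.h"
#include "tprintf.h"

// Hypothetical helper: echo a text file line by line through the TFile API.
bool DumpTextFile(const char* path) {
  tesseract::TFile fp;
  if (!fp.Open(path, nullptr)) return false;   // nullptr = default file reader
  const int kBufSize = 100;
  char line[kBufSize];
  while (fp.FGets(line, kBufSize) != nullptr) {
    tprintf("%s", line);                       // FGets returns nullptr at end of data
  }
  return true;
}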
@ -61,7 +61,7 @@ class ParamsModel {

// Returns true on success.
bool LoadFromFile(const char *lang, const char *full_path);
bool LoadFromFp(const char *lang, FILE *fp, inT64 end_offset);
bool LoadFromFp(const char *lang, TFile *fp);

const GenericVector<float>& weights() const {
return weights_vec_[pass_];

@ -44,14 +44,14 @@ namespace tesseract {
* and Dawg models.
*/
void Wordrec::program_editup(const char *textbase,
bool init_classifier,
bool init_dict) {
TessdataManager *init_classifier,
TessdataManager *init_dict) {
if (textbase != NULL) imagefile = textbase;
InitFeatureDefs(&feature_defs_);
InitAdaptiveClassifier(init_classifier);
if (init_dict) {
getDict().SetupForLoad(Dict::GlobalDawgCache());
getDict().Load(tessdata_manager.GetDataFileName().string(), lang);
getDict().Load(lang, init_dict);
getDict().FinishLoad();
}
pass2_ok_split = chop_ok_split;

@ -200,9 +200,8 @@ class Wordrec : public Classify {
}

// tface.cpp
void program_editup(const char *textbase,
bool init_classifier,
bool init_permute);
void program_editup(const char *textbase, TessdataManager *init_classifier,
TessdataManager *init_dict);
void cc_recog(WERD_RES *word);
void program_editdown(inT32 elasped_time);
void set_pass1();