Format modified code with clang-format

Format the files which were changed in
commit 297d7d86ce.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
Stefan Weil 2019-03-24 21:10:29 +01:00
parent 06acbaf99c
commit 91e2b253c0
5 changed files with 292 additions and 316 deletions

View File

@ -21,42 +21,42 @@
// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
# include "config_auto.h"
#endif
#include "basedir.h"
#include "tessvars.h"
#include "control.h"
#include "reject.h"
#include "pageres.h"
#include "pgedit.h"
#include "tprintf.h"
#include "tessedit.h"
#include "reject.h"
#include "stopper.h"
#include "tessedit.h"
#include "tessvars.h"
#include "tprintf.h"
#ifndef DISABLED_LEGACY_ENGINE
#include "intmatcher.h"
#include "chop.h"
# include "chop.h"
# include "intmatcher.h"
#endif
#ifndef ANDROID_BUILD
#include "lstmrecognizer.h"
# include "lstmrecognizer.h"
#endif
#include "tesseractclass.h"
#include "params.h"
#include "tesseractclass.h"
#ifdef DISABLED_LEGACY_ENGINE
#include "matchdefs.h"
# include "matchdefs.h"
#endif
// config under api
#define API_CONFIG "configs/api_config"
// config under api
#define API_CONFIG "configs/api_config"
ETEXT_DESC *global_monitor = nullptr; // progress monitor
ETEXT_DESC* global_monitor = nullptr; // progress monitor
namespace tesseract {
// Read a "config" file containing a set of variable, value pairs.
// Searches the standard places: tessdata/configs, tessdata/tessconfigs
// and also accepts a relative or absolute path name.
void Tesseract::read_config_file(const char *filename,
void Tesseract::read_config_file(const char* filename,
SetParamConstraint constraint) {
STRING path = datadir;
path += "configs/";
@ -88,11 +88,11 @@ void Tesseract::read_config_file(const char *filename,
// the config files specified on the command line or left as the default
// OEM_TESSERACT_ONLY if none of the configs specify this variable.
bool Tesseract::init_tesseract_lang_data(
const char *arg0, const char *textbase, const char *language,
OcrEngineMode oem, char **configs, int configs_size,
const GenericVector<STRING> *vars_vec,
const GenericVector<STRING> *vars_values, bool set_only_non_debug_params,
TessdataManager *mgr) {
const char* arg0, const char* textbase, const char* language,
OcrEngineMode oem, char** configs, int configs_size,
const GenericVector<STRING>* vars_vec,
const GenericVector<STRING>* vars_values, bool set_only_non_debug_params,
TessdataManager* mgr) {
// Set the basename, compute the data directory.
main_setup(arg0, textbase);
@ -106,8 +106,9 @@ bool Tesseract::init_tesseract_lang_data(
STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
if (!mgr->is_loaded() && !mgr->Init(tessdata_path.string())) {
tprintf("Error opening data file %s\n", tessdata_path.string());
tprintf("Please make sure the TESSDATA_PREFIX environment variable is set"
" to your \"tessdata\" directory.\n");
tprintf(
"Please make sure the TESSDATA_PREFIX environment variable is set"
" to your \"tessdata\" directory.\n");
return false;
}
#ifndef DISABLED_LEGACY_ENGINE
@ -131,8 +132,9 @@ bool Tesseract::init_tesseract_lang_data(
this->params());
}
SetParamConstraint set_params_constraint = set_only_non_debug_params ?
SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE;
SetParamConstraint set_params_constraint =
set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY
: SET_PARAM_CONSTRAINT_NONE;
// Load tesseract variables from config files. This is done after loading
// language-specific variables from [lang].traineddata file, so that custom
// config files can override values in [lang].traineddata file.
@ -153,8 +155,8 @@ bool Tesseract::init_tesseract_lang_data(
}
}
if (((STRING &)tessedit_write_params_to_file).length() > 0) {
FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb");
if (((STRING&)tessedit_write_params_to_file).length() > 0) {
FILE* params_file = fopen(tessedit_write_params_to_file.string(), "wb");
if (params_file != nullptr) {
ParamUtils::PrintParams(params_file, this->params());
fclose(params_file);
@ -177,16 +179,16 @@ bool Tesseract::init_tesseract_lang_data(
// engine-specific data files need to be loaded.
// If LSTM_ONLY is requested, the base Tesseract files are *Not* required.
#ifndef ANDROID_BUILD
#ifdef DISABLED_LEGACY_ENGINE
# ifdef DISABLED_LEGACY_ENGINE
if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
#else
# else
if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
#endif // ndef DISABLED_LEGACY_ENGINE
# endif // ndef DISABLED_LEGACY_ENGINE
if (mgr->IsComponentAvailable(TESSDATA_LSTM)) {
lstm_recognizer_ = new LSTMRecognizer;
ASSERT_HOST(
lstm_recognizer_->Load(this->params(), lstm_use_matrix ? language : nullptr, mgr));
ASSERT_HOST(lstm_recognizer_->Load(
this->params(), lstm_use_matrix ? language : nullptr, mgr));
} else {
tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
@ -203,7 +205,7 @@ bool Tesseract::init_tesseract_lang_data(
}
#ifndef DISABLED_LEGACY_ENGINE
else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) ||
!unicharset.load_from_file(&fp, false)) {
!unicharset.load_from_file(&fp, false)) {
return false;
}
#endif // ndef DISABLED_LEGACY_ENGINE
@ -228,8 +230,8 @@ bool Tesseract::init_tesseract_lang_data(
// Init ParamsModel.
// Load pass1 and pass2 weights (for now these two sets are the same, but in
// the future separate sets of weights can be generated).
for (int p = ParamsModel::PTRAIN_PASS1;
p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {
for (int p = ParamsModel::PTRAIN_PASS1; p < ParamsModel::PTRAIN_NUM_PASSES;
++p) {
language_model_->getParamsModel().SetPass(
static_cast<ParamsModel::PassEnum>(p));
if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
@ -247,8 +249,7 @@ bool Tesseract::init_tesseract_lang_data(
static bool IsStrInList(const STRING& str,
const GenericVector<STRING>& str_list) {
for (int i = 0; i < str_list.size(); ++i) {
if (str_list[i] == str)
return true;
if (str_list[i] == str) return true;
}
return false;
}
@ -265,8 +266,7 @@ void Tesseract::ParseLanguageString(const char* lang_str,
while (remains.length() > 0) {
// Find the start of the lang code and which vector to add to.
const char* start = remains.string();
while (*start == '+')
++start;
while (*start == '+') ++start;
GenericVector<STRING>* target = to_load;
if (*start == '~') {
target = not_to_load;
@ -275,8 +275,7 @@ void Tesseract::ParseLanguageString(const char* lang_str,
// Find the index of the end of the lang code in string start.
int end = strlen(start);
const char* plus = strchr(start, '+');
if (plus != nullptr && plus - start < end)
end = plus - start;
if (plus != nullptr && plus - start < end) end = plus - start;
STRING lang_code(start);
lang_code.truncate_at(end);
STRING next(start + end);
@ -292,13 +291,13 @@ void Tesseract::ParseLanguageString(const char* lang_str,
// string and recursively any additional languages required by any language
// traineddata file (via tessedit_load_sublangs in its config) that is loaded.
// See init_tesseract_internal for args.
int Tesseract::init_tesseract(const char *arg0, const char *textbase,
const char *language, OcrEngineMode oem,
char **configs, int configs_size,
const GenericVector<STRING> *vars_vec,
const GenericVector<STRING> *vars_values,
int Tesseract::init_tesseract(const char* arg0, const char* textbase,
const char* language, OcrEngineMode oem,
char** configs, int configs_size,
const GenericVector<STRING>* vars_vec,
const GenericVector<STRING>* vars_values,
bool set_only_non_debug_params,
TessdataManager *mgr) {
TessdataManager* mgr) {
GenericVector<STRING> langs_to_load;
GenericVector<STRING> langs_not_to_load;
ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
@ -311,8 +310,8 @@ int Tesseract::init_tesseract(const char *arg0, const char *textbase,
// Load the rest into sub_langs_.
for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
const char *lang_str = langs_to_load[lang_index].string();
Tesseract *tess_to_init;
const char* lang_str = langs_to_load[lang_index].string();
Tesseract* tess_to_init;
if (!loaded_primary) {
tess_to_init = this;
} else {
@ -392,13 +391,13 @@ int Tesseract::init_tesseract(const char *arg0, const char *textbase,
// in vars_vec.
// If set_only_init_params is true, then only the initialization variables
// will be set.
int Tesseract::init_tesseract_internal(const char *arg0, const char *textbase,
const char *language, OcrEngineMode oem,
char **configs, int configs_size,
const GenericVector<STRING> *vars_vec,
const GenericVector<STRING> *vars_values,
int Tesseract::init_tesseract_internal(const char* arg0, const char* textbase,
const char* language, OcrEngineMode oem,
char** configs, int configs_size,
const GenericVector<STRING>* vars_vec,
const GenericVector<STRING>* vars_values,
bool set_only_non_debug_params,
TessdataManager *mgr) {
TessdataManager* mgr) {
if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
configs_size, vars_vec, vars_values,
set_only_non_debug_params, mgr)) {
@ -412,7 +411,7 @@ int Tesseract::init_tesseract_internal(const char *arg0, const char *textbase,
bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY;
program_editup(textbase, init_tesseract ? mgr : nullptr,
init_tesseract ? mgr : nullptr);
return 0; //Normal exit
return 0; // Normal exit
}
#ifndef DISABLED_LEGACY_ENGINE
@ -458,8 +457,8 @@ void Tesseract::SetupUniversalFontIds() {
}
// init the LM component
int Tesseract::init_tesseract_lm(const char *arg0, const char *textbase,
const char *language, TessdataManager *mgr) {
int Tesseract::init_tesseract_lm(const char* arg0, const char* textbase,
const char* language, TessdataManager* mgr) {
if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
nullptr, 0, nullptr, nullptr, false, mgr))
return -1;
@ -471,14 +470,11 @@ int Tesseract::init_tesseract_lm(const char *arg0, const char *textbase,
#endif // ndef DISABLED_LEGACY_ENGINE
void Tesseract::end_tesseract() {
end_recog();
}
void Tesseract::end_tesseract() { end_recog(); }
/* Define command type identifiers */
enum CMD_EVENTS
{
enum CMD_EVENTS {
ACTION_1_CMD_EVENT,
RECOG_WERDS,
RECOG_PSEUDO,

View File

@ -17,13 +17,13 @@
*
**********************************************************************/
#ifndef PARAMS_H
#define PARAMS_H
#ifndef PARAMS_H
#define PARAMS_H
#include <cstdio>
#include <cstdio>
#include "genericvector.h"
#include "strngs.h"
#include "genericvector.h"
#include "strngs.h"
namespace tesseract {
@ -41,10 +41,10 @@ enum SetParamConstraint {
};
struct ParamsVectors {
GenericVector<IntParam *> int_params;
GenericVector<BoolParam *> bool_params;
GenericVector<StringParam *> string_params;
GenericVector<DoubleParam *> double_params;
GenericVector<IntParam*> int_params;
GenericVector<BoolParam*> bool_params;
GenericVector<StringParam*> string_params;
GenericVector<DoubleParam*> double_params;
};
// Utility functions for working with Tesseract parameters.
@ -55,27 +55,25 @@ class ParamUtils {
// ORed or ANDed with any current values.
// Blank lines and lines beginning # are ignored.
// Values may have any whitespace after the name and are the rest of line.
static bool ReadParamsFile(
const char *file, // filename to read
SetParamConstraint constraint,
ParamsVectors *member_params);
static bool ReadParamsFile(const char* file, // filename to read
SetParamConstraint constraint,
ParamsVectors* member_params);
// Read parameters from the given file pointer.
static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp,
ParamsVectors *member_params);
static bool ReadParamsFromFp(SetParamConstraint constraint, TFile* fp,
ParamsVectors* member_params);
// Set a parameter to have the given value.
static bool SetParam(const char *name, const char* value,
static bool SetParam(const char* name, const char* value,
SetParamConstraint constraint,
ParamsVectors *member_params);
ParamsVectors* member_params);
// Returns the pointer to the parameter with the given name (of the
// appropriate type) if it was found in the vector obtained from
// GlobalParams() or in the given member_params.
template<class T>
static T *FindParam(const char *name,
const GenericVector<T *> &global_vec,
const GenericVector<T *> &member_vec) {
template <class T>
static T* FindParam(const char* name, const GenericVector<T*>& global_vec,
const GenericVector<T*>& member_vec) {
int i;
for (i = 0; i < global_vec.size(); ++i) {
if (strcmp(global_vec[i]->name_str(), name) == 0) return global_vec[i];
@ -86,8 +84,8 @@ class ParamUtils {
return nullptr;
}
// Removes the given pointer to the param from the given vector.
template<class T>
static void RemoveParam(T *param_ptr, GenericVector<T *> *vec) {
template <class T>
static void RemoveParam(T* param_ptr, GenericVector<T*>* vec) {
for (int i = 0; i < vec->size(); ++i) {
if ((*vec)[i] == param_ptr) {
vec->remove(i);
@ -97,12 +95,12 @@ class ParamUtils {
}
// Fetches the value of the named param as a STRING. Returns false if not
// found.
static bool GetParamAsString(const char *name,
static bool GetParamAsString(const char* name,
const ParamsVectors* member_params,
STRING *value);
STRING* value);
// Print parameters to the given file.
static void PrintParams(FILE *fp, const ParamsVectors *member_params);
static void PrintParams(FILE* fp, const ParamsVectors* member_params);
// Resets all parameters back to default values.
static void ResetToDefaults(ParamsVectors* member_params);
@ -113,36 +111,36 @@ class Param {
public:
~Param() = default;
const char *name_str() const { return name_; }
const char *info_str() const { return info_; }
const char* name_str() const { return name_; }
const char* info_str() const { return info_; }
bool is_init() const { return init_; }
bool is_debug() const { return debug_; }
bool constraint_ok(SetParamConstraint constraint) const {
return (constraint == SET_PARAM_CONSTRAINT_NONE ||
(constraint == SET_PARAM_CONSTRAINT_DEBUG_ONLY &&
this->is_debug()) ||
(constraint == SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY &&
!this->is_debug()) ||
(constraint == SET_PARAM_CONSTRAINT_NON_INIT_ONLY &&
!this->is_init()));
return (
constraint == SET_PARAM_CONSTRAINT_NONE ||
(constraint == SET_PARAM_CONSTRAINT_DEBUG_ONLY && this->is_debug()) ||
(constraint == SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY &&
!this->is_debug()) ||
(constraint == SET_PARAM_CONSTRAINT_NON_INIT_ONLY && !this->is_init()));
}
protected:
Param(const char *name, const char *comment, bool init) :
name_(name), info_(comment), init_(init) {
Param(const char* name, const char* comment, bool init)
: name_(name), info_(comment), init_(init) {
debug_ = (strstr(name, "debug") != nullptr) || (strstr(name, "display"));
}
const char *name_; // name of this parameter
const char *info_; // for menus
bool init_; // needs to be set before init
const char* name_; // name of this parameter
const char* info_; // for menus
bool init_; // needs to be set before init
bool debug_;
};
class IntParam : public Param {
public:
IntParam(int32_t value, const char *name, const char *comment, bool init,
ParamsVectors *vec) : Param(name, comment, init) {
public:
IntParam(int32_t value, const char* name, const char* comment, bool init,
ParamsVectors* vec)
: Param(name, comment, init) {
value_ = value;
default_ = value;
params_vec_ = &(vec->int_params);
@ -152,29 +150,29 @@ class IntParam : public Param {
operator int32_t() const { return value_; }
void operator=(int32_t value) { value_ = value; }
void set_value(int32_t value) { value_ = value; }
void ResetToDefault() {
value_ = default_;
}
void ResetToDefault() { value_ = default_; }
void ResetFrom(const ParamsVectors* vec) {
for (int i = 0; i < vec->int_params.size(); ++i) {
if (strcmp(vec->int_params[i]->name_str(), name_) == 0) {
//printf("overriding param %s=%d by =%d\n", name_, value_, *vec->int_params[i]);
// printf("overriding param %s=%d by =%d\n", name_, value_,
// *vec->int_params[i]);
value_ = *vec->int_params[i];
}
}
}
private:
int32_t value_;
int32_t default_;
// Pointer to the vector that contains this param (not owned by this class).
GenericVector<IntParam *> *params_vec_;
GenericVector<IntParam*>* params_vec_;
};
class BoolParam : public Param {
public:
BoolParam(bool value, const char *name, const char *comment, bool init,
ParamsVectors *vec) : Param(name, comment, init) {
BoolParam(bool value, const char* name, const char* comment, bool init,
ParamsVectors* vec)
: Param(name, comment, init) {
value_ = value;
default_ = value;
params_vec_ = &(vec->bool_params);
@ -184,13 +182,12 @@ class BoolParam : public Param {
operator BOOL8() const { return value_; }
void operator=(BOOL8 value) { value_ = value; }
void set_value(BOOL8 value) { value_ = value; }
void ResetToDefault() {
value_ = default_;
}
void ResetToDefault() { value_ = default_; }
void ResetFrom(const ParamsVectors* vec) {
for (int i = 0; i < vec->bool_params.size(); ++i) {
if (strcmp(vec->bool_params[i]->name_str(), name_) == 0) {
//printf("overriding param %s=%s by =%s\n", name_, value_ ? "true" : "false", *vec->bool_params[i] ? "true" : "false");
// printf("overriding param %s=%s by =%s\n", name_, value_ ? "true" :
// "false", *vec->bool_params[i] ? "true" : "false");
value_ = *vec->bool_params[i];
}
}
@ -200,34 +197,33 @@ class BoolParam : public Param {
BOOL8 value_;
BOOL8 default_;
// Pointer to the vector that contains this param (not owned by this class).
GenericVector<BoolParam *> *params_vec_;
GenericVector<BoolParam*>* params_vec_;
};
class StringParam : public Param {
public:
StringParam(const char *value, const char *name,
const char *comment, bool init,
ParamsVectors *vec) : Param(name, comment, init) {
StringParam(const char* value, const char* name, const char* comment,
bool init, ParamsVectors* vec)
: Param(name, comment, init) {
value_ = value;
default_ = value;
params_vec_ = &(vec->string_params);
vec->string_params.push_back(this);
}
~StringParam() { ParamUtils::RemoveParam<StringParam>(this, params_vec_); }
operator STRING &() { return value_; }
const char *string() const { return value_.string(); }
const char *c_str() const { return value_.string(); }
operator STRING&() { return value_; }
const char* string() const { return value_.string(); }
const char* c_str() const { return value_.string(); }
bool empty() { return value_.length() <= 0; }
bool operator==(const STRING& other) { return value_ == other; }
void operator=(const STRING& value) { value_ = value; }
void set_value(const STRING& value) { value_ = value; }
void ResetToDefault() {
value_ = default_;
}
void ResetToDefault() { value_ = default_; }
void ResetFrom(const ParamsVectors* vec) {
for (int i = 0; i < vec->string_params.size(); ++i) {
if (strcmp(vec->string_params[i]->name_str(), name_) == 0) {
//printf("overriding param %s=%s by =%s\n", name_, value_, vec->string_params[i]->c_str());
// printf("overriding param %s=%s by =%s\n", name_, value_,
// vec->string_params[i]->c_str());
value_ = *vec->string_params[i];
}
}
@ -237,13 +233,14 @@ class StringParam : public Param {
STRING value_;
STRING default_;
// Pointer to the vector that contains this param (not owned by this class).
GenericVector<StringParam *> *params_vec_;
GenericVector<StringParam*>* params_vec_;
};
class DoubleParam : public Param {
public:
DoubleParam(double value, const char *name, const char *comment,
bool init, ParamsVectors *vec) : Param(name, comment, init) {
DoubleParam(double value, const char* name, const char* comment, bool init,
ParamsVectors* vec)
: Param(name, comment, init) {
value_ = value;
default_ = value;
params_vec_ = &(vec->double_params);
@ -253,13 +250,12 @@ class DoubleParam : public Param {
operator double() const { return value_; }
void operator=(double value) { value_ = value; }
void set_value(double value) { value_ = value; }
void ResetToDefault() {
value_ = default_;
}
void ResetToDefault() { value_ = default_; }
void ResetFrom(const ParamsVectors* vec) {
for (int i = 0; i < vec->double_params.size(); ++i) {
if (strcmp(vec->double_params[i]->name_str(), name_) == 0) {
//printf("overriding param %s=%f by =%f\n", name_, value_, *vec->double_params[i]);
// printf("overriding param %s=%f by =%f\n", name_, value_,
// *vec->double_params[i]);
value_ = *vec->double_params[i];
}
}
@ -269,7 +265,7 @@ class DoubleParam : public Param {
double value_;
double default_;
// Pointer to the vector that contains this param (not owned by this class).
GenericVector<DoubleParam *> *params_vec_;
GenericVector<DoubleParam*>* params_vec_;
};
} // namespace tesseract
@ -283,7 +279,7 @@ class DoubleParam : public Param {
//
// TODO(daria): remove GlobalParams() when all global Tesseract
// parameters are converted to members.
tesseract::ParamsVectors *GlobalParams();
tesseract::ParamsVectors* GlobalParams();
/*************************************************************************
* Note on defining parameters.
@ -293,52 +289,48 @@ tesseract::ParamsVectors *GlobalParams();
* (there is no such guarantee for parameters defined with the other macros).
*************************************************************************/
#define INT_VAR_H(name,val,comment)\
tesseract::IntParam name
#define INT_VAR_H(name, val, comment) tesseract::IntParam name
#define BOOL_VAR_H(name,val,comment)\
tesseract::BoolParam name
#define BOOL_VAR_H(name, val, comment) tesseract::BoolParam name
#define STRING_VAR_H(name,val,comment)\
tesseract::StringParam name
#define STRING_VAR_H(name, val, comment) tesseract::StringParam name
#define double_VAR_H(name,val,comment)\
tesseract::DoubleParam name
#define double_VAR_H(name, val, comment) tesseract::DoubleParam name
#define INT_VAR(name,val,comment)\
tesseract::IntParam name(val,#name,comment,false,GlobalParams())
#define INT_VAR(name, val, comment) \
tesseract::IntParam name(val, #name, comment, false, GlobalParams())
#define BOOL_VAR(name,val,comment)\
tesseract::BoolParam name(val,#name,comment,false,GlobalParams())
#define BOOL_VAR(name, val, comment) \
tesseract::BoolParam name(val, #name, comment, false, GlobalParams())
#define STRING_VAR(name,val,comment)\
tesseract::StringParam name(val,#name,comment,false,GlobalParams())
#define STRING_VAR(name, val, comment) \
tesseract::StringParam name(val, #name, comment, false, GlobalParams())
#define double_VAR(name,val,comment)\
tesseract::DoubleParam name(val,#name,comment,false,GlobalParams())
#define double_VAR(name, val, comment) \
tesseract::DoubleParam name(val, #name, comment, false, GlobalParams())
#define INT_MEMBER(name, val, comment, vec)\
#define INT_MEMBER(name, val, comment, vec) \
name(val, #name, comment, false, vec)
#define BOOL_MEMBER(name, val, comment, vec)\
#define BOOL_MEMBER(name, val, comment, vec) \
name(val, #name, comment, false, vec)
#define STRING_MEMBER(name, val, comment, vec)\
#define STRING_MEMBER(name, val, comment, vec) \
name(val, #name, comment, false, vec)
#define double_MEMBER(name, val, comment, vec)\
#define double_MEMBER(name, val, comment, vec) \
name(val, #name, comment, false, vec)
#define INT_INIT_MEMBER(name, val, comment, vec)\
#define INT_INIT_MEMBER(name, val, comment, vec) \
name(val, #name, comment, true, vec)
#define BOOL_INIT_MEMBER(name, val, comment, vec)\
#define BOOL_INIT_MEMBER(name, val, comment, vec) \
name(val, #name, comment, true, vec)
#define STRING_INIT_MEMBER(name, val, comment, vec)\
#define STRING_INIT_MEMBER(name, val, comment, vec) \
name(val, #name, comment, true, vec)
#define double_INIT_MEMBER(name, val, comment, vec)\
#define double_INIT_MEMBER(name, val, comment, vec) \
name(val, #name, comment, true, vec)
#endif

View File

@ -27,7 +27,7 @@ namespace tesseract {
class Image;
Dict::Dict(CCUtil *ccutil)
Dict::Dict(CCUtil* ccutil)
: letter_is_okay_(&tesseract::Dict::def_letter_is_okay),
probability_in_context_(&tesseract::Dict::def_probability_in_context),
params_model_classify_(nullptr),
@ -190,7 +190,7 @@ Dict::~Dict() {
if (output_ambig_words_file_ != nullptr) fclose(output_ambig_words_file_);
}
DawgCache *Dict::GlobalDawgCache() {
DawgCache* Dict::GlobalDawgCache() {
// This global cache (a singleton) will outlive every Tesseract instance
// (even those that someone else might declare as global statics).
static DawgCache cache;
@ -198,7 +198,7 @@ DawgCache *Dict::GlobalDawgCache() {
}
// Sets up ready for a Load or LoadLSTM.
void Dict::SetupForLoad(DawgCache *dawg_cache) {
void Dict::SetupForLoad(DawgCache* dawg_cache) {
if (dawgs_.length() != 0) this->End();
apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
@ -216,7 +216,7 @@ void Dict::SetupForLoad(DawgCache *dawg_cache) {
}
// Loads the dawgs needed by Tesseract. Call FinishLoad() after.
void Dict::Load(const STRING &lang, TessdataManager *data_file) {
void Dict::Load(const STRING& lang, TessdataManager* data_file) {
// Load dawgs_.
if (load_punc_dawg) {
punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG,
@ -224,12 +224,12 @@ void Dict::Load(const STRING &lang, TessdataManager *data_file) {
if (punc_dawg_) dawgs_ += punc_dawg_;
}
if (load_system_dawg) {
Dawg *system_dawg = dawg_cache_->GetSquishedDawg(
Dawg* system_dawg = dawg_cache_->GetSquishedDawg(
lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
if (system_dawg) dawgs_ += system_dawg;
}
if (load_number_dawg) {
Dawg *number_dawg = dawg_cache_->GetSquishedDawg(
Dawg* number_dawg = dawg_cache_->GetSquishedDawg(
lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
if (number_dawg) dawgs_ += number_dawg;
}
@ -251,15 +251,15 @@ void Dict::Load(const STRING &lang, TessdataManager *data_file) {
}
STRING name;
if (((STRING &)user_words_suffix).length() > 0 ||
((STRING &)user_words_file).length() > 0) {
Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
if (((STRING&)user_words_suffix).length() > 0 ||
((STRING&)user_words_file).length() > 0) {
Trie* trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
getUnicharset().size(), dawg_debug_level);
if (((STRING &)user_words_file).length() > 0) {
name = user_words_file;
if (((STRING&)user_words_file).length() > 0) {
name = user_words_file;
} else {
name = getCCUtil()->language_data_path_prefix;
name += user_words_suffix;
name = getCCUtil()->language_data_path_prefix;
name += user_words_suffix;
}
if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
Trie::RRP_REVERSE_IF_HAS_RTL)) {
@ -270,16 +270,16 @@ void Dict::Load(const STRING &lang, TessdataManager *data_file) {
}
}
if (((STRING &)user_patterns_suffix).length() > 0 ||
((STRING &)user_patterns_file).length() > 0) {
Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
if (((STRING&)user_patterns_suffix).length() > 0 ||
((STRING&)user_patterns_file).length() > 0) {
Trie* trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
getUnicharset().size(), dawg_debug_level);
trie_ptr->initialize_patterns(&(getUnicharset()));
if (((STRING &)user_patterns_file).length() > 0) {
name = user_patterns_file;
if (((STRING&)user_patterns_file).length() > 0) {
name = user_patterns_file;
} else {
name = getCCUtil()->language_data_path_prefix;
name += user_patterns_suffix;
name = getCCUtil()->language_data_path_prefix;
name += user_patterns_suffix;
}
if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
tprintf("Error: failed to load %s\n", name.string());
@ -299,7 +299,7 @@ void Dict::Load(const STRING &lang, TessdataManager *data_file) {
}
// Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
void Dict::LoadLSTM(const STRING &lang, TessdataManager *data_file) {
void Dict::LoadLSTM(const STRING& lang, TessdataManager* data_file) {
// Load dawgs_.
if (load_punc_dawg) {
punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG,
@ -307,27 +307,28 @@ void Dict::LoadLSTM(const STRING &lang, TessdataManager *data_file) {
if (punc_dawg_) dawgs_ += punc_dawg_;
}
if (load_system_dawg) {
Dawg *system_dawg = dawg_cache_->GetSquishedDawg(
Dawg* system_dawg = dawg_cache_->GetSquishedDawg(
lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
if (system_dawg) dawgs_ += system_dawg;
}
if (load_number_dawg) {
Dawg *number_dawg = dawg_cache_->GetSquishedDawg(
Dawg* number_dawg = dawg_cache_->GetSquishedDawg(
lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
if (number_dawg) dawgs_ += number_dawg;
}
// stolen from Dict::Load (but needs params_ from Tesseract langdata/config/api):
// stolen from Dict::Load (but needs params_ from Tesseract
// langdata/config/api):
STRING name;
if (((STRING &)user_words_suffix).length() > 0 ||
((STRING &)user_words_file).length() > 0) {
Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
if (((STRING&)user_words_suffix).length() > 0 ||
((STRING&)user_words_file).length() > 0) {
Trie* trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
getUnicharset().size(), dawg_debug_level);
if (((STRING &)user_words_file).length() > 0) {
name = user_words_file;
if (((STRING&)user_words_file).length() > 0) {
name = user_words_file;
} else {
name = getCCUtil()->language_data_path_prefix;
name += user_words_suffix;
name = getCCUtil()->language_data_path_prefix;
name += user_words_suffix;
}
if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
Trie::RRP_REVERSE_IF_HAS_RTL)) {
@ -338,16 +339,16 @@ void Dict::LoadLSTM(const STRING &lang, TessdataManager *data_file) {
}
}
if (((STRING &)user_patterns_suffix).length() > 0 ||
((STRING &)user_patterns_file).length() > 0) {
Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
if (((STRING&)user_patterns_suffix).length() > 0 ||
((STRING&)user_patterns_file).length() > 0) {
Trie* trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
getUnicharset().size(), dawg_debug_level);
trie_ptr->initialize_patterns(&(getUnicharset()));
if (((STRING &)user_patterns_file).length() > 0) {
name = user_patterns_file;
if (((STRING&)user_patterns_file).length() > 0) {
name = user_patterns_file;
} else {
name = getCCUtil()->language_data_path_prefix;
name += user_patterns_suffix;
name = getCCUtil()->language_data_path_prefix;
name += user_patterns_suffix;
}
if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
tprintf("Error: failed to load %s\n", name.string());
@ -356,7 +357,6 @@ void Dict::LoadLSTM(const STRING &lang, TessdataManager *data_file) {
dawgs_ += trie_ptr;
}
}
}
// Completes the loading process after Load() and/or LoadLSTM().
@ -368,13 +368,14 @@ bool Dict::FinishLoad() {
// indices into the dawgs_ vector of the successors for dawg i.
successors_.reserve(dawgs_.length());
for (int i = 0; i < dawgs_.length(); ++i) {
const Dawg *dawg = dawgs_[i];
SuccessorList *lst = new SuccessorList();
const Dawg* dawg = dawgs_[i];
SuccessorList* lst = new SuccessorList();
for (int j = 0; j < dawgs_.length(); ++j) {
const Dawg *other = dawgs_[j];
const Dawg* other = dawgs_[j];
if (dawg != nullptr && other != nullptr &&
(dawg->lang() == other->lang()) &&
kDawgSuccessors[dawg->type()][other->type()]) *lst += j;
kDawgSuccessors[dawg->type()][other->type()])
*lst += j;
}
successors_ += lst;
}
@ -382,8 +383,7 @@ bool Dict::FinishLoad() {
}
void Dict::End() {
if (dawgs_.length() == 0)
return; // Not safe to call twice.
if (dawgs_.length() == 0) return; // Not safe to call twice.
for (int i = 0; i < dawgs_.size(); i++) {
if (!dawg_cache_->FreeDawg(dawgs_[i])) {
delete dawgs_[i];
@ -405,19 +405,18 @@ void Dict::End() {
// Returns true if in light of the current state unichar_id is allowed
// according to at least one of the dawgs in the dawgs_ vector.
// See more extensive comments in dict.h where this function is declared.
int Dict::def_letter_is_okay(void* void_dawg_args,
const UNICHARSET& unicharset,
UNICHAR_ID unichar_id,
bool word_end) const {
DawgArgs *dawg_args = static_cast<DawgArgs *>(void_dawg_args);
int Dict::def_letter_is_okay(void* void_dawg_args, const UNICHARSET& unicharset,
UNICHAR_ID unichar_id, bool word_end) const {
DawgArgs* dawg_args = static_cast<DawgArgs*>(void_dawg_args);
ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
if (dawg_debug_level >= 3) {
tprintf("def_letter_is_okay: current unichar=%s word_end=%d"
" num active dawgs=%d\n",
getUnicharset().debug_str(unichar_id).string(), word_end,
dawg_args->active_dawgs->length());
tprintf(
"def_letter_is_okay: current unichar=%s word_end=%d"
" num active dawgs=%d\n",
getUnicharset().debug_str(unichar_id).string(), word_end,
dawg_args->active_dawgs->length());
}
// Do not accept words that contain kPatternUnicharID.
@ -438,9 +437,10 @@ int Dict::def_letter_is_okay(void* void_dawg_args,
// with the updated ref (an edge with the corresponding unichar id) into
// dawg_args->updated_pos.
for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) {
const DawgPosition &pos = (*dawg_args->active_dawgs)[a];
const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;
const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;
const DawgPosition& pos = (*dawg_args->active_dawgs)[a];
const Dawg* punc_dawg =
pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;
const Dawg* dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;
if (!dawg && !punc_dawg) {
// shouldn't happen.
@ -450,23 +450,23 @@ int Dict::def_letter_is_okay(void* void_dawg_args,
if (!dawg) {
// We're in the punctuation dawg. A core dawg has not been chosen.
NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
EDGE_REF punc_transition_edge = punc_dawg->edge_char_of(
punc_node, Dawg::kPatternUnicharID, word_end);
EDGE_REF punc_transition_edge =
punc_dawg->edge_char_of(punc_node, Dawg::kPatternUnicharID, word_end);
if (punc_transition_edge != NO_EDGE) {
// Find all successors, and see which can transition.
const SuccessorList &slist = *(successors_[pos.punc_index]);
const SuccessorList& slist = *(successors_[pos.punc_index]);
for (int s = 0; s < slist.length(); ++s) {
int sdawg_index = slist[s];
const Dawg *sdawg = dawgs_[sdawg_index];
const Dawg* sdawg = dawgs_[sdawg_index];
UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
if (dawg_edge != NO_EDGE) {
if (dawg_debug_level >=3) {
if (dawg_debug_level >= 3) {
tprintf("Letter found in dawg %d\n", sdawg_index);
}
dawg_args->updated_dawgs->add_unique(
DawgPosition(sdawg_index, dawg_edge,
pos.punc_index, punc_transition_edge, false),
DawgPosition(sdawg_index, dawg_edge, pos.punc_index,
punc_transition_edge, false),
dawg_debug_level > 0,
"Append transition from punc dawg to current dawgs: ");
if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter();
@ -476,16 +476,15 @@ int Dict::def_letter_is_okay(void* void_dawg_args,
}
}
}
EDGE_REF punc_edge = punc_dawg->edge_char_of(punc_node, unichar_id,
word_end);
EDGE_REF punc_edge =
punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
if (punc_edge != NO_EDGE) {
if (dawg_debug_level >=3) {
if (dawg_debug_level >= 3) {
tprintf("Letter found in punctuation dawg\n");
}
dawg_args->updated_dawgs->add_unique(
DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false),
dawg_debug_level > 0,
"Extend punctuation dawg: ");
dawg_debug_level > 0, "Extend punctuation dawg: ");
if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM;
if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
}
@ -496,14 +495,15 @@ int Dict::def_letter_is_okay(void* void_dawg_args,
// We can end the main word here.
// If we can continue on the punc ref, add that possibility.
NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
EDGE_REF punc_edge = punc_node == NO_EDGE ? NO_EDGE
: punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
EDGE_REF punc_edge =
punc_node == NO_EDGE
? NO_EDGE
: punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
if (punc_edge != NO_EDGE) {
dawg_args->updated_dawgs->add_unique(
DawgPosition(pos.dawg_index, pos.dawg_ref,
pos.punc_index, punc_edge, true),
dawg_debug_level > 0,
"Return to punctuation dawg: ");
DawgPosition(pos.dawg_index, pos.dawg_ref, pos.punc_index,
punc_edge, true),
dawg_debug_level > 0, "Return to punctuation dawg: ");
if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
}
@ -524,9 +524,11 @@ int Dict::def_letter_is_okay(void* void_dawg_args,
// Find the edge out of the node for the unichar_id.
NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
EDGE_REF edge = (node == NO_EDGE) ? NO_EDGE
: dawg->edge_char_of(node, char_for_dawg(unicharset, unichar_id, dawg),
word_end);
EDGE_REF edge =
(node == NO_EDGE)
? NO_EDGE
: dawg->edge_char_of(
node, char_for_dawg(unicharset, unichar_id, dawg), word_end);
if (dawg_debug_level >= 3) {
tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
@ -534,7 +536,7 @@ int Dict::def_letter_is_okay(void* void_dawg_args,
}
if (edge != NO_EDGE) { // the unichar was found in the current dawg
if (dawg_debug_level >=3) {
if (dawg_debug_level >= 3) {
tprintf("Letter found in dawg %d\n", pos.dawg_index);
}
if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
@ -569,10 +571,10 @@ int Dict::def_letter_is_okay(void* void_dawg_args,
return dawg_args->permuter;
}
void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos,
void Dict::ProcessPatternEdges(const Dawg* dawg, const DawgPosition& pos,
UNICHAR_ID unichar_id, bool word_end,
DawgArgs *dawg_args,
PermuterType *curr_perm) const {
DawgArgs* dawg_args,
PermuterType* curr_perm) const {
NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
// Try to find the edge corresponding to the exact unichar_id and to all the
// edges corresponding to the character class of unichar_id.
@ -584,9 +586,10 @@ void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos,
// On the first iteration check all the outgoing edges.
// On the second iteration check all self-loops.
for (int k = 0; k < 2; ++k) {
EDGE_REF edge = (k == 0)
? dawg->edge_char_of(node, unichar_id_patterns[i], word_end)
: dawg->pattern_loop_edge(pos.dawg_ref, unichar_id_patterns[i], word_end);
EDGE_REF edge =
(k == 0) ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end)
: dawg->pattern_loop_edge(pos.dawg_ref,
unichar_id_patterns[i], word_end);
if (edge == NO_EDGE) continue;
if (dawg_debug_level >= 3) {
tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
@ -607,7 +610,7 @@ void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos,
// Fill the given active_dawgs vector with dawgs that could contain the
// beginning of the word. If hyphenated() returns true, copy the entries
// from hyphen_active_dawgs_ instead.
void Dict::init_active_dawgs(DawgPositionVector *active_dawgs,
void Dict::init_active_dawgs(DawgPositionVector* active_dawgs,
bool ambigs_mode) const {
int i;
if (hyphenated()) {
@ -624,11 +627,11 @@ void Dict::init_active_dawgs(DawgPositionVector *active_dawgs,
}
}
void Dict::default_dawgs(DawgPositionVector *dawg_pos_vec,
void Dict::default_dawgs(DawgPositionVector* dawg_pos_vec,
bool suppress_patterns) const {
bool punc_dawg_available =
(punc_dawg_ != nullptr) &&
punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
(punc_dawg_ != nullptr) &&
punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
for (int i = 0; i < dawgs_.length(); i++) {
if (dawgs_[i] != nullptr &&
@ -651,7 +654,7 @@ void Dict::default_dawgs(DawgPositionVector *dawg_pos_vec,
}
}
void Dict::add_document_word(const WERD_CHOICE &best_choice) {
void Dict::add_document_word(const WERD_CHOICE& best_choice) {
// Do not add hyphenated word parts to the document dawg.
// hyphen_word_ will be non-nullptr after the set_hyphen_word() is
// called when the first part of the hyphenated word is
@ -662,8 +665,7 @@ void Dict::add_document_word(const WERD_CHOICE &best_choice) {
int stringlen = best_choice.length();
if (valid_word(best_choice) || stringlen < 2)
return;
if (valid_word(best_choice) || stringlen < 2) return;
// Discard words that contain >= kDocDictMaxRepChars repeating unichars.
if (best_choice.length() >= kDocDictMaxRepChars) {
@ -682,8 +684,7 @@ void Dict::add_document_word(const WERD_CHOICE &best_choice) {
if (best_choice.certainty() < doc_dict_certainty_threshold ||
stringlen == 2) {
if (best_choice.certainty() < doc_dict_pending_threshold)
return;
if (best_choice.certainty() < doc_dict_pending_threshold) return;
if (!pending_words_->word_in_dawg(best_choice)) {
if (stringlen > 2 ||
@ -699,23 +700,20 @@ void Dict::add_document_word(const WERD_CHOICE &best_choice) {
if (save_doc_words) {
STRING filename(getCCUtil()->imagefile);
filename += ".doc";
FILE *doc_word_file = fopen(filename.string(), "a");
FILE* doc_word_file = fopen(filename.string(), "a");
if (doc_word_file == nullptr) {
tprintf("Error: Could not open file %s\n", filename.string());
ASSERT_HOST(doc_word_file);
}
fprintf(doc_word_file, "%s\n",
best_choice.debug_string().string());
fprintf(doc_word_file, "%s\n", best_choice.debug_string().string());
fclose(doc_word_file);
}
document_words_->add_word_to_dawg(best_choice);
}
void Dict::adjust_word(WERD_CHOICE *word,
bool nonword,
void Dict::adjust_word(WERD_CHOICE* word, bool nonword,
XHeightConsistencyEnum xheight_consistency,
float additional_adjust,
bool modify_rating,
float additional_adjust, bool modify_rating,
bool debug) {
bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&
word->GetTopScriptID() == getUnicharset().han_sid());
@ -725,7 +723,7 @@ void Dict::adjust_word(WERD_CHOICE *word,
float adjust_factor = additional_adjust;
float new_rating = word->rating();
new_rating += kRatingPad;
const char *xheight_triggered = "";
const char* xheight_triggered = "";
if (word->length() > 1) {
// Calculate x-height and y-offset consistency penalties.
switch (xheight_consistency) {
@ -750,8 +748,7 @@ void Dict::adjust_word(WERD_CHOICE *word,
}
if (debug) {
tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "",
word->unichar_string().string(), word->rating(),
xheight_triggered);
word->unichar_string().string(), word->rating(), xheight_triggered);
}
if (nonword) { // non-dictionary word
@ -791,8 +788,8 @@ void Dict::adjust_word(WERD_CHOICE *word,
word->set_adjust_factor(adjust_factor);
}
int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {
const WERD_CHOICE *word_ptr = &word;
int Dict::valid_word(const WERD_CHOICE& word, bool numbers_ok) const {
const WERD_CHOICE* word_ptr = &word;
WERD_CHOICE temp_word(word.unicharset());
if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
copy_hyphen_info(&temp_word);
@ -802,15 +799,15 @@ int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {
if (word_ptr->length() == 0) return NO_PERM;
// Allocate vectors for holding current and updated
// active_dawgs and initialize them.
DawgPositionVector *active_dawgs = new DawgPositionVector[2];
DawgPositionVector* active_dawgs = new DawgPositionVector[2];
init_active_dawgs(&(active_dawgs[0]), false);
DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
int last_index = word_ptr->length() - 1;
// Call letter_is_okay for each letter in the word.
for (int i = hyphen_base_size(); i <= last_index; ++i) {
if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(),
word_ptr->unichar_id(i),
i == last_index))) break;
word_ptr->unichar_id(i), i == last_index)))
break;
// Swap active_dawgs, constraints with the corresponding updated vector.
if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
dawg_args.updated_dawgs = &(active_dawgs[0]);
@ -821,12 +818,13 @@ int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {
}
}
delete[] active_dawgs;
return valid_word_permuter(dawg_args.permuter, numbers_ok) ?
dawg_args.permuter : NO_PERM;
return valid_word_permuter(dawg_args.permuter, numbers_ok)
? dawg_args.permuter
: NO_PERM;
}
bool Dict::valid_bigram(const WERD_CHOICE &word1,
const WERD_CHOICE &word2) const {
bool Dict::valid_bigram(const WERD_CHOICE& word1,
const WERD_CHOICE& word2) const {
if (bigram_dawg_ == nullptr) return false;
// Extract the core word from the middle of each word with any digits
@ -862,13 +860,13 @@ bool Dict::valid_bigram(const WERD_CHOICE &word1,
}
WERD_CHOICE normalized_word(&uchset, bigram_string.size());
for (int i = 0; i < bigram_string.size(); ++i) {
normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1,
0.0f, 0.0f);
normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1, 0.0f,
0.0f);
}
return bigram_dawg_->word_in_dawg(normalized_word);
}
bool Dict::valid_punctuation(const WERD_CHOICE &word) {
bool Dict::valid_punctuation(const WERD_CHOICE& word) {
if (word.length() == 0) return NO_PERM;
int i;
WERD_CHOICE new_word(word.unicharset());
@ -882,21 +880,21 @@ bool Dict::valid_punctuation(const WERD_CHOICE &word) {
!getUnicharset().get_isdigit(unichar_id)) {
return false; // neither punc, nor alpha, nor digit
} else if ((new_len = new_word.length()) == 0 ||
new_word.unichar_id(new_len-1) != Dawg::kPatternUnicharID) {
new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) {
new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
}
}
for (i = 0; i < dawgs_.size(); ++i) {
if (dawgs_[i] != nullptr &&
dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
dawgs_[i]->word_in_dawg(new_word)) return true;
if (dawgs_[i] != nullptr && dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
dawgs_[i]->word_in_dawg(new_word))
return true;
}
return false;
}
/// Returns true if the language is space-delimited (not CJ, or T).
bool Dict::IsSpaceDelimitedLang() const {
const UNICHARSET &u_set = getUnicharset();
const UNICHARSET& u_set = getUnicharset();
if (u_set.han_sid() > 0) return false;
if (u_set.katakana_sid() > 0) return false;
if (u_set.thai_sid() > 0) return false;

View File

@ -18,7 +18,7 @@
// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
# include "config_auto.h"
#endif
#include "lstmrecognizer.h"
@ -66,7 +66,8 @@ LSTMRecognizer::~LSTMRecognizer() {
}
// Loads a model from mgr, including the dictionary only if lang is not null.
bool LSTMRecognizer::Load(const ParamsVectors* params, const char* lang, TessdataManager* mgr) {
bool LSTMRecognizer::Load(const ParamsVectors* params, const char* lang,
TessdataManager* mgr) {
TFile fp;
if (!mgr->GetComponent(TESSDATA_LSTM, &fp)) return false;
if (!DeSerialize(mgr, &fp)) return false;
@ -155,7 +156,8 @@ bool LSTMRecognizer::LoadRecoder(TFile* fp) {
// from checkpoint or restore without having to go back and reload the
// dictionary.
// Some parameters have to be passed in (from langdata/config/api via Tesseract)
bool LSTMRecognizer::LoadDictionary(const ParamsVectors* params, const char* lang, TessdataManager* mgr) {
bool LSTMRecognizer::LoadDictionary(const ParamsVectors* params,
const char* lang, TessdataManager* mgr) {
delete dict_;
dict_ = new Dict(&ccutil_);
dict_->user_words_file.ResetFrom(params);
@ -261,7 +263,8 @@ bool LSTMRecognizer::RecognizeLine(const ImageData& image_data, bool invert,
pixInvert(pix, pix);
Input::PreparePixInput(network_->InputShape(), pix, &randomizer_,
&inv_inputs);
network_->Forward(debug, inv_inputs, nullptr, &scratch_space_, &inv_outputs);
network_->Forward(debug, inv_inputs, nullptr, &scratch_space_,
&inv_outputs);
float inv_min, inv_mean, inv_sd;
OutputStats(inv_outputs, &inv_min, &inv_mean, &inv_sd);
if (inv_min > pos_min && inv_mean > pos_mean && inv_sd < pos_sd) {
@ -405,7 +408,7 @@ void LSTMRecognizer::DebugActivationRange(const NetworkIO& outputs,
// Helper returns true if the null_char is the winner at t, and it beats the
// null_threshold, or the next choice is space, in which case we will use the
// null anyway.
#if 0 // TODO: unused, remove if still unused after 2020.
#if 0 // TODO: unused, remove if still unused after 2020.
static bool NullIsBest(const NetworkIO& output, float null_thr,
int null_char, int t) {
if (output.f(t)[null_char] >= null_thr) return true;

View File

@ -56,18 +56,10 @@ class LSTMRecognizer {
LSTMRecognizer();
~LSTMRecognizer();
int NumOutputs() const {
return network_->NumOutputs();
}
int training_iteration() const {
return training_iteration_;
}
int sample_iteration() const {
return sample_iteration_;
}
double learning_rate() const {
return learning_rate_;
}
int NumOutputs() const { return network_->NumOutputs(); }
int training_iteration() const { return training_iteration_; }
int sample_iteration() const { return sample_iteration_; }
double learning_rate() const { return learning_rate_; }
LossType OutputLossType() const {
if (network_ == nullptr) return LT_NONE;
StaticShape shape;
@ -145,17 +137,14 @@ class LSTMRecognizer {
// Sets the sample iteration to the given value. The sample_iteration_
// determines the seed for the random number generator. The training
// iteration is incremented only by a successful training iteration.
void SetIteration(int iteration) {
sample_iteration_ = iteration;
}
void SetIteration(int iteration) { sample_iteration_ = iteration; }
// Accessors for textline image normalization.
int NumInputs() const {
return network_->NumInputs();
}
int NumInputs() const { return network_->NumInputs(); }
int null_char() const { return null_char_; }
// Loads a model from mgr, including the dictionary only if lang is not null.
bool Load(const ParamsVectors* params, const char* lang, TessdataManager* mgr);
bool Load(const ParamsVectors* params, const char* lang,
TessdataManager* mgr);
// Writes to the given file. Returns false in case of error.
// If mgr contains a unicharset and recoder, then they are not encoded to fp.
@ -175,7 +164,8 @@ class LSTMRecognizer {
// on the unicharset matching. This enables training to deserialize a model
// from checkpoint or restore without having to go back and reload the
// dictionary.
bool LoadDictionary(const ParamsVectors* params, const char* lang, TessdataManager* mgr);
bool LoadDictionary(const ParamsVectors* params, const char* lang,
TessdataManager* mgr);
// Recognizes the line image, contained within image_data, returning the
// recognized tesseract WERD_RES for the words.
@ -188,8 +178,8 @@ class LSTMRecognizer {
PointerVector<WERD_RES>* words, int lstm_choice_mode = 0);
// Helper computes min and mean best results in the output.
void OutputStats(const NetworkIO& outputs,
float* min_output, float* mean_output, float* sd);
void OutputStats(const NetworkIO& outputs, float* min_output,
float* mean_output, float* sd);
// Recognizes the image_data, returning the labels,
// scores, and corresponding pairs of start, end x-coords in coords.
// Returned in scale_factor is the reduction factor
@ -209,11 +199,9 @@ class LSTMRecognizer {
// Displays the forward results in a window with the characters and
// boundaries as determined by the labels and label_coords.
void DisplayForward(const NetworkIO& inputs,
const GenericVector<int>& labels,
void DisplayForward(const NetworkIO& inputs, const GenericVector<int>& labels,
const GenericVector<int>& label_coords,
const char* window_name,
ScrollView** window);
const char* window_name, ScrollView** window);
// Converts the network output to a sequence of labels. Outputs labels, scores
// and start xcoords of each char, and each null_char_, with an additional
// final xcoord for the end of the output.
@ -232,8 +220,8 @@ class LSTMRecognizer {
// Displays the labels and cuts at the corresponding xcoords.
// Size of labels should match xcoords.
void DisplayLSTMOutput(const GenericVector<int>& labels,
const GenericVector<int>& xcoords,
int height, ScrollView* window);
const GenericVector<int>& xcoords, int height,
ScrollView* window);
// Prints debug output detailing the activation path that is implied by the
// xcoords.
@ -253,8 +241,7 @@ class LSTMRecognizer {
// Converts the network output to a sequence of labels, with scores, using
// the simple character model (each position is a char, and the null_char_ is
// mainly intended for tail padding.)
void LabelsViaSimpleText(const NetworkIO& output,
GenericVector<int>* labels,
void LabelsViaSimpleText(const NetworkIO& output, GenericVector<int>* labels,
GenericVector<int>* xcoords);
// Returns a string corresponding to the label starting at start. Sets *end