From 91e2b253c071fe42a6d9b49f239666360cf25891 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Sun, 24 Mar 2019 21:10:29 +0100 Subject: [PATCH] Format modified code with clang-format Format the files which were changed in commit 297d7d86ceb3b3365876b56f08a18cba066bf23f. Signed-off-by: Stefan Weil --- src/ccmain/tessedit.cpp | 118 +++++++++--------- src/ccutil/params.h | 190 ++++++++++++++-------------- src/dict/dict.cpp | 240 ++++++++++++++++++------------------ src/lstm/lstmrecognizer.cpp | 13 +- src/lstm/lstmrecognizer.h | 47 +++---- 5 files changed, 292 insertions(+), 316 deletions(-) diff --git a/src/ccmain/tessedit.cpp b/src/ccmain/tessedit.cpp index 30ee96e7..ccef2af0 100644 --- a/src/ccmain/tessedit.cpp +++ b/src/ccmain/tessedit.cpp @@ -21,42 +21,42 @@ // Include automatically generated configuration file if running autoconf. #ifdef HAVE_CONFIG_H -#include "config_auto.h" +# include "config_auto.h" #endif #include "basedir.h" -#include "tessvars.h" #include "control.h" -#include "reject.h" #include "pageres.h" #include "pgedit.h" -#include "tprintf.h" -#include "tessedit.h" +#include "reject.h" #include "stopper.h" +#include "tessedit.h" +#include "tessvars.h" +#include "tprintf.h" #ifndef DISABLED_LEGACY_ENGINE -#include "intmatcher.h" -#include "chop.h" +# include "chop.h" +# include "intmatcher.h" #endif #ifndef ANDROID_BUILD -#include "lstmrecognizer.h" +# include "lstmrecognizer.h" #endif -#include "tesseractclass.h" #include "params.h" +#include "tesseractclass.h" #ifdef DISABLED_LEGACY_ENGINE -#include "matchdefs.h" +# include "matchdefs.h" #endif - // config under api -#define API_CONFIG "configs/api_config" +// config under api +#define API_CONFIG "configs/api_config" -ETEXT_DESC *global_monitor = nullptr; // progress monitor +ETEXT_DESC* global_monitor = nullptr; // progress monitor namespace tesseract { // Read a "config" file containing a set of variable, value pairs. // Searches the standard places: tessdata/configs, tessdata/tessconfigs // and also accepts a relative or absolute path name. -void Tesseract::read_config_file(const char *filename, +void Tesseract::read_config_file(const char* filename, SetParamConstraint constraint) { STRING path = datadir; path += "configs/"; @@ -88,11 +88,11 @@ void Tesseract::read_config_file(const char *filename, // the config files specified on the command line or left as the default // OEM_TESSERACT_ONLY if none of the configs specify this variable. bool Tesseract::init_tesseract_lang_data( - const char *arg0, const char *textbase, const char *language, - OcrEngineMode oem, char **configs, int configs_size, - const GenericVector *vars_vec, - const GenericVector *vars_values, bool set_only_non_debug_params, - TessdataManager *mgr) { + const char* arg0, const char* textbase, const char* language, + OcrEngineMode oem, char** configs, int configs_size, + const GenericVector* vars_vec, + const GenericVector* vars_values, bool set_only_non_debug_params, + TessdataManager* mgr) { // Set the basename, compute the data directory. main_setup(arg0, textbase); @@ -106,8 +106,9 @@ bool Tesseract::init_tesseract_lang_data( STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix; if (!mgr->is_loaded() && !mgr->Init(tessdata_path.string())) { tprintf("Error opening data file %s\n", tessdata_path.string()); - tprintf("Please make sure the TESSDATA_PREFIX environment variable is set" - " to your \"tessdata\" directory.\n"); + tprintf( + "Please make sure the TESSDATA_PREFIX environment variable is set" + " to your \"tessdata\" directory.\n"); return false; } #ifndef DISABLED_LEGACY_ENGINE @@ -131,8 +132,9 @@ bool Tesseract::init_tesseract_lang_data( this->params()); } - SetParamConstraint set_params_constraint = set_only_non_debug_params ? - SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE; + SetParamConstraint set_params_constraint = + set_only_non_debug_params ? SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY + : SET_PARAM_CONSTRAINT_NONE; // Load tesseract variables from config files. This is done after loading // language-specific variables from [lang].traineddata file, so that custom // config files can override values in [lang].traineddata file. @@ -153,8 +155,8 @@ bool Tesseract::init_tesseract_lang_data( } } - if (((STRING &)tessedit_write_params_to_file).length() > 0) { - FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb"); + if (((STRING&)tessedit_write_params_to_file).length() > 0) { + FILE* params_file = fopen(tessedit_write_params_to_file.string(), "wb"); if (params_file != nullptr) { ParamUtils::PrintParams(params_file, this->params()); fclose(params_file); @@ -177,16 +179,16 @@ bool Tesseract::init_tesseract_lang_data( // engine-specific data files need to be loaded. // If LSTM_ONLY is requested, the base Tesseract files are *Not* required. #ifndef ANDROID_BUILD -#ifdef DISABLED_LEGACY_ENGINE +# ifdef DISABLED_LEGACY_ENGINE if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { -#else +# else if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY || tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) { -#endif // ndef DISABLED_LEGACY_ENGINE +# endif // ndef DISABLED_LEGACY_ENGINE if (mgr->IsComponentAvailable(TESSDATA_LSTM)) { lstm_recognizer_ = new LSTMRecognizer; - ASSERT_HOST( - lstm_recognizer_->Load(this->params(), lstm_use_matrix ? language : nullptr, mgr)); + ASSERT_HOST(lstm_recognizer_->Load( + this->params(), lstm_use_matrix ? language : nullptr, mgr)); } else { tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n"); tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY); @@ -203,7 +205,7 @@ bool Tesseract::init_tesseract_lang_data( } #ifndef DISABLED_LEGACY_ENGINE else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || - !unicharset.load_from_file(&fp, false)) { + !unicharset.load_from_file(&fp, false)) { return false; } #endif // ndef DISABLED_LEGACY_ENGINE @@ -228,8 +230,8 @@ bool Tesseract::init_tesseract_lang_data( // Init ParamsModel. // Load pass1 and pass2 weights (for now these two sets are the same, but in // the future separate sets of weights can be generated). - for (int p = ParamsModel::PTRAIN_PASS1; - p < ParamsModel::PTRAIN_NUM_PASSES; ++p) { + for (int p = ParamsModel::PTRAIN_PASS1; p < ParamsModel::PTRAIN_NUM_PASSES; + ++p) { language_model_->getParamsModel().SetPass( static_cast(p)); if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) { @@ -247,8 +249,7 @@ bool Tesseract::init_tesseract_lang_data( static bool IsStrInList(const STRING& str, const GenericVector& str_list) { for (int i = 0; i < str_list.size(); ++i) { - if (str_list[i] == str) - return true; + if (str_list[i] == str) return true; } return false; } @@ -265,8 +266,7 @@ void Tesseract::ParseLanguageString(const char* lang_str, while (remains.length() > 0) { // Find the start of the lang code and which vector to add to. const char* start = remains.string(); - while (*start == '+') - ++start; + while (*start == '+') ++start; GenericVector* target = to_load; if (*start == '~') { target = not_to_load; @@ -275,8 +275,7 @@ void Tesseract::ParseLanguageString(const char* lang_str, // Find the index of the end of the lang code in string start. int end = strlen(start); const char* plus = strchr(start, '+'); - if (plus != nullptr && plus - start < end) - end = plus - start; + if (plus != nullptr && plus - start < end) end = plus - start; STRING lang_code(start); lang_code.truncate_at(end); STRING next(start + end); @@ -292,13 +291,13 @@ void Tesseract::ParseLanguageString(const char* lang_str, // string and recursively any additional languages required by any language // traineddata file (via tessedit_load_sublangs in its config) that is loaded. // See init_tesseract_internal for args. -int Tesseract::init_tesseract(const char *arg0, const char *textbase, - const char *language, OcrEngineMode oem, - char **configs, int configs_size, - const GenericVector *vars_vec, - const GenericVector *vars_values, +int Tesseract::init_tesseract(const char* arg0, const char* textbase, + const char* language, OcrEngineMode oem, + char** configs, int configs_size, + const GenericVector* vars_vec, + const GenericVector* vars_values, bool set_only_non_debug_params, - TessdataManager *mgr) { + TessdataManager* mgr) { GenericVector langs_to_load; GenericVector langs_not_to_load; ParseLanguageString(language, &langs_to_load, &langs_not_to_load); @@ -311,8 +310,8 @@ int Tesseract::init_tesseract(const char *arg0, const char *textbase, // Load the rest into sub_langs_. for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) { if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) { - const char *lang_str = langs_to_load[lang_index].string(); - Tesseract *tess_to_init; + const char* lang_str = langs_to_load[lang_index].string(); + Tesseract* tess_to_init; if (!loaded_primary) { tess_to_init = this; } else { @@ -392,13 +391,13 @@ int Tesseract::init_tesseract(const char *arg0, const char *textbase, // in vars_vec. // If set_only_init_params is true, then only the initialization variables // will be set. -int Tesseract::init_tesseract_internal(const char *arg0, const char *textbase, - const char *language, OcrEngineMode oem, - char **configs, int configs_size, - const GenericVector *vars_vec, - const GenericVector *vars_values, +int Tesseract::init_tesseract_internal(const char* arg0, const char* textbase, + const char* language, OcrEngineMode oem, + char** configs, int configs_size, + const GenericVector* vars_vec, + const GenericVector* vars_values, bool set_only_non_debug_params, - TessdataManager *mgr) { + TessdataManager* mgr) { if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs, configs_size, vars_vec, vars_values, set_only_non_debug_params, mgr)) { @@ -412,7 +411,7 @@ int Tesseract::init_tesseract_internal(const char *arg0, const char *textbase, bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY; program_editup(textbase, init_tesseract ? mgr : nullptr, init_tesseract ? mgr : nullptr); - return 0; //Normal exit + return 0; // Normal exit } #ifndef DISABLED_LEGACY_ENGINE @@ -458,8 +457,8 @@ void Tesseract::SetupUniversalFontIds() { } // init the LM component -int Tesseract::init_tesseract_lm(const char *arg0, const char *textbase, - const char *language, TessdataManager *mgr) { +int Tesseract::init_tesseract_lm(const char* arg0, const char* textbase, + const char* language, TessdataManager* mgr) { if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY, nullptr, 0, nullptr, nullptr, false, mgr)) return -1; @@ -471,14 +470,11 @@ int Tesseract::init_tesseract_lm(const char *arg0, const char *textbase, #endif // ndef DISABLED_LEGACY_ENGINE -void Tesseract::end_tesseract() { - end_recog(); -} +void Tesseract::end_tesseract() { end_recog(); } /* Define command type identifiers */ -enum CMD_EVENTS -{ +enum CMD_EVENTS { ACTION_1_CMD_EVENT, RECOG_WERDS, RECOG_PSEUDO, diff --git a/src/ccutil/params.h b/src/ccutil/params.h index 7e227857..ee5f0a97 100644 --- a/src/ccutil/params.h +++ b/src/ccutil/params.h @@ -17,13 +17,13 @@ * **********************************************************************/ -#ifndef PARAMS_H -#define PARAMS_H +#ifndef PARAMS_H +#define PARAMS_H -#include +#include -#include "genericvector.h" -#include "strngs.h" +#include "genericvector.h" +#include "strngs.h" namespace tesseract { @@ -41,10 +41,10 @@ enum SetParamConstraint { }; struct ParamsVectors { - GenericVector int_params; - GenericVector bool_params; - GenericVector string_params; - GenericVector double_params; + GenericVector int_params; + GenericVector bool_params; + GenericVector string_params; + GenericVector double_params; }; // Utility functions for working with Tesseract parameters. @@ -55,27 +55,25 @@ class ParamUtils { // ORed or ANDed with any current values. // Blank lines and lines beginning # are ignored. // Values may have any whitespace after the name and are the rest of line. - static bool ReadParamsFile( - const char *file, // filename to read - SetParamConstraint constraint, - ParamsVectors *member_params); + static bool ReadParamsFile(const char* file, // filename to read + SetParamConstraint constraint, + ParamsVectors* member_params); // Read parameters from the given file pointer. - static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, - ParamsVectors *member_params); + static bool ReadParamsFromFp(SetParamConstraint constraint, TFile* fp, + ParamsVectors* member_params); // Set a parameters to have the given value. - static bool SetParam(const char *name, const char* value, + static bool SetParam(const char* name, const char* value, SetParamConstraint constraint, - ParamsVectors *member_params); + ParamsVectors* member_params); // Returns the pointer to the parameter with the given name (of the // appropriate type) if it was found in the vector obtained from // GlobalParams() or in the given member_params. - template - static T *FindParam(const char *name, - const GenericVector &global_vec, - const GenericVector &member_vec) { + template + static T* FindParam(const char* name, const GenericVector& global_vec, + const GenericVector& member_vec) { int i; for (i = 0; i < global_vec.size(); ++i) { if (strcmp(global_vec[i]->name_str(), name) == 0) return global_vec[i]; @@ -86,8 +84,8 @@ class ParamUtils { return nullptr; } // Removes the given pointer to the param from the given vector. - template - static void RemoveParam(T *param_ptr, GenericVector *vec) { + template + static void RemoveParam(T* param_ptr, GenericVector* vec) { for (int i = 0; i < vec->size(); ++i) { if ((*vec)[i] == param_ptr) { vec->remove(i); @@ -97,12 +95,12 @@ class ParamUtils { } // Fetches the value of the named param as a STRING. Returns false if not // found. - static bool GetParamAsString(const char *name, + static bool GetParamAsString(const char* name, const ParamsVectors* member_params, - STRING *value); + STRING* value); // Print parameters to the given file. - static void PrintParams(FILE *fp, const ParamsVectors *member_params); + static void PrintParams(FILE* fp, const ParamsVectors* member_params); // Resets all parameters back to default values; static void ResetToDefaults(ParamsVectors* member_params); @@ -113,36 +111,36 @@ class Param { public: ~Param() = default; - const char *name_str() const { return name_; } - const char *info_str() const { return info_; } + const char* name_str() const { return name_; } + const char* info_str() const { return info_; } bool is_init() const { return init_; } bool is_debug() const { return debug_; } bool constraint_ok(SetParamConstraint constraint) const { - return (constraint == SET_PARAM_CONSTRAINT_NONE || - (constraint == SET_PARAM_CONSTRAINT_DEBUG_ONLY && - this->is_debug()) || - (constraint == SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY && - !this->is_debug()) || - (constraint == SET_PARAM_CONSTRAINT_NON_INIT_ONLY && - !this->is_init())); + return ( + constraint == SET_PARAM_CONSTRAINT_NONE || + (constraint == SET_PARAM_CONSTRAINT_DEBUG_ONLY && this->is_debug()) || + (constraint == SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY && + !this->is_debug()) || + (constraint == SET_PARAM_CONSTRAINT_NON_INIT_ONLY && !this->is_init())); } protected: - Param(const char *name, const char *comment, bool init) : - name_(name), info_(comment), init_(init) { + Param(const char* name, const char* comment, bool init) + : name_(name), info_(comment), init_(init) { debug_ = (strstr(name, "debug") != nullptr) || (strstr(name, "display")); } - const char *name_; // name of this parameter - const char *info_; // for menus - bool init_; // needs to be set before init + const char* name_; // name of this parameter + const char* info_; // for menus + bool init_; // needs to be set before init bool debug_; }; class IntParam : public Param { - public: - IntParam(int32_t value, const char *name, const char *comment, bool init, - ParamsVectors *vec) : Param(name, comment, init) { + public: + IntParam(int32_t value, const char* name, const char* comment, bool init, + ParamsVectors* vec) + : Param(name, comment, init) { value_ = value; default_ = value; params_vec_ = &(vec->int_params); @@ -152,29 +150,29 @@ class IntParam : public Param { operator int32_t() const { return value_; } void operator=(int32_t value) { value_ = value; } void set_value(int32_t value) { value_ = value; } - void ResetToDefault() { - value_ = default_; - } + void ResetToDefault() { value_ = default_; } void ResetFrom(const ParamsVectors* vec) { for (int i = 0; i < vec->int_params.size(); ++i) { if (strcmp(vec->int_params[i]->name_str(), name_) == 0) { - //printf("overriding param %s=%d by =%d\n", name_, value_, *vec->int_params[i]); + // printf("overriding param %s=%d by =%d\n", name_, value_, + // *vec->int_params[i]); value_ = *vec->int_params[i]; } } } - + private: int32_t value_; int32_t default_; // Pointer to the vector that contains this param (not owned by this class). - GenericVector *params_vec_; + GenericVector* params_vec_; }; class BoolParam : public Param { public: - BoolParam(bool value, const char *name, const char *comment, bool init, - ParamsVectors *vec) : Param(name, comment, init) { + BoolParam(bool value, const char* name, const char* comment, bool init, + ParamsVectors* vec) + : Param(name, comment, init) { value_ = value; default_ = value; params_vec_ = &(vec->bool_params); @@ -184,13 +182,12 @@ class BoolParam : public Param { operator BOOL8() const { return value_; } void operator=(BOOL8 value) { value_ = value; } void set_value(BOOL8 value) { value_ = value; } - void ResetToDefault() { - value_ = default_; - } + void ResetToDefault() { value_ = default_; } void ResetFrom(const ParamsVectors* vec) { for (int i = 0; i < vec->bool_params.size(); ++i) { if (strcmp(vec->bool_params[i]->name_str(), name_) == 0) { - //printf("overriding param %s=%s by =%s\n", name_, value_ ? "true" : "false", *vec->bool_params[i] ? "true" : "false"); + // printf("overriding param %s=%s by =%s\n", name_, value_ ? "true" : + // "false", *vec->bool_params[i] ? "true" : "false"); value_ = *vec->bool_params[i]; } } @@ -200,34 +197,33 @@ class BoolParam : public Param { BOOL8 value_; BOOL8 default_; // Pointer to the vector that contains this param (not owned by this class). - GenericVector *params_vec_; + GenericVector* params_vec_; }; class StringParam : public Param { public: - StringParam(const char *value, const char *name, - const char *comment, bool init, - ParamsVectors *vec) : Param(name, comment, init) { + StringParam(const char* value, const char* name, const char* comment, + bool init, ParamsVectors* vec) + : Param(name, comment, init) { value_ = value; default_ = value; params_vec_ = &(vec->string_params); vec->string_params.push_back(this); } ~StringParam() { ParamUtils::RemoveParam(this, params_vec_); } - operator STRING &() { return value_; } - const char *string() const { return value_.string(); } - const char *c_str() const { return value_.string(); } + operator STRING&() { return value_; } + const char* string() const { return value_.string(); } + const char* c_str() const { return value_.string(); } bool empty() { return value_.length() <= 0; } bool operator==(const STRING& other) { return value_ == other; } void operator=(const STRING& value) { value_ = value; } void set_value(const STRING& value) { value_ = value; } - void ResetToDefault() { - value_ = default_; - } + void ResetToDefault() { value_ = default_; } void ResetFrom(const ParamsVectors* vec) { for (int i = 0; i < vec->string_params.size(); ++i) { if (strcmp(vec->string_params[i]->name_str(), name_) == 0) { - //printf("overriding param %s=%s by =%s\n", name_, value_, vec->string_params[i]->c_str()); + // printf("overriding param %s=%s by =%s\n", name_, value_, + // vec->string_params[i]->c_str()); value_ = *vec->string_params[i]; } } @@ -237,13 +233,14 @@ class StringParam : public Param { STRING value_; STRING default_; // Pointer to the vector that contains this param (not owned by this class). - GenericVector *params_vec_; + GenericVector* params_vec_; }; class DoubleParam : public Param { public: - DoubleParam(double value, const char *name, const char *comment, - bool init, ParamsVectors *vec) : Param(name, comment, init) { + DoubleParam(double value, const char* name, const char* comment, bool init, + ParamsVectors* vec) + : Param(name, comment, init) { value_ = value; default_ = value; params_vec_ = &(vec->double_params); @@ -253,13 +250,12 @@ class DoubleParam : public Param { operator double() const { return value_; } void operator=(double value) { value_ = value; } void set_value(double value) { value_ = value; } - void ResetToDefault() { - value_ = default_; - } + void ResetToDefault() { value_ = default_; } void ResetFrom(const ParamsVectors* vec) { for (int i = 0; i < vec->double_params.size(); ++i) { if (strcmp(vec->double_params[i]->name_str(), name_) == 0) { - //printf("overriding param %s=%f by =%f\n", name_, value_, *vec->double_params[i]); + // printf("overriding param %s=%f by =%f\n", name_, value_, + // *vec->double_params[i]); value_ = *vec->double_params[i]; } } @@ -269,7 +265,7 @@ class DoubleParam : public Param { double value_; double default_; // Pointer to the vector that contains this param (not owned by this class). - GenericVector *params_vec_; + GenericVector* params_vec_; }; } // namespace tesseract @@ -283,7 +279,7 @@ class DoubleParam : public Param { // // TODO(daria): remove GlobalParams() when all global Tesseract // parameters are converted to members. -tesseract::ParamsVectors *GlobalParams(); +tesseract::ParamsVectors* GlobalParams(); /************************************************************************* * Note on defining parameters. @@ -293,52 +289,48 @@ tesseract::ParamsVectors *GlobalParams(); * (there is no such guarantee for parameters defined with the other macros). *************************************************************************/ -#define INT_VAR_H(name,val,comment)\ - tesseract::IntParam name +#define INT_VAR_H(name, val, comment) tesseract::IntParam name -#define BOOL_VAR_H(name,val,comment)\ - tesseract::BoolParam name +#define BOOL_VAR_H(name, val, comment) tesseract::BoolParam name -#define STRING_VAR_H(name,val,comment)\ - tesseract::StringParam name +#define STRING_VAR_H(name, val, comment) tesseract::StringParam name -#define double_VAR_H(name,val,comment)\ - tesseract::DoubleParam name +#define double_VAR_H(name, val, comment) tesseract::DoubleParam name -#define INT_VAR(name,val,comment)\ - tesseract::IntParam name(val,#name,comment,false,GlobalParams()) +#define INT_VAR(name, val, comment) \ + tesseract::IntParam name(val, #name, comment, false, GlobalParams()) -#define BOOL_VAR(name,val,comment)\ - tesseract::BoolParam name(val,#name,comment,false,GlobalParams()) +#define BOOL_VAR(name, val, comment) \ + tesseract::BoolParam name(val, #name, comment, false, GlobalParams()) -#define STRING_VAR(name,val,comment)\ - tesseract::StringParam name(val,#name,comment,false,GlobalParams()) +#define STRING_VAR(name, val, comment) \ + tesseract::StringParam name(val, #name, comment, false, GlobalParams()) -#define double_VAR(name,val,comment)\ - tesseract::DoubleParam name(val,#name,comment,false,GlobalParams()) +#define double_VAR(name, val, comment) \ + tesseract::DoubleParam name(val, #name, comment, false, GlobalParams()) -#define INT_MEMBER(name, val, comment, vec)\ +#define INT_MEMBER(name, val, comment, vec) \ name(val, #name, comment, false, vec) -#define BOOL_MEMBER(name, val, comment, vec)\ +#define BOOL_MEMBER(name, val, comment, vec) \ name(val, #name, comment, false, vec) -#define STRING_MEMBER(name, val, comment, vec)\ +#define STRING_MEMBER(name, val, comment, vec) \ name(val, #name, comment, false, vec) -#define double_MEMBER(name, val, comment, vec)\ +#define double_MEMBER(name, val, comment, vec) \ name(val, #name, comment, false, vec) -#define INT_INIT_MEMBER(name, val, comment, vec)\ +#define INT_INIT_MEMBER(name, val, comment, vec) \ name(val, #name, comment, true, vec) -#define BOOL_INIT_MEMBER(name, val, comment, vec)\ +#define BOOL_INIT_MEMBER(name, val, comment, vec) \ name(val, #name, comment, true, vec) -#define STRING_INIT_MEMBER(name, val, comment, vec)\ +#define STRING_INIT_MEMBER(name, val, comment, vec) \ name(val, #name, comment, true, vec) -#define double_INIT_MEMBER(name, val, comment, vec)\ +#define double_INIT_MEMBER(name, val, comment, vec) \ name(val, #name, comment, true, vec) #endif diff --git a/src/dict/dict.cpp b/src/dict/dict.cpp index f2f1903a..dded79d9 100644 --- a/src/dict/dict.cpp +++ b/src/dict/dict.cpp @@ -27,7 +27,7 @@ namespace tesseract { class Image; -Dict::Dict(CCUtil *ccutil) +Dict::Dict(CCUtil* ccutil) : letter_is_okay_(&tesseract::Dict::def_letter_is_okay), probability_in_context_(&tesseract::Dict::def_probability_in_context), params_model_classify_(nullptr), @@ -190,7 +190,7 @@ Dict::~Dict() { if (output_ambig_words_file_ != nullptr) fclose(output_ambig_words_file_); } -DawgCache *Dict::GlobalDawgCache() { +DawgCache* Dict::GlobalDawgCache() { // This global cache (a singleton) will outlive every Tesseract instance // (even those that someone else might declare as global statics). static DawgCache cache; @@ -198,7 +198,7 @@ DawgCache *Dict::GlobalDawgCache() { } // Sets up ready for a Load or LoadLSTM. -void Dict::SetupForLoad(DawgCache *dawg_cache) { +void Dict::SetupForLoad(DawgCache* dawg_cache) { if (dawgs_.length() != 0) this->End(); apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol); @@ -216,7 +216,7 @@ void Dict::SetupForLoad(DawgCache *dawg_cache) { } // Loads the dawgs needed by Tesseract. Call FinishLoad() after. -void Dict::Load(const STRING &lang, TessdataManager *data_file) { +void Dict::Load(const STRING& lang, TessdataManager* data_file) { // Load dawgs_. if (load_punc_dawg) { punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG, @@ -224,12 +224,12 @@ void Dict::Load(const STRING &lang, TessdataManager *data_file) { if (punc_dawg_) dawgs_ += punc_dawg_; } if (load_system_dawg) { - Dawg *system_dawg = dawg_cache_->GetSquishedDawg( + Dawg* system_dawg = dawg_cache_->GetSquishedDawg( lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file); if (system_dawg) dawgs_ += system_dawg; } if (load_number_dawg) { - Dawg *number_dawg = dawg_cache_->GetSquishedDawg( + Dawg* number_dawg = dawg_cache_->GetSquishedDawg( lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file); if (number_dawg) dawgs_ += number_dawg; } @@ -251,15 +251,15 @@ void Dict::Load(const STRING &lang, TessdataManager *data_file) { } STRING name; - if (((STRING &)user_words_suffix).length() > 0 || - ((STRING &)user_words_file).length() > 0) { - Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, + if (((STRING&)user_words_suffix).length() > 0 || + ((STRING&)user_words_file).length() > 0) { + Trie* trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, getUnicharset().size(), dawg_debug_level); - if (((STRING &)user_words_file).length() > 0) { - name = user_words_file; + if (((STRING&)user_words_file).length() > 0) { + name = user_words_file; } else { - name = getCCUtil()->language_data_path_prefix; - name += user_words_suffix; + name = getCCUtil()->language_data_path_prefix; + name += user_words_suffix; } if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(), Trie::RRP_REVERSE_IF_HAS_RTL)) { @@ -270,16 +270,16 @@ void Dict::Load(const STRING &lang, TessdataManager *data_file) { } } - if (((STRING &)user_patterns_suffix).length() > 0 || - ((STRING &)user_patterns_file).length() > 0) { - Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, + if (((STRING&)user_patterns_suffix).length() > 0 || + ((STRING&)user_patterns_file).length() > 0) { + Trie* trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, getUnicharset().size(), dawg_debug_level); trie_ptr->initialize_patterns(&(getUnicharset())); - if (((STRING &)user_patterns_file).length() > 0) { - name = user_patterns_file; + if (((STRING&)user_patterns_file).length() > 0) { + name = user_patterns_file; } else { - name = getCCUtil()->language_data_path_prefix; - name += user_patterns_suffix; + name = getCCUtil()->language_data_path_prefix; + name += user_patterns_suffix; } if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) { tprintf("Error: failed to load %s\n", name.string()); @@ -299,7 +299,7 @@ void Dict::Load(const STRING &lang, TessdataManager *data_file) { } // Loads the dawgs needed by the LSTM model. Call FinishLoad() after. -void Dict::LoadLSTM(const STRING &lang, TessdataManager *data_file) { +void Dict::LoadLSTM(const STRING& lang, TessdataManager* data_file) { // Load dawgs_. if (load_punc_dawg) { punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG, @@ -307,27 +307,28 @@ void Dict::LoadLSTM(const STRING &lang, TessdataManager *data_file) { if (punc_dawg_) dawgs_ += punc_dawg_; } if (load_system_dawg) { - Dawg *system_dawg = dawg_cache_->GetSquishedDawg( + Dawg* system_dawg = dawg_cache_->GetSquishedDawg( lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file); if (system_dawg) dawgs_ += system_dawg; } if (load_number_dawg) { - Dawg *number_dawg = dawg_cache_->GetSquishedDawg( + Dawg* number_dawg = dawg_cache_->GetSquishedDawg( lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file); if (number_dawg) dawgs_ += number_dawg; } - // stolen from Dict::Load (but needs params_ from Tesseract langdata/config/api): + // stolen from Dict::Load (but needs params_ from Tesseract + // langdata/config/api): STRING name; - if (((STRING &)user_words_suffix).length() > 0 || - ((STRING &)user_words_file).length() > 0) { - Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, + if (((STRING&)user_words_suffix).length() > 0 || + ((STRING&)user_words_file).length() > 0) { + Trie* trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, getUnicharset().size(), dawg_debug_level); - if (((STRING &)user_words_file).length() > 0) { - name = user_words_file; + if (((STRING&)user_words_file).length() > 0) { + name = user_words_file; } else { - name = getCCUtil()->language_data_path_prefix; - name += user_words_suffix; + name = getCCUtil()->language_data_path_prefix; + name += user_words_suffix; } if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(), Trie::RRP_REVERSE_IF_HAS_RTL)) { @@ -338,16 +339,16 @@ void Dict::LoadLSTM(const STRING &lang, TessdataManager *data_file) { } } - if (((STRING &)user_patterns_suffix).length() > 0 || - ((STRING &)user_patterns_file).length() > 0) { - Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, + if (((STRING&)user_patterns_suffix).length() > 0 || + ((STRING&)user_patterns_file).length() > 0) { + Trie* trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, getUnicharset().size(), dawg_debug_level); trie_ptr->initialize_patterns(&(getUnicharset())); - if (((STRING &)user_patterns_file).length() > 0) { - name = user_patterns_file; + if (((STRING&)user_patterns_file).length() > 0) { + name = user_patterns_file; } else { - name = getCCUtil()->language_data_path_prefix; - name += user_patterns_suffix; + name = getCCUtil()->language_data_path_prefix; + name += user_patterns_suffix; } if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) { tprintf("Error: failed to load %s\n", name.string()); @@ -356,7 +357,6 @@ void Dict::LoadLSTM(const STRING &lang, TessdataManager *data_file) { dawgs_ += trie_ptr; } } - } // Completes the loading process after Load() and/or LoadLSTM(). @@ -368,13 +368,14 @@ bool Dict::FinishLoad() { // indices into the dawgs_ vector of the successors for dawg i. successors_.reserve(dawgs_.length()); for (int i = 0; i < dawgs_.length(); ++i) { - const Dawg *dawg = dawgs_[i]; - SuccessorList *lst = new SuccessorList(); + const Dawg* dawg = dawgs_[i]; + SuccessorList* lst = new SuccessorList(); for (int j = 0; j < dawgs_.length(); ++j) { - const Dawg *other = dawgs_[j]; + const Dawg* other = dawgs_[j]; if (dawg != nullptr && other != nullptr && (dawg->lang() == other->lang()) && - kDawgSuccessors[dawg->type()][other->type()]) *lst += j; + kDawgSuccessors[dawg->type()][other->type()]) + *lst += j; } successors_ += lst; } @@ -382,8 +383,7 @@ bool Dict::FinishLoad() { } void Dict::End() { - if (dawgs_.length() == 0) - return; // Not safe to call twice. + if (dawgs_.length() == 0) return; // Not safe to call twice. for (int i = 0; i < dawgs_.size(); i++) { if (!dawg_cache_->FreeDawg(dawgs_[i])) { delete dawgs_[i]; @@ -405,19 +405,18 @@ void Dict::End() { // Returns true if in light of the current state unichar_id is allowed // according to at least one of the dawgs in the dawgs_ vector. // See more extensive comments in dict.h where this function is declared. -int Dict::def_letter_is_okay(void* void_dawg_args, - const UNICHARSET& unicharset, - UNICHAR_ID unichar_id, - bool word_end) const { - DawgArgs *dawg_args = static_cast(void_dawg_args); +int Dict::def_letter_is_okay(void* void_dawg_args, const UNICHARSET& unicharset, + UNICHAR_ID unichar_id, bool word_end) const { + DawgArgs* dawg_args = static_cast(void_dawg_args); ASSERT_HOST(unicharset.contains_unichar_id(unichar_id)); if (dawg_debug_level >= 3) { - tprintf("def_letter_is_okay: current unichar=%s word_end=%d" - " num active dawgs=%d\n", - getUnicharset().debug_str(unichar_id).string(), word_end, - dawg_args->active_dawgs->length()); + tprintf( + "def_letter_is_okay: current unichar=%s word_end=%d" + " num active dawgs=%d\n", + getUnicharset().debug_str(unichar_id).string(), word_end, + dawg_args->active_dawgs->length()); } // Do not accept words that contain kPatternUnicharID. @@ -438,9 +437,10 @@ int Dict::def_letter_is_okay(void* void_dawg_args, // with the updated ref (an edge with the corresponding unichar id) into // dawg_args->updated_pos. for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) { - const DawgPosition &pos = (*dawg_args->active_dawgs)[a]; - const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr; - const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr; + const DawgPosition& pos = (*dawg_args->active_dawgs)[a]; + const Dawg* punc_dawg = + pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr; + const Dawg* dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr; if (!dawg && !punc_dawg) { // shouldn't happen. @@ -450,23 +450,23 @@ int Dict::def_letter_is_okay(void* void_dawg_args, if (!dawg) { // We're in the punctuation dawg. A core dawg has not been chosen. NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref); - EDGE_REF punc_transition_edge = punc_dawg->edge_char_of( - punc_node, Dawg::kPatternUnicharID, word_end); + EDGE_REF punc_transition_edge = + punc_dawg->edge_char_of(punc_node, Dawg::kPatternUnicharID, word_end); if (punc_transition_edge != NO_EDGE) { // Find all successors, and see which can transition. - const SuccessorList &slist = *(successors_[pos.punc_index]); + const SuccessorList& slist = *(successors_[pos.punc_index]); for (int s = 0; s < slist.length(); ++s) { int sdawg_index = slist[s]; - const Dawg *sdawg = dawgs_[sdawg_index]; + const Dawg* sdawg = dawgs_[sdawg_index]; UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg); EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end); if (dawg_edge != NO_EDGE) { - if (dawg_debug_level >=3) { + if (dawg_debug_level >= 3) { tprintf("Letter found in dawg %d\n", sdawg_index); } dawg_args->updated_dawgs->add_unique( - DawgPosition(sdawg_index, dawg_edge, - pos.punc_index, punc_transition_edge, false), + DawgPosition(sdawg_index, dawg_edge, pos.punc_index, + punc_transition_edge, false), dawg_debug_level > 0, "Append transition from punc dawg to current dawgs: "); if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter(); @@ -476,16 +476,15 @@ int Dict::def_letter_is_okay(void* void_dawg_args, } } } - EDGE_REF punc_edge = punc_dawg->edge_char_of(punc_node, unichar_id, - word_end); + EDGE_REF punc_edge = + punc_dawg->edge_char_of(punc_node, unichar_id, word_end); if (punc_edge != NO_EDGE) { - if (dawg_debug_level >=3) { + if (dawg_debug_level >= 3) { tprintf("Letter found in punctuation dawg\n"); } dawg_args->updated_dawgs->add_unique( DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false), - dawg_debug_level > 0, - "Extend punctuation dawg: "); + dawg_debug_level > 0, "Extend punctuation dawg: "); if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM; if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true; } @@ -496,14 +495,15 @@ int Dict::def_letter_is_okay(void* void_dawg_args, // We can end the main word here. // If we can continue on the punc ref, add that possibility. NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref); - EDGE_REF punc_edge = punc_node == NO_EDGE ? NO_EDGE - : punc_dawg->edge_char_of(punc_node, unichar_id, word_end); + EDGE_REF punc_edge = + punc_node == NO_EDGE + ? NO_EDGE + : punc_dawg->edge_char_of(punc_node, unichar_id, word_end); if (punc_edge != NO_EDGE) { dawg_args->updated_dawgs->add_unique( - DawgPosition(pos.dawg_index, pos.dawg_ref, - pos.punc_index, punc_edge, true), - dawg_debug_level > 0, - "Return to punctuation dawg: "); + DawgPosition(pos.dawg_index, pos.dawg_ref, pos.punc_index, + punc_edge, true), + dawg_debug_level > 0, "Return to punctuation dawg: "); if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter(); if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true; } @@ -524,9 +524,11 @@ int Dict::def_letter_is_okay(void* void_dawg_args, // Find the edge out of the node for the unichar_id. NODE_REF node = GetStartingNode(dawg, pos.dawg_ref); - EDGE_REF edge = (node == NO_EDGE) ? NO_EDGE - : dawg->edge_char_of(node, char_for_dawg(unicharset, unichar_id, dawg), - word_end); + EDGE_REF edge = + (node == NO_EDGE) + ? NO_EDGE + : dawg->edge_char_of( + node, char_for_dawg(unicharset, unichar_id, dawg), word_end); if (dawg_debug_level >= 3) { tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", @@ -534,7 +536,7 @@ int Dict::def_letter_is_okay(void* void_dawg_args, } if (edge != NO_EDGE) { // the unichar was found in the current dawg - if (dawg_debug_level >=3) { + if (dawg_debug_level >= 3) { tprintf("Letter found in dawg %d\n", pos.dawg_index); } if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) { @@ -569,10 +571,10 @@ int Dict::def_letter_is_okay(void* void_dawg_args, return dawg_args->permuter; } -void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos, +void Dict::ProcessPatternEdges(const Dawg* dawg, const DawgPosition& pos, UNICHAR_ID unichar_id, bool word_end, - DawgArgs *dawg_args, - PermuterType *curr_perm) const { + DawgArgs* dawg_args, + PermuterType* curr_perm) const { NODE_REF node = GetStartingNode(dawg, pos.dawg_ref); // Try to find the edge corresponding to the exact unichar_id and to all the // edges corresponding to the character class of unichar_id. @@ -584,9 +586,10 @@ void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos, // On the first iteration check all the outgoing edges. // On the second iteration check all self-loops. for (int k = 0; k < 2; ++k) { - EDGE_REF edge = (k == 0) - ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end) - : dawg->pattern_loop_edge(pos.dawg_ref, unichar_id_patterns[i], word_end); + EDGE_REF edge = + (k == 0) ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end) + : dawg->pattern_loop_edge(pos.dawg_ref, + unichar_id_patterns[i], word_end); if (edge == NO_EDGE) continue; if (dawg_debug_level >= 3) { tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", @@ -607,7 +610,7 @@ void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos, // Fill the given active_dawgs vector with dawgs that could contain the // beginning of the word. If hyphenated() returns true, copy the entries // from hyphen_active_dawgs_ instead. -void Dict::init_active_dawgs(DawgPositionVector *active_dawgs, +void Dict::init_active_dawgs(DawgPositionVector* active_dawgs, bool ambigs_mode) const { int i; if (hyphenated()) { @@ -624,11 +627,11 @@ void Dict::init_active_dawgs(DawgPositionVector *active_dawgs, } } -void Dict::default_dawgs(DawgPositionVector *dawg_pos_vec, +void Dict::default_dawgs(DawgPositionVector* dawg_pos_vec, bool suppress_patterns) const { bool punc_dawg_available = - (punc_dawg_ != nullptr) && - punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE; + (punc_dawg_ != nullptr) && + punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE; for (int i = 0; i < dawgs_.length(); i++) { if (dawgs_[i] != nullptr && @@ -651,7 +654,7 @@ void Dict::default_dawgs(DawgPositionVector *dawg_pos_vec, } } -void Dict::add_document_word(const WERD_CHOICE &best_choice) { +void Dict::add_document_word(const WERD_CHOICE& best_choice) { // Do not add hyphenated word parts to the document dawg. // hyphen_word_ will be non-nullptr after the set_hyphen_word() is // called when the first part of the hyphenated word is @@ -662,8 +665,7 @@ void Dict::add_document_word(const WERD_CHOICE &best_choice) { int stringlen = best_choice.length(); - if (valid_word(best_choice) || stringlen < 2) - return; + if (valid_word(best_choice) || stringlen < 2) return; // Discard words that contain >= kDocDictMaxRepChars repeating unichars. if (best_choice.length() >= kDocDictMaxRepChars) { @@ -682,8 +684,7 @@ void Dict::add_document_word(const WERD_CHOICE &best_choice) { if (best_choice.certainty() < doc_dict_certainty_threshold || stringlen == 2) { - if (best_choice.certainty() < doc_dict_pending_threshold) - return; + if (best_choice.certainty() < doc_dict_pending_threshold) return; if (!pending_words_->word_in_dawg(best_choice)) { if (stringlen > 2 || @@ -699,23 +700,20 @@ void Dict::add_document_word(const WERD_CHOICE &best_choice) { if (save_doc_words) { STRING filename(getCCUtil()->imagefile); filename += ".doc"; - FILE *doc_word_file = fopen(filename.string(), "a"); + FILE* doc_word_file = fopen(filename.string(), "a"); if (doc_word_file == nullptr) { tprintf("Error: Could not open file %s\n", filename.string()); ASSERT_HOST(doc_word_file); } - fprintf(doc_word_file, "%s\n", - best_choice.debug_string().string()); + fprintf(doc_word_file, "%s\n", best_choice.debug_string().string()); fclose(doc_word_file); } document_words_->add_word_to_dawg(best_choice); } -void Dict::adjust_word(WERD_CHOICE *word, - bool nonword, +void Dict::adjust_word(WERD_CHOICE* word, bool nonword, XHeightConsistencyEnum xheight_consistency, - float additional_adjust, - bool modify_rating, + float additional_adjust, bool modify_rating, bool debug) { bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() && word->GetTopScriptID() == getUnicharset().han_sid()); @@ -725,7 +723,7 @@ void Dict::adjust_word(WERD_CHOICE *word, float adjust_factor = additional_adjust; float new_rating = word->rating(); new_rating += kRatingPad; - const char *xheight_triggered = ""; + const char* xheight_triggered = ""; if (word->length() > 1) { // Calculate x-height and y-offset consistency penalties. switch (xheight_consistency) { @@ -750,8 +748,7 @@ void Dict::adjust_word(WERD_CHOICE *word, } if (debug) { tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "", - word->unichar_string().string(), word->rating(), - xheight_triggered); + word->unichar_string().string(), word->rating(), xheight_triggered); } if (nonword) { // non-dictionary word @@ -791,8 +788,8 @@ void Dict::adjust_word(WERD_CHOICE *word, word->set_adjust_factor(adjust_factor); } -int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const { - const WERD_CHOICE *word_ptr = &word; +int Dict::valid_word(const WERD_CHOICE& word, bool numbers_ok) const { + const WERD_CHOICE* word_ptr = &word; WERD_CHOICE temp_word(word.unicharset()); if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) { copy_hyphen_info(&temp_word); @@ -802,15 +799,15 @@ int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const { if (word_ptr->length() == 0) return NO_PERM; // Allocate vectors for holding current and updated // active_dawgs and initialize them. - DawgPositionVector *active_dawgs = new DawgPositionVector[2]; + DawgPositionVector* active_dawgs = new DawgPositionVector[2]; init_active_dawgs(&(active_dawgs[0]), false); DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM); int last_index = word_ptr->length() - 1; // Call letter_is_okay for each letter in the word. for (int i = hyphen_base_size(); i <= last_index; ++i) { if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(), - word_ptr->unichar_id(i), - i == last_index))) break; + word_ptr->unichar_id(i), i == last_index))) + break; // Swap active_dawgs, constraints with the corresponding updated vector. if (dawg_args.updated_dawgs == &(active_dawgs[1])) { dawg_args.updated_dawgs = &(active_dawgs[0]); @@ -821,12 +818,13 @@ int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const { } } delete[] active_dawgs; - return valid_word_permuter(dawg_args.permuter, numbers_ok) ? - dawg_args.permuter : NO_PERM; + return valid_word_permuter(dawg_args.permuter, numbers_ok) + ? dawg_args.permuter + : NO_PERM; } -bool Dict::valid_bigram(const WERD_CHOICE &word1, - const WERD_CHOICE &word2) const { +bool Dict::valid_bigram(const WERD_CHOICE& word1, + const WERD_CHOICE& word2) const { if (bigram_dawg_ == nullptr) return false; // Extract the core word from the middle of each word with any digits @@ -862,13 +860,13 @@ bool Dict::valid_bigram(const WERD_CHOICE &word1, } WERD_CHOICE normalized_word(&uchset, bigram_string.size()); for (int i = 0; i < bigram_string.size(); ++i) { - normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1, - 0.0f, 0.0f); + normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1, 0.0f, + 0.0f); } return bigram_dawg_->word_in_dawg(normalized_word); } -bool Dict::valid_punctuation(const WERD_CHOICE &word) { +bool Dict::valid_punctuation(const WERD_CHOICE& word) { if (word.length() == 0) return NO_PERM; int i; WERD_CHOICE new_word(word.unicharset()); @@ -882,21 +880,21 @@ bool Dict::valid_punctuation(const WERD_CHOICE &word) { !getUnicharset().get_isdigit(unichar_id)) { return false; // neither punc, nor alpha, nor digit } else if ((new_len = new_word.length()) == 0 || - new_word.unichar_id(new_len-1) != Dawg::kPatternUnicharID) { + new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) { new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0); } } for (i = 0; i < dawgs_.size(); ++i) { - if (dawgs_[i] != nullptr && - dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION && - dawgs_[i]->word_in_dawg(new_word)) return true; + if (dawgs_[i] != nullptr && dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION && + dawgs_[i]->word_in_dawg(new_word)) + return true; } return false; } /// Returns true if the language is space-delimited (not CJ, or T). bool Dict::IsSpaceDelimitedLang() const { - const UNICHARSET &u_set = getUnicharset(); + const UNICHARSET& u_set = getUnicharset(); if (u_set.han_sid() > 0) return false; if (u_set.katakana_sid() > 0) return false; if (u_set.thai_sid() > 0) return false; diff --git a/src/lstm/lstmrecognizer.cpp b/src/lstm/lstmrecognizer.cpp index bf6ee98f..f3967c7d 100644 --- a/src/lstm/lstmrecognizer.cpp +++ b/src/lstm/lstmrecognizer.cpp @@ -18,7 +18,7 @@ // Include automatically generated configuration file if running autoconf. #ifdef HAVE_CONFIG_H -#include "config_auto.h" +# include "config_auto.h" #endif #include "lstmrecognizer.h" @@ -66,7 +66,8 @@ LSTMRecognizer::~LSTMRecognizer() { } // Loads a model from mgr, including the dictionary only if lang is not null. -bool LSTMRecognizer::Load(const ParamsVectors* params, const char* lang, TessdataManager* mgr) { +bool LSTMRecognizer::Load(const ParamsVectors* params, const char* lang, + TessdataManager* mgr) { TFile fp; if (!mgr->GetComponent(TESSDATA_LSTM, &fp)) return false; if (!DeSerialize(mgr, &fp)) return false; @@ -155,7 +156,8 @@ bool LSTMRecognizer::LoadRecoder(TFile* fp) { // from checkpoint or restore without having to go back and reload the // dictionary. // Some parameters have to be passed in (from langdata/config/api via Tesseract) -bool LSTMRecognizer::LoadDictionary(const ParamsVectors* params, const char* lang, TessdataManager* mgr) { +bool LSTMRecognizer::LoadDictionary(const ParamsVectors* params, + const char* lang, TessdataManager* mgr) { delete dict_; dict_ = new Dict(&ccutil_); dict_->user_words_file.ResetFrom(params); @@ -261,7 +263,8 @@ bool LSTMRecognizer::RecognizeLine(const ImageData& image_data, bool invert, pixInvert(pix, pix); Input::PreparePixInput(network_->InputShape(), pix, &randomizer_, &inv_inputs); - network_->Forward(debug, inv_inputs, nullptr, &scratch_space_, &inv_outputs); + network_->Forward(debug, inv_inputs, nullptr, &scratch_space_, + &inv_outputs); float inv_min, inv_mean, inv_sd; OutputStats(inv_outputs, &inv_min, &inv_mean, &inv_sd); if (inv_min > pos_min && inv_mean > pos_mean && inv_sd < pos_sd) { @@ -405,7 +408,7 @@ void LSTMRecognizer::DebugActivationRange(const NetworkIO& outputs, // Helper returns true if the null_char is the winner at t, and it beats the // null_threshold, or the next choice is space, in which case we will use the // null anyway. -#if 0 // TODO: unused, remove if still unused after 2020. +#if 0 // TODO: unused, remove if still unused after 2020. static bool NullIsBest(const NetworkIO& output, float null_thr, int null_char, int t) { if (output.f(t)[null_char] >= null_thr) return true; diff --git a/src/lstm/lstmrecognizer.h b/src/lstm/lstmrecognizer.h index 5c0ec53a..45425ac8 100644 --- a/src/lstm/lstmrecognizer.h +++ b/src/lstm/lstmrecognizer.h @@ -56,18 +56,10 @@ class LSTMRecognizer { LSTMRecognizer(); ~LSTMRecognizer(); - int NumOutputs() const { - return network_->NumOutputs(); - } - int training_iteration() const { - return training_iteration_; - } - int sample_iteration() const { - return sample_iteration_; - } - double learning_rate() const { - return learning_rate_; - } + int NumOutputs() const { return network_->NumOutputs(); } + int training_iteration() const { return training_iteration_; } + int sample_iteration() const { return sample_iteration_; } + double learning_rate() const { return learning_rate_; } LossType OutputLossType() const { if (network_ == nullptr) return LT_NONE; StaticShape shape; @@ -145,17 +137,14 @@ class LSTMRecognizer { // Sets the sample iteration to the given value. The sample_iteration_ // determines the seed for the random number generator. The training // iteration is incremented only by a successful training iteration. - void SetIteration(int iteration) { - sample_iteration_ = iteration; - } + void SetIteration(int iteration) { sample_iteration_ = iteration; } // Accessors for textline image normalization. - int NumInputs() const { - return network_->NumInputs(); - } + int NumInputs() const { return network_->NumInputs(); } int null_char() const { return null_char_; } // Loads a model from mgr, including the dictionary only if lang is not null. - bool Load(const ParamsVectors* params, const char* lang, TessdataManager* mgr); + bool Load(const ParamsVectors* params, const char* lang, + TessdataManager* mgr); // Writes to the given file. Returns false in case of error. // If mgr contains a unicharset and recoder, then they are not encoded to fp. @@ -175,7 +164,8 @@ class LSTMRecognizer { // on the unicharset matching. This enables training to deserialize a model // from checkpoint or restore without having to go back and reload the // dictionary. - bool LoadDictionary(const ParamsVectors* params, const char* lang, TessdataManager* mgr); + bool LoadDictionary(const ParamsVectors* params, const char* lang, + TessdataManager* mgr); // Recognizes the line image, contained within image_data, returning the // recognized tesseract WERD_RES for the words. @@ -188,8 +178,8 @@ class LSTMRecognizer { PointerVector* words, int lstm_choice_mode = 0); // Helper computes min and mean best results in the output. - void OutputStats(const NetworkIO& outputs, - float* min_output, float* mean_output, float* sd); + void OutputStats(const NetworkIO& outputs, float* min_output, + float* mean_output, float* sd); // Recognizes the image_data, returning the labels, // scores, and corresponding pairs of start, end x-coords in coords. // Returned in scale_factor is the reduction factor @@ -209,11 +199,9 @@ class LSTMRecognizer { // Displays the forward results in a window with the characters and // boundaries as determined by the labels and label_coords. - void DisplayForward(const NetworkIO& inputs, - const GenericVector& labels, + void DisplayForward(const NetworkIO& inputs, const GenericVector& labels, const GenericVector& label_coords, - const char* window_name, - ScrollView** window); + const char* window_name, ScrollView** window); // Converts the network output to a sequence of labels. Outputs labels, scores // and start xcoords of each char, and each null_char_, with an additional // final xcoord for the end of the output. @@ -232,8 +220,8 @@ class LSTMRecognizer { // Displays the labels and cuts at the corresponding xcoords. // Size of labels should match xcoords. void DisplayLSTMOutput(const GenericVector& labels, - const GenericVector& xcoords, - int height, ScrollView* window); + const GenericVector& xcoords, int height, + ScrollView* window); // Prints debug output detailing the activation path that is implied by the // xcoords. @@ -253,8 +241,7 @@ class LSTMRecognizer { // Converts the network output to a sequence of labels, with scores, using // the simple character model (each position is a char, and the null_char_ is // mainly intended for tail padding.) - void LabelsViaSimpleText(const NetworkIO& output, - GenericVector* labels, + void LabelsViaSimpleText(const NetworkIO& output, GenericVector* labels, GenericVector* xcoords); // Returns a string corresponding to the label starting at start. Sets *end