fix langdata (user words/patterns) file suffixes for LSTMs:

- add another constructor for LSTMRecognizer that takes the language_data_path_prefix configured/selected at runtime and passes it to the internal CCUtil; use it in Tesseract::init_tesseract_lang_data when LSTMs are available (this was missing from 297d7d86ce)
2025-01-21 08:43:03 +08:00 · 2019-09-19 19:30:54 +02:00 · 2019-09-19 19:30:54 +02:00 · 5b976bfb55
commit 5b976bfb55
parent 3b030b4aeb
3 changed files with 7 additions and 1 deletion
--- a/src/ccmain/tessedit.cpp
+++ b/src/ccmain/tessedit.cpp
@@ -175,7 +175,7 @@ bool Tesseract::init_tesseract_lang_data(
      tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
 #  endif  // ndef DISABLED_LEGACY_ENGINE
    if (mgr->IsComponentAvailable(TESSDATA_LSTM)) {
-      lstm_recognizer_ = new LSTMRecognizer;
+      lstm_recognizer_ = new LSTMRecognizer(language_data_path_prefix);
      ASSERT_HOST(lstm_recognizer_->Load(
          this->params(), lstm_use_matrix ? language : nullptr, mgr));
    } else {
--- a/src/lstm/lstmrecognizer.cpp
+++ b/src/lstm/lstmrecognizer.cpp
@@ -49,6 +49,11 @@ const double kDictRatio = 2.25;
 // Default certainty offset to give the dictionary a chance.
 const double kCertOffset = -0.085;

+LSTMRecognizer::LSTMRecognizer(const STRING language_data_path_prefix)
+    : LSTMRecognizer::LSTMRecognizer() {
+  ccutil_.language_data_path_prefix = language_data_path_prefix;
+}
+
 LSTMRecognizer::LSTMRecognizer()
    : network_(nullptr),
      training_flags_(0),
--- a/src/lstm/lstmrecognizer.h
+++ b/src/lstm/lstmrecognizer.h
@@ -53,6 +53,7 @@ enum TrainingFlags {
 class LSTMRecognizer {
 public:
  LSTMRecognizer();
+  LSTMRecognizer(const STRING language_data_path_prefix);
  ~LSTMRecognizer();

  int NumOutputs() const { return network_->NumOutputs(); }