Use model prefix also for submodels

Also fix a regression in the for loop which handles submodels.

Fixes: 0d91c700c0 ("Modernize code in Tesseract::init_tesseract")
Signed-off-by: Stefan Weil <sw@weilnetz.de>
2025-01-19 15:03:45 +08:00 · 2021-08-23 07:18:21 +02:00 · 2021-08-23 07:18:21 +02:00 · 391e713ae8
commit 391e713ae8
parent 7cfcfe1101
1 changed file with 16 additions and 1 deletion
--- a/src/ccmain/tessedit.cpp
+++ b/src/ccmain/tessedit.cpp
@@ -23,6 +23,8 @@
 #  include "config_auto.h"
 #endif

+#include <regex> // for std::regex_match
+
 #include "control.h"
 #include "matchdefs.h"
 #include "pageres.h"
@@ -247,6 +249,15 @@ static bool IsStrInList(const std::string &str, const std::vector<std::string> &
 void Tesseract::ParseLanguageString(const std::string &lang_str, std::vector<std::string> *to_load,
                                    std::vector<std::string> *not_to_load) {
  std::string remains(lang_str);
+  // Look whether the model file uses a prefix which must be applied to
+  // included model files as well.
+  std::regex e("(.*)/[^/]*");
+  std::cmatch cm;
+  std::string prefix;
+  if (std::regex_match(lang.c_str(), cm, e, std::regex_constants::match_default)) {
+    // A prefix was found.
+    prefix = cm[1].str() + "/";
+  }
  while (!remains.empty()) {
    // Find the start of the lang code and which vector to add to.
    const char *start = remains.c_str();
@@ -268,6 +279,7 @@ void Tesseract::ParseLanguageString(const std::string &lang_str, std::vector<std
    lang_code.resize(end);
    std::string next(start + end);
    remains = next;
+    lang_code = prefix + lang_code;
    // Check whether lang_code is already in the target vector and add.
    if (!IsStrInList(lang_code, *target)) {
      target->push_back(lang_code);
@@ -296,7 +308,10 @@ int Tesseract::init_tesseract(const std::string &arg0, const std::string &textba
  // Add any languages that this language requires
  bool loaded_primary = false;
  // Load the rest into sub_langs_.
-  for (auto &lang_to_load : langs_to_load) {
+  // A range based for loop does not work here because langs_to_load
+  // might be changed in the loop when a new submodel is found.
+  for (size_t lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
+    auto &lang_to_load = langs_to_load[lang_index];
    if (!IsStrInList(lang_to_load, langs_not_to_load)) {
      const char *lang_str = lang_to_load.c_str();
      Tesseract *tess_to_init;