Don't create OCR result files when training data is created

The configuration file lstm.train makes Tesseract generate training data for an LSTM line recognizer. In this mode, no other files with OCR results should be written. Without this patch, Tesseract still writes a small text file.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
2024-12-13 16:09:04 +08:00 · 2019-10-02 19:10:23 +02:00 · 2019-10-02 19:10:23 +02:00 · 58122ea313
commit 58122ea313
parent 3dfd72721b
1 changed files with 8 additions and 3 deletions
--- a/src/api/tesseractmain.cpp
+++ b/src/api/tesseractmain.cpp
@@ -565,6 +565,9 @@ static void PreloadRenderers(

    api->GetBoolVariable("tessedit_create_txt", &b);
    if (b || (!error && renderers->empty())) {
+      // Create text output when it was explicitly requested, or as the
+      // default when no other output was requested and no error
+      // occurred.
      auto* renderer =
        new tesseract::TessTextRenderer(outputbase);
      if (renderer->happy()) {
@@ -716,13 +719,15 @@ int main(int argc, char** argv) {
    return ret_val;
  }

-  // set in_training_mode to true when using one of these configs:
-  // ambigs.train, box.train, box.train.stderr, linebox, rebox
+  // Set in_training_mode to true when using one of these configs:
+  // ambigs.train, box.train, box.train.stderr, linebox, rebox, lstm.train.
+  // In this mode no other OCR result files are written.
  bool b = false;
  bool in_training_mode =
      (api.GetBoolVariable("tessedit_ambigs_training", &b) && b) ||
      (api.GetBoolVariable("tessedit_resegment_from_boxes", &b) && b) ||
-      (api.GetBoolVariable("tessedit_make_boxes_from_boxes", &b) && b);
+      (api.GetBoolVariable("tessedit_make_boxes_from_boxes", &b) && b) ||
+      (api.GetBoolVariable("tessedit_train_line_recognizer", &b) && b);

 #ifdef DISABLED_LEGACY_ENGINE
  auto cur_psm = api.GetPageSegMode();