From 975c626dc392b298b68924278b42414ddb56fa31 Mon Sep 17 00:00:00 2001 From: zdenop Date: Mon, 11 Nov 2019 08:58:03 +0100 Subject: [PATCH] Fail if no valid lstmf file was written (fix issue #2741) Signed-off-by: Stefan Weil # Conflicts: # src/ccmain/linerec.cpp --- src/api/baseapi.cpp | 4 +++- src/ccmain/linerec.cpp | 21 ++++++++++++--------- src/ccmain/tesseractclass.h | 3 ++- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp index 90c6a1c91..9f79dd327 100644 --- a/src/api/baseapi.cpp +++ b/src/api/baseapi.cpp @@ -858,7 +858,9 @@ int TessBaseAPI::Recognize(ETEXT_DESC* monitor) { } if (tesseract_->tessedit_train_line_recognizer) { - tesseract_->TrainLineRecognizer(*input_file_, *output_file_, block_list_); + if (!tesseract_->TrainLineRecognizer(*input_file_, *output_file_, block_list_)) { + return -1; + } tesseract_->CorrectClassifyWords(page_res_); return 0; } diff --git a/src/ccmain/linerec.cpp b/src/ccmain/linerec.cpp index 6f1c77714..fa6217c9b 100644 --- a/src/ccmain/linerec.cpp +++ b/src/ccmain/linerec.cpp @@ -40,16 +40,17 @@ const float kWorstDictCertainty = -25.0f; // Generates training data for training a line recognizer, eg LSTM. // Breaks the page into lines, according to the boxes, and writes them to a // serialized DocumentData based on output_basename. -void Tesseract::TrainLineRecognizer(const STRING& input_imagename, +// Return true if successful, false if an error occurred. +bool Tesseract::TrainLineRecognizer(const STRING& input_imagename, const STRING& output_basename, BLOCK_LIST *block_list) { STRING lstmf_name = output_basename + ".lstmf"; DocumentData images(lstmf_name); if (applybox_page > 0) { // Load existing document for the previous pages. 
- if (!images.LoadDocument(lstmf_name.string(), 0, 0, nullptr)) { - tprintf("Failed to read training data from %s!\n", lstmf_name.string()); - return; + if (!images.LoadDocument(lstmf_name.c_str(), 0, 0, nullptr)) { + tprintf("Failed to read training data from %s!\n", lstmf_name.c_str()); + return false; } } GenericVector<TBOX> boxes; @@ -58,18 +59,20 @@ void Tesseract::TrainLineRecognizer(const STRING& input_imagename, if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr, nullptr) || boxes.empty()) { - tprintf("Failed to read boxes from %s\n", input_imagename.string()); - return; + tprintf("Failed to read boxes from %s\n", input_imagename.c_str()); + return false; } TrainFromBoxes(boxes, texts, block_list, &images); if (images.NumPages() <= 0) { tprintf("Failed to read pages from %s\n", input_imagename.c_str()); - return; + return false; } images.Shuffle(); - if (!images.SaveDocument(lstmf_name.string(), nullptr)) { - tprintf("Failed to write training data to %s!\n", lstmf_name.string()); + if (!images.SaveDocument(lstmf_name.c_str(), nullptr)) { + tprintf("Failed to write training data to %s!\n", lstmf_name.c_str()); + return false; } + return true; } // Generates training data for training a line recognizer, eg LSTM. diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h index a9fc5d094..b2abcf4c1 100644 --- a/src/ccmain/tesseractclass.h +++ b/src/ccmain/tesseractclass.h @@ -336,7 +336,8 @@ class Tesseract : public Wordrec { // Generates training data for training a line recognizer, eg LSTM. // Breaks the page into lines, according to the boxes, and writes them to a // serialized DocumentData based on output_basename. - void TrainLineRecognizer(const STRING& input_imagename, + // Return true if successful, false if an error occurred. + bool TrainLineRecognizer(const STRING& input_imagename, const STRING& output_basename, BLOCK_LIST* block_list); // Generates training data for training a line recognizer, eg LSTM.