Fail if no valid lstmf file was written (fix issue #2741)

Signed-off-by: Stefan Weil <sw@weilnetz.de>

# Conflicts:
#	src/ccmain/linerec.cpp
This commit is contained in:
zdenop 2019-11-11 08:58:03 +01:00
parent 185d237c2e
commit 975c626dc3
3 changed files with 17 additions and 11 deletions

View File

@@ -858,7 +858,9 @@ int TessBaseAPI::Recognize(ETEXT_DESC* monitor) {
}
if (tesseract_->tessedit_train_line_recognizer) {
tesseract_->TrainLineRecognizer(*input_file_, *output_file_, block_list_);
if (!tesseract_->TrainLineRecognizer(*input_file_, *output_file_, block_list_)) {
return -1;
}
tesseract_->CorrectClassifyWords(page_res_);
return 0;
}

View File

@@ -40,16 +40,17 @@ const float kWorstDictCertainty = -25.0f;
// Generates training data for training a line recognizer, eg LSTM.
// Breaks the page into lines, according to the boxes, and writes them to a
// serialized DocumentData based on output_basename.
void Tesseract::TrainLineRecognizer(const STRING& input_imagename,
// Return true if successful, false if an error occurred.
bool Tesseract::TrainLineRecognizer(const STRING& input_imagename,
const STRING& output_basename,
BLOCK_LIST *block_list) {
STRING lstmf_name = output_basename + ".lstmf";
DocumentData images(lstmf_name);
if (applybox_page > 0) {
// Load existing document for the previous pages.
if (!images.LoadDocument(lstmf_name.string(), 0, 0, nullptr)) {
tprintf("Failed to read training data from %s!\n", lstmf_name.string());
return;
if (!images.LoadDocument(lstmf_name.c_str(), 0, 0, nullptr)) {
tprintf("Failed to read training data from %s!\n", lstmf_name.c_str());
return false;
}
}
GenericVector<TBOX> boxes;
@@ -58,18 +59,20 @@ void Tesseract::TrainLineRecognizer(const STRING& input_imagename,
if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr,
nullptr) ||
boxes.empty()) {
tprintf("Failed to read boxes from %s\n", input_imagename.string());
return;
tprintf("Failed to read boxes from %s\n", input_imagename.c_str());
return false;
}
TrainFromBoxes(boxes, texts, block_list, &images);
if (images.NumPages() <= 0) {
tprintf("Failed to read pages from %s\n", input_imagename.c_str());
return;
return false;
}
images.Shuffle();
if (!images.SaveDocument(lstmf_name.string(), nullptr)) {
tprintf("Failed to write training data to %s!\n", lstmf_name.string());
if (!images.SaveDocument(lstmf_name.c_str(), nullptr)) {
tprintf("Failed to write training data to %s!\n", lstmf_name.c_str());
return false;
}
return true;
}
// Generates training data for training a line recognizer, eg LSTM.

View File

@@ -336,7 +336,8 @@ class Tesseract : public Wordrec {
// Generates training data for training a line recognizer, eg LSTM.
// Breaks the page into lines, according to the boxes, and writes them to a
// serialized DocumentData based on output_basename.
void TrainLineRecognizer(const STRING& input_imagename,
// Return true if successful, false if an error occurred.
bool TrainLineRecognizer(const STRING& input_imagename,
const STRING& output_basename,
BLOCK_LIST* block_list);
// Generates training data for training a line recognizer, eg LSTM.