Add a new renderer to create box files from images for LSTM training

(cherry picked from commit 921da6be2bdbda2ddd64514f9b6bec40a336246a)

fix typo

(cherry picked from commit 7bd1a0c80393fce2f34e2845cb26760bcf3791cd)

Add lstmboxrenderer to CMakeLists

(cherry picked from commit cfef3a889aef830725921b5c0218d5e9c633b03e)

fix formatting

(cherry picked from commit 7ba2b01ede7940ed609a073364948ef8c838cd10)
This commit is contained in:
Shree Devi Kumar 2019-01-31 17:30:59 +00:00
parent 56725de8b7
commit 9c89cd51cf
10 changed files with 150 additions and 1 deletions

View File

@ -252,6 +252,7 @@ set(tesseract_src ${tesseract_src}
src/api/renderer.cpp
src/api/altorenderer.cpp
src/api/hocrrenderer.cpp
src/api/lstmboxrenderer.cpp
src/api/pdfrenderer.cpp
)

View File

@ -35,6 +35,7 @@ endif
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp
libtesseract_api_la_SOURCES += altorenderer.cpp
libtesseract_api_la_SOURCES += hocrrenderer.cpp
libtesseract_api_la_SOURCES += lstmboxrenderer.cpp
libtesseract_api_la_SOURCES += pdfrenderer.cpp
libtesseract_api_la_SOURCES += renderer.cpp

View File

@ -613,6 +613,14 @@ class TESS_API TessBaseAPI {
* Returned string must be freed with the delete [] operator.
*/
char* GetTSVText(int page_number);
/**
* Make a box file for LSTM training from the internal data structures.
* Constructs coordinates in the original image - not just the rectangle.
* page_number is a 0-based page index that will appear in the box file.
* Returned string must be freed with the delete [] operator.
*/
char* GetLSTMBOXText(int page_number);
/**
* The recognized text is returned as a char* which is coded in the same

View File

@ -268,7 +268,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
if (grapheme && grapheme[0] != 0) {
if (hocr_boxes) {
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
hocr_str << "<span class='ocrx_cinfo' title='x_bboxes "
hocr_str << "\n <span class='ocrx_cinfo' title='x_bboxes "
<< left << " " << top << " " << right << " " << bottom
<< "; x_conf " << res_it->Confidence(RIL_SYMBOL) << "'>";
}

110
src/api/lstmboxrenderer.cpp Normal file
View File

@ -0,0 +1,110 @@
/**********************************************************************
* File: lstmboxrenderer.cpp
* Description: Renderer for creating box file for LSTM training.
* based on the tsv renderer.
*
* (C) Copyright 2006, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include <locale> // for std::locale::classic
#include <memory> // for std::unique_ptr
#include <sstream> // for std::stringstream
#include "baseapi.h" // for TessBaseAPI
#include "renderer.h"
#include "tesseractclass.h" // for Tesseract
namespace tesseract {
/**
* Create a UTF8 box file for LSTM training from the internal data structures.
* page_number is a 0-base page index that will appear in the box file.
* Returned string must be freed with the delete [] operator.
*/
char* TessBaseAPI::GetLSTMBOXText(int page_number) {
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0))
return nullptr;
STRING lstm_box_str("");
int page_num = page_number;
bool first_word = true;
LTRResultIterator* res_it = GetLTRIterator();
while (!res_it->Empty(RIL_BLOCK)) {
if (res_it->Empty(RIL_SYMBOL)) {
res_it->Next(RIL_SYMBOL);
continue;
}
int left, top, right, bottom;
if (!first_word) {
if (res_it->IsAtBeginningOf(RIL_WORD)) {
lstm_box_str.add_str_int(" ", left);
lstm_box_str.add_str_int(" ", image_height_ - bottom);
lstm_box_str.add_str_int(" ", right + 2);
lstm_box_str.add_str_int(" ", image_height_ - top);
lstm_box_str.add_str_int(" ", page_num); // level 5 - word
lstm_box_str += "\n"; // end of row for word
}
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
lstm_box_str.add_str_int("\t ", left);
lstm_box_str.add_str_int(" ", image_height_ - bottom);
lstm_box_str.add_str_int(" ", right + 5);
lstm_box_str.add_str_int(" ", image_height_ - top);
lstm_box_str.add_str_int(" ", page_num); // level 4 - line
lstm_box_str += "\n"; // end of row for line
}
}
first_word=false;
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
do {
lstm_box_str +=std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
res_it->Next(RIL_SYMBOL);
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_SYMBOL));
lstm_box_str.add_str_int(" ", left);
lstm_box_str.add_str_int(" ", image_height_ - bottom);
lstm_box_str.add_str_int(" ", right);
lstm_box_str.add_str_int(" ", image_height_ - top);
lstm_box_str.add_str_int(" ", page_num); // level 6 - symbol
lstm_box_str += "\n"; // end of row
}
char* ret = new char[lstm_box_str.length() + 1];
strcpy(ret, lstm_box_str.string());
delete res_it;
return ret;
}
/**********************************************************************
* LSTMBOX Renderer interface implementation
**********************************************************************/
/**
 * Builds the renderer by forwarding outputbase to TessResultRenderer
 * together with the "box" suffix string.
 */
TessLSTMBOXRenderer::TessLSTMBOXRenderer(const char* outputbase)
    : TessResultRenderer(outputbase, "box") {}
/**
 * Fetches the LSTM box text for the current image from api and appends it
 * to the renderer output. Returns false when no text could be produced.
 */
bool TessLSTMBOXRenderer::AddImageHandler(TessBaseAPI* api) {
  // The API returns a new[]-allocated buffer; the array unique_ptr frees it.
  const std::unique_ptr<const char[]> box_text(
      api->GetLSTMBOXText(imagenum()));
  if (!box_text) {
    return false;
  }
  AppendString(box_text.get());
  return true;
}
} // namespace tesseract.

View File

@ -247,6 +247,17 @@ class TESS_API TessUnlvRenderer : public TessResultRenderer {
virtual bool AddImageHandler(TessBaseAPI* api);
};
/**
* Renders tesseract output as UTF-8 box file text for LSTM training.
* The text for each image is obtained from TessBaseAPI::GetLSTMBOXText.
*/
class TESS_API TessLSTMBOXRenderer : public TessResultRenderer {
public:
// outputbase is forwarded to the TessResultRenderer base class.
explicit TessLSTMBOXRenderer(const char *outputbase);
protected:
// Appends the box text of the current image; returns false on failure.
virtual bool AddImageHandler(TessBaseAPI* api);
};
/**
* Renders tesseract output into a plain UTF-8 text string
*/

View File

@ -494,6 +494,20 @@ static void PreloadRenderers(
}
}
api->GetBoolVariable("tessedit_create_lstmbox", &b);
if (b) {
tesseract::TessLSTMBOXRenderer* renderer =
new tesseract::TessLSTMBOXRenderer(outputbase);
if (renderer->happy()) {
renderers->push_back(renderer);
} else {
delete renderer;
tprintf("Error, could not create LSTM BOX output file: %s\n",
strerror(errno));
error = true;
}
}
api->GetBoolVariable("tessedit_create_boxfile", &b);
if (b) {
tesseract::TessBoxTextRenderer* renderer =

View File

@ -391,6 +391,8 @@ Tesseract::Tesseract()
this->params()),
BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file",
this->params()),
BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training",
this->params()),
BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
this->params()),
BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",

View File

@ -1040,6 +1040,7 @@ class Tesseract : public Wordrec {
BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
BOOL_VAR_H(tessedit_create_alto, false, "Write .xml ALTO output file");
BOOL_VAR_H(tessedit_create_lstmbox, false, "Write .box file for LSTM training");
BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
BOOL_VAR_H(textonly_pdf, false,

1
tessdata/configs/lstmbox Normal file
View File

@ -0,0 +1 @@
tessedit_create_lstmbox 1