mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-27 12:49:35 +08:00
Add a new renderer to create box files from images for LSTM training
(cherry picked from commit 921da6be2bdbda2ddd64514f9b6bec40a336246a) fix typo (cherry picked from commit 7bd1a0c80393fce2f34e2845cb26760bcf3791cd) Add lstmboxrenderer to CMakeLists (cherry picked from commit cfef3a889aef830725921b5c0218d5e9c633b03e) fix formatting (cherry picked from commit 7ba2b01ede7940ed609a073364948ef8c838cd10)
This commit is contained in:
parent
56725de8b7
commit
9c89cd51cf
@ -252,6 +252,7 @@ set(tesseract_src ${tesseract_src}
|
||||
src/api/renderer.cpp
|
||||
src/api/altorenderer.cpp
|
||||
src/api/hocrrenderer.cpp
|
||||
src/api/lstmboxrenderer.cpp
|
||||
src/api/pdfrenderer.cpp
|
||||
)
|
||||
|
||||
|
@ -35,6 +35,7 @@ endif
|
||||
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp
|
||||
libtesseract_api_la_SOURCES += altorenderer.cpp
|
||||
libtesseract_api_la_SOURCES += hocrrenderer.cpp
|
||||
libtesseract_api_la_SOURCES += lstmboxrenderer.cpp
|
||||
libtesseract_api_la_SOURCES += pdfrenderer.cpp
|
||||
libtesseract_api_la_SOURCES += renderer.cpp
|
||||
|
||||
|
@ -613,6 +613,14 @@ class TESS_API TessBaseAPI {
|
||||
* Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
char* GetTSVText(int page_number);
|
||||
|
||||
/**
|
||||
* Make a box file for LSTM training from the internal data structures.
|
||||
* Constructs coordinates in the original image - not just the rectangle.
|
||||
* page_number is a 0-based page index that will appear in the box file.
|
||||
* Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
char* GetLSTMBOXText(int page_number);
|
||||
|
||||
/**
|
||||
* The recognized text is returned as a char* which is coded in the same
|
||||
|
@ -268,7 +268,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
|
||||
if (grapheme && grapheme[0] != 0) {
|
||||
if (hocr_boxes) {
|
||||
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
|
||||
hocr_str << "<span class='ocrx_cinfo' title='x_bboxes "
|
||||
hocr_str << "\n <span class='ocrx_cinfo' title='x_bboxes "
|
||||
<< left << " " << top << " " << right << " " << bottom
|
||||
<< "; x_conf " << res_it->Confidence(RIL_SYMBOL) << "'>";
|
||||
}
|
||||
|
110
src/api/lstmboxrenderer.cpp
Normal file
110
src/api/lstmboxrenderer.cpp
Normal file
@ -0,0 +1,110 @@
|
||||
/**********************************************************************
|
||||
* File: lstmboxrenderer.cpp
|
||||
* Description: Renderer for creating box file for LSTM training.
|
||||
* based on the tsv renderer.
|
||||
*
|
||||
* (C) Copyright 2006, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
|
||||
#include <locale> // for std::locale::classic
|
||||
#include <memory> // for std::unique_ptr
|
||||
#include <sstream> // for std::stringstream
|
||||
#include "baseapi.h" // for TessBaseAPI
|
||||
#include "renderer.h"
|
||||
#include "tesseractclass.h" // for Tesseract
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
/**
|
||||
* Create a UTF8 box file for LSTM training from the internal data structures.
|
||||
* page_number is a 0-base page index that will appear in the box file.
|
||||
* Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
|
||||
char* TessBaseAPI::GetLSTMBOXText(int page_number) {
|
||||
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0))
|
||||
return nullptr;
|
||||
|
||||
STRING lstm_box_str("");
|
||||
|
||||
int page_num = page_number;
|
||||
bool first_word = true;
|
||||
|
||||
LTRResultIterator* res_it = GetLTRIterator();
|
||||
while (!res_it->Empty(RIL_BLOCK)) {
|
||||
if (res_it->Empty(RIL_SYMBOL)) {
|
||||
res_it->Next(RIL_SYMBOL);
|
||||
continue;
|
||||
}
|
||||
|
||||
int left, top, right, bottom;
|
||||
|
||||
if (!first_word) {
|
||||
if (res_it->IsAtBeginningOf(RIL_WORD)) {
|
||||
lstm_box_str.add_str_int(" ", left);
|
||||
lstm_box_str.add_str_int(" ", image_height_ - bottom);
|
||||
lstm_box_str.add_str_int(" ", right + 2);
|
||||
lstm_box_str.add_str_int(" ", image_height_ - top);
|
||||
lstm_box_str.add_str_int(" ", page_num); // level 5 - word
|
||||
lstm_box_str += "\n"; // end of row for word
|
||||
}
|
||||
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
||||
lstm_box_str.add_str_int("\t ", left);
|
||||
lstm_box_str.add_str_int(" ", image_height_ - bottom);
|
||||
lstm_box_str.add_str_int(" ", right + 5);
|
||||
lstm_box_str.add_str_int(" ", image_height_ - top);
|
||||
lstm_box_str.add_str_int(" ", page_num); // level 4 - line
|
||||
lstm_box_str += "\n"; // end of row for line
|
||||
}
|
||||
}
|
||||
first_word=false;
|
||||
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
|
||||
|
||||
do {
|
||||
lstm_box_str +=std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
|
||||
res_it->Next(RIL_SYMBOL);
|
||||
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_SYMBOL));
|
||||
|
||||
lstm_box_str.add_str_int(" ", left);
|
||||
lstm_box_str.add_str_int(" ", image_height_ - bottom);
|
||||
lstm_box_str.add_str_int(" ", right);
|
||||
lstm_box_str.add_str_int(" ", image_height_ - top);
|
||||
lstm_box_str.add_str_int(" ", page_num); // level 6 - symbol
|
||||
lstm_box_str += "\n"; // end of row
|
||||
|
||||
}
|
||||
|
||||
char* ret = new char[lstm_box_str.length() + 1];
|
||||
strcpy(ret, lstm_box_str.string());
|
||||
delete res_it;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
* LSTMBOX Renderer interface implementation
|
||||
**********************************************************************/
|
||||
// Forwards the output base name to the generic renderer together with the
// "box" extension argument.
TessLSTMBOXRenderer::TessLSTMBOXRenderer(const char* outputbase)
    : TessResultRenderer(outputbase, "box") {}
|
||||
|
||||
// Fetches the LSTM box text for the current image from the API and appends
// it to the renderer output. Returns false when the API yields no text,
// true otherwise.
bool TessLSTMBOXRenderer::AddImageHandler(TessBaseAPI* api) {
  // The API hands back a new[]-allocated buffer; the array unique_ptr
  // releases it with delete [] when this function returns.
  const std::unique_ptr<const char[]> box_text(
      api->GetLSTMBOXText(imagenum()));
  if (!box_text) {
    return false;
  }
  AppendString(box_text.get());
  return true;
}
|
||||
|
||||
} // namespace tesseract.
|
@ -247,6 +247,17 @@ class TESS_API TessUnlvRenderer : public TessResultRenderer {
|
||||
virtual bool AddImageHandler(TessBaseAPI* api);
|
||||
};
|
||||
|
||||
/**
|
||||
* Renders tesseract output into a plain UTF-8 text string for LSTMBOX
|
||||
*/
|
||||
class TESS_API TessLSTMBOXRenderer : public TessResultRenderer {
|
||||
public:
|
||||
explicit TessLSTMBOXRenderer(const char *outputbase);
|
||||
|
||||
protected:
|
||||
virtual bool AddImageHandler(TessBaseAPI* api);
|
||||
};
|
||||
|
||||
/**
|
||||
* Renders tesseract output into a plain UTF-8 text string
|
||||
*/
|
||||
|
@ -494,6 +494,20 @@ static void PreloadRenderers(
|
||||
}
|
||||
}
|
||||
|
||||
api->GetBoolVariable("tessedit_create_lstmbox", &b);
|
||||
if (b) {
|
||||
tesseract::TessLSTMBOXRenderer* renderer =
|
||||
new tesseract::TessLSTMBOXRenderer(outputbase);
|
||||
if (renderer->happy()) {
|
||||
renderers->push_back(renderer);
|
||||
} else {
|
||||
delete renderer;
|
||||
tprintf("Error, could not create LSTM BOX output file: %s\n",
|
||||
strerror(errno));
|
||||
error = true;
|
||||
}
|
||||
}
|
||||
|
||||
api->GetBoolVariable("tessedit_create_boxfile", &b);
|
||||
if (b) {
|
||||
tesseract::TessBoxTextRenderer* renderer =
|
||||
|
@ -391,6 +391,8 @@ Tesseract::Tesseract()
|
||||
this->params()),
|
||||
BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file",
|
||||
this->params()),
|
||||
BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training",
|
||||
this->params()),
|
||||
BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
|
||||
this->params()),
|
||||
BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
|
||||
|
@ -1040,6 +1040,7 @@ class Tesseract : public Wordrec {
|
||||
BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
|
||||
BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
|
||||
BOOL_VAR_H(tessedit_create_alto, false, "Write .xml ALTO output file");
|
||||
BOOL_VAR_H(tessedit_create_lstmbox, false, "Write .box file for LSTM training");
|
||||
BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
|
||||
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
|
||||
BOOL_VAR_H(textonly_pdf, false,
|
||||
|
1
tessdata/configs/lstmbox
Normal file
1
tessdata/configs/lstmbox
Normal file
@ -0,0 +1 @@
|
||||
tessedit_create_lstmbox 1
|
Loading…
Reference in New Issue
Block a user