mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-24 02:59:07 +08:00
commit
2ae65b2493
@ -252,6 +252,7 @@ set(tesseract_src ${tesseract_src}
|
||||
src/api/renderer.cpp
|
||||
src/api/altorenderer.cpp
|
||||
src/api/hocrrenderer.cpp
|
||||
src/api/lstmboxrenderer.cpp
|
||||
src/api/pdfrenderer.cpp
|
||||
)
|
||||
|
||||
|
@ -35,6 +35,7 @@ endif
|
||||
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp
|
||||
libtesseract_api_la_SOURCES += altorenderer.cpp
|
||||
libtesseract_api_la_SOURCES += hocrrenderer.cpp
|
||||
libtesseract_api_la_SOURCES += lstmboxrenderer.cpp
|
||||
libtesseract_api_la_SOURCES += pdfrenderer.cpp
|
||||
libtesseract_api_la_SOURCES += renderer.cpp
|
||||
|
||||
|
@ -613,6 +613,14 @@ class TESS_API TessBaseAPI {
|
||||
* Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
char* GetTSVText(int page_number);
|
||||
|
||||
/**
|
||||
* Make a box file for LSTM training from the internal data structures.
|
||||
* Constructs coordinates in the original image - not just the rectangle.
|
||||
* page_number is a 0-based page index that will appear in the box file.
|
||||
* Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
char* GetLSTMBOXText(int page_number);
|
||||
|
||||
/**
|
||||
* The recognized text is returned as a char* which is coded in the same
|
||||
|
@ -268,7 +268,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
|
||||
if (grapheme && grapheme[0] != 0) {
|
||||
if (hocr_boxes) {
|
||||
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
|
||||
hocr_str << "<span class='ocrx_cinfo' title='x_bboxes "
|
||||
hocr_str << "\n <span class='ocrx_cinfo' title='x_bboxes "
|
||||
<< left << " " << top << " " << right << " " << bottom
|
||||
<< "; x_conf " << res_it->Confidence(RIL_SYMBOL) << "'>";
|
||||
}
|
||||
|
105
src/api/lstmboxrenderer.cpp
Normal file
105
src/api/lstmboxrenderer.cpp
Normal file
@ -0,0 +1,105 @@
|
||||
/**********************************************************************
|
||||
* File: lstmboxrenderer.cpp
|
||||
* Description: Renderer for creating box file for LSTM training.
|
||||
* based on the tsv renderer.
|
||||
*
|
||||
* (C) Copyright 2006, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include "baseapi.h" // for TessBaseAPI
|
||||
#include "renderer.h"
|
||||
#include "tesseractclass.h" // for Tesseract
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
/**
|
||||
* Create a UTF8 box file for LSTM training from the internal data structures.
|
||||
* page_number is a 0-base page index that will appear in the box file.
|
||||
* Returned string must be freed with the delete [] operator.
|
||||
*/
|
||||
static void AddBoxToLSTM(int right, int bottom, int top,
|
||||
int image_height_, int page_num,
|
||||
STRING* text) {
|
||||
text->add_str_int(" ", image_height_ - bottom);
|
||||
text->add_str_int(" ", right + 5);
|
||||
text->add_str_int(" ", image_height_ - top);
|
||||
text->add_str_int(" ", page_num);
|
||||
}
|
||||
|
||||
char* TessBaseAPI::GetLSTMBOXText(int page_number) {
|
||||
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0))
|
||||
return nullptr;
|
||||
|
||||
STRING lstm_box_str("");
|
||||
int page_num = page_number;
|
||||
bool first_word = true;
|
||||
int left, top, right, bottom;
|
||||
|
||||
LTRResultIterator* res_it = GetLTRIterator();
|
||||
while (!res_it->Empty(RIL_BLOCK)) {
|
||||
if (res_it->Empty(RIL_SYMBOL)) {
|
||||
res_it->Next(RIL_SYMBOL);
|
||||
continue;
|
||||
}
|
||||
if (!first_word) {
|
||||
if (!(res_it->IsAtBeginningOf(RIL_TEXTLINE))) {
|
||||
if (res_it->IsAtBeginningOf(RIL_WORD)) {
|
||||
lstm_box_str.add_str_int(" ", left);
|
||||
AddBoxToLSTM(right, bottom, top, image_height_, page_num, &lstm_box_str);
|
||||
lstm_box_str += "\n"; // end of row for word
|
||||
} // word
|
||||
} else {
|
||||
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
||||
lstm_box_str.add_str_int("\t ", left);
|
||||
AddBoxToLSTM(right, bottom, top, image_height_, page_num, &lstm_box_str);
|
||||
lstm_box_str += "\n"; // end of row for line
|
||||
} // line
|
||||
}
|
||||
} // not first word
|
||||
first_word=false;
|
||||
// Use bounding box for whole line for everything
|
||||
res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
|
||||
do { lstm_box_str +=
|
||||
std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
|
||||
res_it->Next(RIL_SYMBOL);
|
||||
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_SYMBOL));
|
||||
lstm_box_str.add_str_int(" ", left);
|
||||
AddBoxToLSTM(right, bottom, top, image_height_, page_num, &lstm_box_str);
|
||||
lstm_box_str += "\n"; // end of row for symbol
|
||||
}
|
||||
lstm_box_str.add_str_int("\t ", left);
|
||||
AddBoxToLSTM(right, bottom, top, image_height_, page_num, &lstm_box_str);
|
||||
lstm_box_str += "\n"; // end of PAGE
|
||||
char* ret = new char[lstm_box_str.length() + 1];
|
||||
strcpy(ret, lstm_box_str.string());
|
||||
delete res_it;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**********************************************************************
|
||||
* LSTMBOX Renderer interface implementation
|
||||
**********************************************************************/
|
||||
/**
 * Construct the LSTM box renderer by forwarding outputbase to the base
 * renderer together with the literal "box" (presumably the extension of the
 * output file - confirm against TessResultRenderer).
 */
TessLSTMBOXRenderer::TessLSTMBOXRenderer(const char* outputbase)
    : TessResultRenderer(outputbase, "box") {}
|
||||
|
||||
/**
 * Fetch the LSTM box text for the current image (index imagenum()) from api
 * and append it to this renderer's output.
 *
 * Returns false when the API produced no text (nullptr), true otherwise.
 * The text buffer is owned locally and released with delete [] on return.
 */
bool TessLSTMBOXRenderer::AddImageHandler(TessBaseAPI* api) {
  const std::unique_ptr<const char[]> box_text(
      api->GetLSTMBOXText(imagenum()));
  if (box_text != nullptr) {
    AppendString(box_text.get());
    return true;
  }
  return false;
}
|
||||
|
||||
} // namespace tesseract.
|
@ -247,6 +247,17 @@ class TESS_API TessUnlvRenderer : public TessResultRenderer {
|
||||
virtual bool AddImageHandler(TessBaseAPI* api);
|
||||
};
|
||||
|
||||
/**
|
||||
* Renders tesseract output into a plain UTF-8 text string for LSTMBOX
|
||||
*/
|
||||
class TESS_API TessLSTMBOXRenderer : public TessResultRenderer {
|
||||
public:
|
||||
explicit TessLSTMBOXRenderer(const char* outputbase);
|
||||
|
||||
protected:
|
||||
virtual bool AddImageHandler(TessBaseAPI* api);
|
||||
};
|
||||
|
||||
/**
|
||||
* Renders tesseract output into a plain UTF-8 text string
|
||||
*/
|
||||
|
@ -496,6 +496,20 @@ static void PreloadRenderers(
|
||||
}
|
||||
}
|
||||
|
||||
api->GetBoolVariable("tessedit_create_lstmbox", &b);
|
||||
if (b) {
|
||||
tesseract::TessLSTMBOXRenderer* renderer =
|
||||
new tesseract::TessLSTMBOXRenderer(outputbase);
|
||||
if (renderer->happy()) {
|
||||
renderers->push_back(renderer);
|
||||
} else {
|
||||
delete renderer;
|
||||
tprintf("Error, could not create LSTM BOX output file: %s\n",
|
||||
strerror(errno));
|
||||
error = true;
|
||||
}
|
||||
}
|
||||
|
||||
api->GetBoolVariable("tessedit_create_boxfile", &b);
|
||||
if (b) {
|
||||
tesseract::TessBoxTextRenderer* renderer =
|
||||
|
@ -391,6 +391,8 @@ Tesseract::Tesseract()
|
||||
this->params()),
|
||||
BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file",
|
||||
this->params()),
|
||||
BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training",
|
||||
this->params()),
|
||||
BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
|
||||
this->params()),
|
||||
BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
|
||||
|
@ -1040,6 +1040,7 @@ class Tesseract : public Wordrec {
|
||||
BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
|
||||
BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
|
||||
BOOL_VAR_H(tessedit_create_alto, false, "Write .xml ALTO output file");
|
||||
BOOL_VAR_H(tessedit_create_lstmbox, false, "Write .box file for LSTM training");
|
||||
BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
|
||||
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
|
||||
BOOL_VAR_H(textonly_pdf, false,
|
||||
|
1
tessdata/configs/lstmbox
Normal file
1
tessdata/configs/lstmbox
Normal file
@ -0,0 +1 @@
|
||||
tessedit_create_lstmbox 1
|
Loading…
Reference in New Issue
Block a user