mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-27 20:59:36 +08:00
Merge pull request #2231 from Shreeshrii/wordstr
Add renderer to create WordStr box files from images
This commit is contained in:
commit
15f2a4b2c1
@ -266,6 +266,7 @@ set(tesseract_src ${tesseract_src}
|
|||||||
src/api/hocrrenderer.cpp
|
src/api/hocrrenderer.cpp
|
||||||
src/api/lstmboxrenderer.cpp
|
src/api/lstmboxrenderer.cpp
|
||||||
src/api/pdfrenderer.cpp
|
src/api/pdfrenderer.cpp
|
||||||
|
src/api/wordstrboxrenderer.cpp
|
||||||
)
|
)
|
||||||
|
|
||||||
if (WIN32)
|
if (WIN32)
|
||||||
|
@ -37,6 +37,7 @@ libtesseract_api_la_SOURCES += altorenderer.cpp
|
|||||||
libtesseract_api_la_SOURCES += hocrrenderer.cpp
|
libtesseract_api_la_SOURCES += hocrrenderer.cpp
|
||||||
libtesseract_api_la_SOURCES += lstmboxrenderer.cpp
|
libtesseract_api_la_SOURCES += lstmboxrenderer.cpp
|
||||||
libtesseract_api_la_SOURCES += pdfrenderer.cpp
|
libtesseract_api_la_SOURCES += pdfrenderer.cpp
|
||||||
|
libtesseract_api_la_SOURCES += wordstrboxrenderer.cpp
|
||||||
libtesseract_api_la_SOURCES += renderer.cpp
|
libtesseract_api_la_SOURCES += renderer.cpp
|
||||||
|
|
||||||
lib_LTLIBRARIES += libtesseract.la
|
lib_LTLIBRARIES += libtesseract.la
|
||||||
|
@ -631,6 +631,14 @@ class TESS_API TessBaseAPI {
|
|||||||
*/
|
*/
|
||||||
char* GetBoxText(int page_number);
|
char* GetBoxText(int page_number);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The recognized text is returned as a char* which is coded in the same
|
||||||
|
* format as a WordStr box file used in training.
|
||||||
|
* page_number is a 0-based page index that will appear in the box file.
|
||||||
|
* Returned string must be freed with the delete [] operator.
|
||||||
|
*/
|
||||||
|
char* GetWordStrBoxText(int page_number);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The recognized text is returned as a char* which is coded
|
* The recognized text is returned as a char* which is coded
|
||||||
* as UNLV format Latin-1 with specific reject and suspect codes.
|
* as UNLV format Latin-1 with specific reject and suspect codes.
|
||||||
|
@ -269,6 +269,17 @@ class TESS_API TessBoxTextRenderer : public TessResultRenderer {
|
|||||||
virtual bool AddImageHandler(TessBaseAPI* api);
|
virtual bool AddImageHandler(TessBaseAPI* api);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Renders tesseract output into a plain UTF-8 text string in WordStr format
|
||||||
|
*/
|
||||||
|
class TESS_API TessWordStrBoxRenderer : public TessResultRenderer {
|
||||||
|
public:
|
||||||
|
explicit TessWordStrBoxRenderer(const char* outputbase);
|
||||||
|
|
||||||
|
protected:
|
||||||
|
virtual bool AddImageHandler(TessBaseAPI* api);
|
||||||
|
};
|
||||||
|
|
||||||
#ifndef DISABLED_LEGACY_ENGINE
|
#ifndef DISABLED_LEGACY_ENGINE
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -524,6 +524,20 @@ static void PreloadRenderers(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
api->GetBoolVariable("tessedit_create_wordstrbox", &b);
|
||||||
|
if (b) {
|
||||||
|
tesseract::TessWordStrBoxRenderer* renderer =
|
||||||
|
new tesseract::TessWordStrBoxRenderer(outputbase);
|
||||||
|
if (renderer->happy()) {
|
||||||
|
renderers->push_back(renderer);
|
||||||
|
} else {
|
||||||
|
delete renderer;
|
||||||
|
tprintf("Error, could not create WordStr BOX output file: %s\n",
|
||||||
|
strerror(errno));
|
||||||
|
error = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
api->GetBoolVariable("tessedit_create_txt", &b);
|
api->GetBoolVariable("tessedit_create_txt", &b);
|
||||||
if (b || (!error && renderers->empty())) {
|
if (b || (!error && renderers->empty())) {
|
||||||
tesseract::TessTextRenderer* renderer =
|
tesseract::TessTextRenderer* renderer =
|
||||||
|
101
src/api/wordstrboxrenderer.cpp
Normal file
101
src/api/wordstrboxrenderer.cpp
Normal file
@ -0,0 +1,101 @@
|
|||||||
|
/**********************************************************************
|
||||||
|
* File: wordstrboxrenderer.cpp
|
||||||
|
* Description: Renderer for creating box file with WordStr strings.
|
||||||
|
* based on the tsv renderer.
|
||||||
|
*
|
||||||
|
* (C) Copyright 2006, Google Inc.
|
||||||
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
** you may not use this file except in compliance with the License.
|
||||||
|
** You may obtain a copy of the License at
|
||||||
|
** http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
** Unless required by applicable law or agreed to in writing, software
|
||||||
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
** See the License for the specific language governing permissions and
|
||||||
|
** limitations under the License.
|
||||||
|
*
|
||||||
|
**********************************************************************/
|
||||||
|
|
||||||
|
#include "baseapi.h" // for TessBaseAPI
|
||||||
|
#include "renderer.h"
|
||||||
|
#include "tesseractclass.h" // for Tesseract
|
||||||
|
|
||||||
|
namespace tesseract {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a UTF8 box file with WordStr strings from the internal data structures.
|
||||||
|
* page_number is a 0-base page index that will appear in the box file.
|
||||||
|
* Returned string must be freed with the delete [] operator.
|
||||||
|
*/
|
||||||
|
|
||||||
|
char* TessBaseAPI::GetWordStrBoxText(int page_number) {
|
||||||
|
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0))
|
||||||
|
return nullptr;
|
||||||
|
|
||||||
|
STRING wordstr_box_str("");
|
||||||
|
int left, top, right, bottom;
|
||||||
|
int page_num = page_number;
|
||||||
|
bool first_line = true;
|
||||||
|
|
||||||
|
LTRResultIterator* res_it = GetLTRIterator();
|
||||||
|
while (!res_it->Empty(RIL_BLOCK)) {
|
||||||
|
if (res_it->Empty(RIL_WORD)) {
|
||||||
|
res_it->Next(RIL_WORD);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
||||||
|
if (!first_line) {
|
||||||
|
wordstr_box_str.add_str_int("\n\t ", right + 1);
|
||||||
|
wordstr_box_str.add_str_int(" ", image_height_ - bottom);
|
||||||
|
wordstr_box_str.add_str_int(" ", right + 5);
|
||||||
|
wordstr_box_str.add_str_int(" ", image_height_ - top);
|
||||||
|
wordstr_box_str.add_str_int(" ", page_num); // row for tab for EOL
|
||||||
|
wordstr_box_str += "\n";
|
||||||
|
} else {
|
||||||
|
first_line = false;
|
||||||
|
}
|
||||||
|
// Use bounding box for whole line for WordStr
|
||||||
|
res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
|
||||||
|
wordstr_box_str.add_str_int("WordStr ", left);
|
||||||
|
wordstr_box_str.add_str_int(" ", image_height_ - bottom);
|
||||||
|
wordstr_box_str.add_str_int(" ", right);
|
||||||
|
wordstr_box_str.add_str_int(" ", image_height_ - top);
|
||||||
|
wordstr_box_str.add_str_int(" ", page_num); // word
|
||||||
|
wordstr_box_str += " #";
|
||||||
|
}
|
||||||
|
do { wordstr_box_str +=
|
||||||
|
std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
|
||||||
|
wordstr_box_str += " ";
|
||||||
|
res_it->Next(RIL_WORD);
|
||||||
|
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
||||||
|
}
|
||||||
|
wordstr_box_str.add_str_int("\n\t ", right + 1);
|
||||||
|
wordstr_box_str.add_str_int(" ", image_height_ - bottom);
|
||||||
|
wordstr_box_str.add_str_int(" ", right + 5);
|
||||||
|
wordstr_box_str.add_str_int(" ", image_height_ - top);
|
||||||
|
wordstr_box_str.add_str_int(" ", page_num); // row for tab for EOL
|
||||||
|
wordstr_box_str += "\n";
|
||||||
|
char* ret = new char[wordstr_box_str.length() + 1];
|
||||||
|
strcpy(ret, wordstr_box_str.string());
|
||||||
|
delete res_it;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**********************************************************************
|
||||||
|
* WordStrBox Renderer interface implementation
|
||||||
|
**********************************************************************/
|
||||||
|
// Forwards the output base name to the generic result renderer and selects
// "box" as the extension passed to it.
TessWordStrBoxRenderer::TessWordStrBoxRenderer(const char* outputbase)
    : TessResultRenderer(outputbase, "box") {}
|
||||||
|
|
||||||
|
// Fetches the WordStr box text for the current image from the API and appends
// it to the output. Returns false when the API yields no text (nullptr),
// true otherwise.
bool TessWordStrBoxRenderer::AddImageHandler(TessBaseAPI* api) {
  // The API hands back a new[]-allocated buffer; the smart pointer frees it.
  const std::unique_ptr<const char[]> box_text(
      api->GetWordStrBoxText(imagenum()));
  if (box_text) {
    AppendString(box_text.get());
    return true;
  }
  return false;
}
|
||||||
|
|
||||||
|
} // namespace tesseract.
|
@ -395,6 +395,8 @@ Tesseract::Tesseract()
|
|||||||
this->params()),
|
this->params()),
|
||||||
BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
|
BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
|
||||||
this->params()),
|
this->params()),
|
||||||
|
BOOL_MEMBER(tessedit_create_wordstrbox, false, "Write WordStr format .box output file",
|
||||||
|
this->params()),
|
||||||
BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
|
BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
|
||||||
this->params()),
|
this->params()),
|
||||||
BOOL_MEMBER(textonly_pdf, false,
|
BOOL_MEMBER(textonly_pdf, false,
|
||||||
|
@ -1042,6 +1042,7 @@ class Tesseract : public Wordrec {
|
|||||||
BOOL_VAR_H(tessedit_create_alto, false, "Write .xml ALTO output file");
|
BOOL_VAR_H(tessedit_create_alto, false, "Write .xml ALTO output file");
|
||||||
BOOL_VAR_H(tessedit_create_lstmbox, false, "Write .box file for LSTM training");
|
BOOL_VAR_H(tessedit_create_lstmbox, false, "Write .box file for LSTM training");
|
||||||
BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
|
BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
|
||||||
|
BOOL_VAR_H(tessedit_create_wordstrbox, false, "Write WordStr format .box output file");
|
||||||
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
|
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
|
||||||
BOOL_VAR_H(textonly_pdf, false,
|
BOOL_VAR_H(textonly_pdf, false,
|
||||||
"Create PDF with only one invisible text layer");
|
"Create PDF with only one invisible text layer");
|
||||||
|
1
tessdata/configs/wordstrbox
Normal file
1
tessdata/configs/wordstrbox
Normal file
@ -0,0 +1 @@
|
|||||||
|
tessedit_create_wordstrbox 1
|
Loading…
Reference in New Issue
Block a user