mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-11 04:33:20 +08:00
put common code in AddBoxToLSTM
This commit is contained in:
parent
b51c1bf05a
commit
311053681c
@ -27,61 +27,59 @@ namespace tesseract {
|
|||||||
* page_number is a 0-based page index that will appear in the box file.
|
* page_number is a 0-based page index that will appear in the box file.
|
||||||
* Returned string must be freed with the delete [] operator.
|
* Returned string must be freed with the delete [] operator.
|
||||||
*/
|
*/
|
||||||
|
static void AddBoxToLSTM(int right, int bottom, int top,
|
||||||
|
int image_height_, int page_num,
|
||||||
|
STRING* text) {
|
||||||
|
text->add_str_int(" ", image_height_ - bottom);
|
||||||
|
text->add_str_int(" ", right + 5);
|
||||||
|
text->add_str_int(" ", image_height_ - top);
|
||||||
|
text->add_str_int(" ", page_num);
|
||||||
|
}
|
||||||
|
|
||||||
char* TessBaseAPI::GetLSTMBOXText(int page_number) {
|
char* TessBaseAPI::GetLSTMBOXText(int page_number) {
|
||||||
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0))
|
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0))
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
|
||||||
STRING lstm_box_str("");
|
STRING lstm_box_str("");
|
||||||
|
|
||||||
int page_num = page_number;
|
int page_num = page_number;
|
||||||
bool first_word = true;
|
bool first_word = true;
|
||||||
|
int left, top, right, bottom;
|
||||||
|
|
||||||
LTRResultIterator* res_it = GetLTRIterator();
|
LTRResultIterator* res_it = GetLTRIterator();
|
||||||
while (!res_it->Empty(RIL_BLOCK)) {
|
while (!res_it->Empty(RIL_BLOCK)) {
|
||||||
if (res_it->Empty(RIL_SYMBOL)) {
|
if (res_it->Empty(RIL_SYMBOL)) {
|
||||||
res_it->Next(RIL_SYMBOL);
|
res_it->Next(RIL_SYMBOL);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
int left, top, right, bottom;
|
|
||||||
|
|
||||||
if (!first_word) {
|
if (!first_word) {
|
||||||
|
if (!(res_it->IsAtBeginningOf(RIL_TEXTLINE))) {
|
||||||
if (res_it->IsAtBeginningOf(RIL_WORD)) {
|
if (res_it->IsAtBeginningOf(RIL_WORD)) {
|
||||||
lstm_box_str.add_str_int(" ", left);
|
lstm_box_str.add_str_int(" ", left);
|
||||||
lstm_box_str.add_str_int(" ", image_height_ - bottom);
|
AddBoxToLSTM(right, bottom, top, image_height_, page_num, &lstm_box_str);
|
||||||
lstm_box_str.add_str_int(" ", right + 5);
|
|
||||||
lstm_box_str.add_str_int(" ", image_height_ - top);
|
|
||||||
lstm_box_str.add_str_int(" ", page_num); // - word
|
|
||||||
lstm_box_str += "\n"; // end of row for word
|
lstm_box_str += "\n"; // end of row for word
|
||||||
}
|
} // word
|
||||||
|
} else {
|
||||||
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
||||||
lstm_box_str.add_str_int("\t ", left);
|
lstm_box_str.add_str_int("\t ", left);
|
||||||
lstm_box_str.add_str_int(" ", image_height_ - bottom);
|
AddBoxToLSTM(right, bottom, top, image_height_, page_num, &lstm_box_str);
|
||||||
lstm_box_str.add_str_int(" ", right + 5);
|
|
||||||
lstm_box_str.add_str_int(" ", image_height_ - top);
|
|
||||||
lstm_box_str.add_str_int(" ", page_num); // - line
|
|
||||||
lstm_box_str += "\n"; // end of row for line
|
lstm_box_str += "\n"; // end of row for line
|
||||||
}
|
} // line
|
||||||
}
|
}
|
||||||
|
} // not first word
|
||||||
first_word=false;
|
first_word=false;
|
||||||
// Use bounding box for whole line for every character
|
// Use bounding box for whole line for everything
|
||||||
res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
|
res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
|
||||||
|
do { lstm_box_str +=
|
||||||
do {
|
|
||||||
lstm_box_str +=
|
|
||||||
std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
|
std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
|
||||||
res_it->Next(RIL_SYMBOL);
|
res_it->Next(RIL_SYMBOL);
|
||||||
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_SYMBOL));
|
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_SYMBOL));
|
||||||
|
|
||||||
lstm_box_str.add_str_int(" ", left);
|
lstm_box_str.add_str_int(" ", left);
|
||||||
lstm_box_str.add_str_int(" ", image_height_ - bottom);
|
AddBoxToLSTM(right, bottom, top, image_height_, page_num, &lstm_box_str);
|
||||||
lstm_box_str.add_str_int(" ", right + 5);
|
lstm_box_str += "\n"; // end of row for symbol
|
||||||
lstm_box_str.add_str_int(" ", image_height_ - top);
|
|
||||||
lstm_box_str.add_str_int(" ", page_num); // symbol
|
|
||||||
lstm_box_str += "\n"; // end of row
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
lstm_box_str.add_str_int("\t ", left);
|
||||||
|
AddBoxToLSTM(right, bottom, top, image_height_, page_num, &lstm_box_str);
|
||||||
|
lstm_box_str += "\n"; // end of PAGE
|
||||||
char* ret = new char[lstm_box_str.length() + 1];
|
char* ret = new char[lstm_box_str.length() + 1];
|
||||||
strcpy(ret, lstm_box_str.string());
|
strcpy(ret, lstm_box_str.string());
|
||||||
delete res_it;
|
delete res_it;
|
||||||
|
Loading…
Reference in New Issue
Block a user