mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-12 05:13:14 +08:00
Improved newlines and spaces in a box file so it works better with RTL languages.
This commit is contained in:
parent
6b634170c1
commit
164897210a
@ -206,12 +206,20 @@ UNICHAR::const_iterator UNICHAR::end(const char* utf8_str, const int len) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Converts a utf-8 string to a vector of unicodes.
|
// Converts a utf-8 string to a vector of unicodes.
|
||||||
void UNICHAR::UTF8ToUnicode(const char* utf8_str,
|
// Returns false if the input contains invalid UTF-8, and replaces
|
||||||
|
// the rest of the string with a single space.
|
||||||
|
bool UNICHAR::UTF8ToUnicode(const char* utf8_str,
|
||||||
GenericVector<int>* unicodes) {
|
GenericVector<int>* unicodes) {
|
||||||
const int utf8_length = strlen(utf8_str);
|
const int utf8_length = strlen(utf8_str);
|
||||||
const_iterator end_it(end(utf8_str, utf8_length));
|
const_iterator end_it(end(utf8_str, utf8_length));
|
||||||
for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) {
|
for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) {
|
||||||
unicodes->push_back(*it);
|
if (it.is_legal()) {
|
||||||
|
unicodes->push_back(*it);
|
||||||
|
} else {
|
||||||
|
unicodes->push_back(' ');
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -151,7 +151,9 @@ class UNICHAR {
|
|||||||
static const_iterator end(const char* utf8_str, const int byte_length);
|
static const_iterator end(const char* utf8_str, const int byte_length);
|
||||||
|
|
||||||
// Converts a utf-8 string to a vector of unicodes.
|
// Converts a utf-8 string to a vector of unicodes.
|
||||||
static void UTF8ToUnicode(const char* utf8_str, GenericVector<int>* unicodes);
|
// Returns false if the input contains invalid UTF-8, and replaces
|
||||||
|
// the rest of the string with a single space.
|
||||||
|
static bool UTF8ToUnicode(const char* utf8_str, GenericVector<int>* unicodes);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// A UTF-8 representation of 1 or more Unicode characters.
|
// A UTF-8 representation of 1 or more Unicode characters.
|
||||||
|
@ -23,9 +23,18 @@
|
|||||||
#include "boxchar.h"
|
#include "boxchar.h"
|
||||||
|
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
#include "fileio.h"
|
#include "fileio.h"
|
||||||
|
#include "genericvector.h"
|
||||||
#include "ndminx.h"
|
#include "ndminx.h"
|
||||||
|
#include "normstrngs.h"
|
||||||
|
#include "tprintf.h"
|
||||||
|
#include "unicharset.h"
|
||||||
|
#include "unicode/uchar.h" // from libicu
|
||||||
|
|
||||||
|
// Absolute Ratio of dx:dy or dy:dx to be a newline.
|
||||||
|
const int kMinNewlineRatio = 5;
|
||||||
|
|
||||||
namespace tesseract {
|
namespace tesseract {
|
||||||
|
|
||||||
@ -33,17 +42,14 @@ BoxChar::BoxChar(const char* utf8_str, int len) : ch_(utf8_str, len) {
|
|||||||
box_ = NULL;
|
box_ = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
BoxChar::~BoxChar() {
|
BoxChar::~BoxChar() { boxDestroy(&box_); }
|
||||||
boxDestroy(&box_);
|
|
||||||
}
|
|
||||||
|
|
||||||
void BoxChar::AddBox(int x, int y, int width, int height) {
|
void BoxChar::AddBox(int x, int y, int width, int height) {
|
||||||
box_ = boxCreate(x, y, width, height);
|
box_ = boxCreate(x, y, width, height);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* static */
|
/* static */
|
||||||
void BoxChar::TranslateBoxes(int xshift, int yshift,
|
void BoxChar::TranslateBoxes(int xshift, int yshift, vector<BoxChar*>* boxes) {
|
||||||
vector<BoxChar*>* boxes) {
|
|
||||||
for (int i = 0; i < boxes->size(); ++i) {
|
for (int i = 0; i < boxes->size(); ++i) {
|
||||||
BOX* box = (*boxes)[i]->box_;
|
BOX* box = (*boxes)[i]->box_;
|
||||||
if (box != NULL) {
|
if (box != NULL) {
|
||||||
@ -53,15 +59,218 @@ void BoxChar::TranslateBoxes(int xshift, int yshift,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Prepares for writing the boxes to a file by inserting newlines, spaces,
|
||||||
|
// and re-ordering so the boxes are strictly left-to-right.
|
||||||
|
/* static */
|
||||||
|
void BoxChar::PrepareToWrite(vector<BoxChar*>* boxes) {
|
||||||
|
bool rtl_rules = ContainsMostlyRTL(*boxes);
|
||||||
|
bool vertical_rules = MostlyVertical(*boxes);
|
||||||
|
InsertNewlines(rtl_rules, vertical_rules, boxes);
|
||||||
|
InsertSpaces(rtl_rules, vertical_rules, boxes);
|
||||||
|
for (int i = 0; i < boxes->size(); ++i) {
|
||||||
|
if ((*boxes)[i]->box_ == NULL) tprintf("Null box at index %d\n", i);
|
||||||
|
}
|
||||||
|
if (rtl_rules) {
|
||||||
|
ReorderRTLText(boxes);
|
||||||
|
}
|
||||||
|
tprintf("Rtl = %d ,vertical=%d\n", rtl_rules, vertical_rules);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Inserts newline (tab) characters into the vector at newline positions.
|
||||||
|
/* static */
|
||||||
|
void BoxChar::InsertNewlines(bool rtl_rules, bool vertical_rules,
|
||||||
|
vector<BoxChar*>* boxes) {
|
||||||
|
int prev_i = -1;
|
||||||
|
int max_shift = 0;
|
||||||
|
for (int i = 0; i < boxes->size(); ++i) {
|
||||||
|
Box* box = (*boxes)[i]->box_;
|
||||||
|
if (box == NULL) {
|
||||||
|
if (prev_i < 0 || prev_i < i - 1 || i + 1 == boxes->size()) {
|
||||||
|
// Erase null boxes at the start of a line and after another null box.
|
||||||
|
do {
|
||||||
|
delete (*boxes)[i];
|
||||||
|
boxes->erase(boxes->begin() + i);
|
||||||
|
--i;
|
||||||
|
} while (i >= 0 && i + 1 == boxes->size() && (*boxes)[i]->box_ == NULL);
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (prev_i >= 0) {
|
||||||
|
Box* prev_box = (*boxes)[prev_i]->box_;
|
||||||
|
int shift = box->x - prev_box->x;
|
||||||
|
if (vertical_rules) {
|
||||||
|
shift = box->y - prev_box->y;
|
||||||
|
} else if (rtl_rules) {
|
||||||
|
shift = -shift;
|
||||||
|
}
|
||||||
|
if (-shift > max_shift) {
|
||||||
|
// This is a newline.
|
||||||
|
int width = prev_box->w;
|
||||||
|
int height = prev_box->h;
|
||||||
|
int x = prev_box->x + width;
|
||||||
|
int y = prev_box->y;
|
||||||
|
if (vertical_rules) {
|
||||||
|
x = prev_box->x;
|
||||||
|
y = prev_box->y + height;
|
||||||
|
} else if (rtl_rules) {
|
||||||
|
x = prev_box->x - width;
|
||||||
|
if (x < 0) {
|
||||||
|
tprintf("prev x = %d, width=%d\n", prev_box->x, width);
|
||||||
|
x = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (prev_i == i - 1) {
|
||||||
|
// New character needed.
|
||||||
|
BoxChar* new_box = new BoxChar("\t", 1);
|
||||||
|
new_box->AddBox(x, y, width, height);
|
||||||
|
new_box->page_ = (*boxes)[i]->page_;
|
||||||
|
boxes->insert(boxes->begin() + i, new_box);
|
||||||
|
++i;
|
||||||
|
} else {
|
||||||
|
(*boxes)[i - 1]->AddBox(x, y, width, height);
|
||||||
|
(*boxes)[i - 1]->ch_ = "\t";
|
||||||
|
}
|
||||||
|
max_shift = 0;
|
||||||
|
} else if (shift > max_shift) {
|
||||||
|
max_shift = shift;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
prev_i = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Converts NULL boxes to space characters, with appropriate bounding boxes.
|
||||||
|
/* static */
|
||||||
|
void BoxChar::InsertSpaces(bool rtl_rules, bool vertical_rules,
|
||||||
|
vector<BoxChar*>* boxes) {
|
||||||
|
// After InsertNewlines, any remaining null boxes are not newlines, and are
|
||||||
|
// singletons, so add a box to each remaining null box.
|
||||||
|
for (int i = 1; i + 1 < boxes->size(); ++i) {
|
||||||
|
Box* box = (*boxes)[i]->box_;
|
||||||
|
if (box == NULL) {
|
||||||
|
Box* prev = (*boxes)[i - 1]->box_;
|
||||||
|
Box* next = (*boxes)[i + 1]->box_;
|
||||||
|
ASSERT_HOST(prev != NULL && next != NULL);
|
||||||
|
int top = MIN(prev->y, next->y);
|
||||||
|
int bottom = MAX(prev->y + prev->h, next->y + next->h);
|
||||||
|
int left = prev->x + prev->w;
|
||||||
|
int right = next->x;
|
||||||
|
if (vertical_rules) {
|
||||||
|
top = prev->y + prev->h;
|
||||||
|
bottom = next->y;
|
||||||
|
left = MIN(prev->x, next->x);
|
||||||
|
right = MAX(prev->x + prev->w, next->x + next->w);
|
||||||
|
} else if (rtl_rules) {
|
||||||
|
// With RTL we have to account for BiDi.
|
||||||
|
// Right becomes the min left of all prior boxes back to the first
|
||||||
|
// space or newline.
|
||||||
|
right = prev->x;
|
||||||
|
left = next->x + next->w;
|
||||||
|
for (int j = i - 2;
|
||||||
|
j >= 0 && (*boxes)[j]->ch_ != " " && (*boxes)[j]->ch_ != "\t";
|
||||||
|
--j) {
|
||||||
|
prev = (*boxes)[j]->box_;
|
||||||
|
ASSERT_HOST(prev != NULL);
|
||||||
|
if (prev->x < right) {
|
||||||
|
right = prev->x;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Left becomes the max right of all next boxes foward to the first
|
||||||
|
// space or newline.
|
||||||
|
for (int j = i + 2; j < boxes->size() && (*boxes)[j]->box_ != NULL &&
|
||||||
|
(*boxes)[j]->ch_ != "\t";
|
||||||
|
++j) {
|
||||||
|
next = (*boxes)[j]->box_;
|
||||||
|
if (next->x + next->w > left) {
|
||||||
|
left = next->x + next->w;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Italic and stylized characters can produce negative spaces, which
|
||||||
|
// Leptonica doesn't like, so clip to a positive size.
|
||||||
|
if (right <= left) right = left + 1;
|
||||||
|
if (bottom <= top) bottom = top + 1;
|
||||||
|
(*boxes)[i]->AddBox(left, top, right - left, bottom - top);
|
||||||
|
(*boxes)[i]->ch_ = " ";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reorders text in a right-to-left script in left-to-right order.
|
||||||
|
/* static */
|
||||||
|
void BoxChar::ReorderRTLText(vector<BoxChar*>* boxes) {
|
||||||
|
// After adding newlines and spaces, this task is simply a matter of sorting
|
||||||
|
// by left each group of boxes between newlines.
|
||||||
|
BoxCharPtrSort sorter;
|
||||||
|
int end = 0;
|
||||||
|
for (int start = 0; start < boxes->size(); start = end + 1) {
|
||||||
|
end = start + 1;
|
||||||
|
while (end < boxes->size() && (*boxes)[end]->ch_ != "\t") ++end;
|
||||||
|
sort(boxes->begin() + start, boxes->begin() + end, sorter);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns true if the vector contains mostly RTL characters.
|
||||||
|
/* static */
|
||||||
|
bool BoxChar::ContainsMostlyRTL(const vector<BoxChar*>& boxes) {
|
||||||
|
int num_rtl = 0, num_ltr = 0;
|
||||||
|
for (int i = 0; i < boxes.size(); ++i) {
|
||||||
|
// Convert the unichar to UTF32 representation
|
||||||
|
GenericVector<char32> uni_vector;
|
||||||
|
if (!UNICHAR::UTF8ToUnicode(boxes[i]->ch_.c_str(), &uni_vector)) {
|
||||||
|
tprintf("Illegal utf8 in boxchar %d string:%s = ", i,
|
||||||
|
boxes[i]->ch_.c_str());
|
||||||
|
for (int c = 0; c < boxes[i]->ch_.size(); ++c) {
|
||||||
|
tprintf(" 0x%x", boxes[i]->ch_[c]);
|
||||||
|
}
|
||||||
|
tprintf("\n");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
for (int j = 0; j < uni_vector.size(); ++j) {
|
||||||
|
UCharDirection dir = u_charDirection(uni_vector[j]);
|
||||||
|
if (dir == U_RIGHT_TO_LEFT || dir == U_RIGHT_TO_LEFT_ARABIC ||
|
||||||
|
dir == U_ARABIC_NUMBER) {
|
||||||
|
++num_rtl;
|
||||||
|
} else {
|
||||||
|
++num_ltr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return num_rtl > num_ltr;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns true if the text is mostly laid out vertically.
|
||||||
|
/* static */
|
||||||
|
bool BoxChar::MostlyVertical(const vector<BoxChar*>& boxes) {
|
||||||
|
inT64 total_dx = 0, total_dy = 0;
|
||||||
|
for (int i = 1; i < boxes.size(); ++i) {
|
||||||
|
if (boxes[i - 1]->box_ != NULL && boxes[i]->box_ != NULL &&
|
||||||
|
boxes[i - 1]->page_ == boxes[i]->page_) {
|
||||||
|
int dx = boxes[i]->box_->x - boxes[i - 1]->box_->x;
|
||||||
|
int dy = boxes[i]->box_->y - boxes[i - 1]->box_->y;
|
||||||
|
if (abs(dx) > abs(dy) * kMinNewlineRatio ||
|
||||||
|
abs(dy) > abs(dx) * kMinNewlineRatio) {
|
||||||
|
total_dx += dx * dx;
|
||||||
|
total_dy += dy * dy;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return total_dy > total_dx;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the total length of all the strings in the boxes.
|
||||||
|
/* static */
|
||||||
|
int BoxChar::TotalByteLength(const vector<BoxChar*>& boxes) {
|
||||||
|
int total_length = 0;
|
||||||
|
for (int i = 0; i < boxes.size(); ++i) total_length += boxes[i]->ch_.size();
|
||||||
|
return total_length;
|
||||||
|
}
|
||||||
|
|
||||||
// Rotate the boxes in [start_box, end_box) by the given rotation.
|
// Rotate the boxes in [start_box, end_box) by the given rotation.
|
||||||
// The rotation is in radians clockwise about the given center.
|
// The rotation is in radians clockwise about the given center.
|
||||||
/* static */
|
/* static */
|
||||||
void BoxChar::RotateBoxes(float rotation,
|
void BoxChar::RotateBoxes(float rotation, int xcenter, int ycenter,
|
||||||
int xcenter,
|
int start_box, int end_box, vector<BoxChar*>* boxes) {
|
||||||
int ycenter,
|
|
||||||
int start_box,
|
|
||||||
int end_box,
|
|
||||||
vector<BoxChar*>* boxes) {
|
|
||||||
Boxa* orig = boxaCreate(0);
|
Boxa* orig = boxaCreate(0);
|
||||||
for (int i = start_box; i < end_box; ++i) {
|
for (int i = start_box; i < end_box; ++i) {
|
||||||
BOX* box = (*boxes)[i]->box_;
|
BOX* box = (*boxes)[i]->box_;
|
||||||
@ -79,16 +288,6 @@ void BoxChar::RotateBoxes(float rotation,
|
|||||||
}
|
}
|
||||||
|
|
||||||
const int kMaxLineLength = 1024;
|
const int kMaxLineLength = 1024;
|
||||||
// Helper appends a tab box to the string to indicate a newline. We can't use
|
|
||||||
// an actual newline as the file format is line-based text.
|
|
||||||
static void AppendTabBox(const Box* box, int height, int page, string* output) {
|
|
||||||
char buffer[kMaxLineLength];
|
|
||||||
int nbytes = snprintf(buffer, kMaxLineLength, "\t %d %d %d %d %d\n",
|
|
||||||
box->x + box->w, height - box->y - box->h,
|
|
||||||
box->x + box->w + 10, height - box->y, page);
|
|
||||||
output->append(buffer, nbytes);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* static */
|
/* static */
|
||||||
void BoxChar::WriteTesseractBoxFile(const string& filename, int height,
|
void BoxChar::WriteTesseractBoxFile(const string& filename, int height,
|
||||||
const vector<BoxChar*>& boxes) {
|
const vector<BoxChar*>& boxes) {
|
||||||
@ -96,43 +295,15 @@ void BoxChar::WriteTesseractBoxFile(const string& filename, int height,
|
|||||||
char buffer[kMaxLineLength];
|
char buffer[kMaxLineLength];
|
||||||
for (int i = 0; i < boxes.size(); ++i) {
|
for (int i = 0; i < boxes.size(); ++i) {
|
||||||
const Box* box = boxes[i]->box_;
|
const Box* box = boxes[i]->box_;
|
||||||
if (box != NULL) {
|
if (box == NULL) {
|
||||||
if (i > 0 && boxes[i - 1]->box_ != NULL &&
|
tprintf("Error: Call PrepareToWrite before WriteTesseractBoxFile!!\n");
|
||||||
boxes[i - 1]->page_ == boxes[i]->page_ &&
|
return;
|
||||||
box->x + box->w < boxes[i - 1]->box_->x) {
|
|
||||||
// We are on a newline. Output a tab character to indicate the newline.
|
|
||||||
AppendTabBox(boxes[i - 1]->box_, height, boxes[i]->page_, &output);
|
|
||||||
}
|
|
||||||
int nbytes = snprintf(buffer, kMaxLineLength,
|
|
||||||
"%s %d %d %d %d %d\n",
|
|
||||||
boxes[i]->ch_.c_str(),
|
|
||||||
box->x, height - box->y - box->h,
|
|
||||||
box->x + box->w, height - box->y,
|
|
||||||
boxes[i]->page_);
|
|
||||||
output.append(buffer, nbytes);
|
|
||||||
} else if (i > 0 && boxes[i - 1]->box_ != NULL) {
|
|
||||||
int j = i + 1;
|
|
||||||
// Find the next non-null box, as there may be multiple spaces.
|
|
||||||
while (j < boxes.size() && boxes[j]->box_ == NULL) ++j;
|
|
||||||
if (j < boxes.size() && boxes[i - 1]->page_ == boxes[j]->page_) {
|
|
||||||
const Box* prev = boxes[i - 1]->box_;
|
|
||||||
const Box* next = boxes[j]->box_;
|
|
||||||
if (next->x + next->w < prev->x) {
|
|
||||||
// We are on a newline. Output a tab character to indicate it.
|
|
||||||
AppendTabBox(prev, height, boxes[j]->page_, &output);
|
|
||||||
} else {
|
|
||||||
// Space between words.
|
|
||||||
int nbytes = snprintf(buffer, kMaxLineLength,
|
|
||||||
" %d %d %d %d %d\n",
|
|
||||||
prev->x + prev->w,
|
|
||||||
height - MAX(prev->y + prev->h,
|
|
||||||
next->y + next->h),
|
|
||||||
next->x, height - MIN(prev->y, next->y),
|
|
||||||
boxes[i - 1]->page_);
|
|
||||||
output.append(buffer, nbytes);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
int nbytes =
|
||||||
|
snprintf(buffer, kMaxLineLength, "%s %d %d %d %d %d\n",
|
||||||
|
boxes[i]->ch_.c_str(), box->x, height - box->y - box->h,
|
||||||
|
box->x + box->w, height - box->y, boxes[i]->page_);
|
||||||
|
output.append(buffer, nbytes);
|
||||||
}
|
}
|
||||||
File::WriteStringToFileOrDie(output, filename);
|
File::WriteStringToFileOrDie(output, filename);
|
||||||
}
|
}
|
||||||
|
@ -57,9 +57,36 @@ class BoxChar {
|
|||||||
string* mutable_ch() { return &ch_; }
|
string* mutable_ch() { return &ch_; }
|
||||||
Box* mutable_box() { return box_; }
|
Box* mutable_box() { return box_; }
|
||||||
|
|
||||||
|
// Sort function for sorting by left edge of box. Note that this will not
|
||||||
|
// work properly until after InsertNewlines and InsertSpaces.
|
||||||
|
bool operator<(const BoxChar& other) const {
|
||||||
|
if (box_ == NULL) return true;
|
||||||
|
if (other.box_ == NULL) return false;
|
||||||
|
return box_->x < other.box_->x;
|
||||||
|
}
|
||||||
|
|
||||||
static void TranslateBoxes(int xshift, int yshift,
|
static void TranslateBoxes(int xshift, int yshift,
|
||||||
vector<BoxChar*>* boxes);
|
vector<BoxChar*>* boxes);
|
||||||
|
|
||||||
|
// Prepares for writing the boxes to a file by inserting newlines, spaces,
|
||||||
|
// and re-ordering so the boxes are strictly left-to-right.
|
||||||
|
static void PrepareToWrite(vector<BoxChar*>* boxes);
|
||||||
|
// Inserts newline (tab) characters into the vector at newline positions.
|
||||||
|
static void InsertNewlines(bool rtl_rules, bool vertical_rules,
|
||||||
|
vector<BoxChar*>* boxes);
|
||||||
|
// Converts NULL boxes to space characters, with appropriate bounding boxes.
|
||||||
|
static void InsertSpaces(bool rtl_rules, bool vertical_rules,
|
||||||
|
vector<BoxChar*>* boxes);
|
||||||
|
// Reorders text in a right-to-left script in left-to-right order.
|
||||||
|
static void ReorderRTLText(vector<BoxChar*>* boxes);
|
||||||
|
// Returns true if the vector contains mostly RTL characters.
|
||||||
|
static bool ContainsMostlyRTL(const vector<BoxChar*>& boxes);
|
||||||
|
// Returns true if the text is mostly laid out vertically.
|
||||||
|
static bool MostlyVertical(const vector<BoxChar*>& boxes);
|
||||||
|
|
||||||
|
// Returns the total length of all the strings in the boxes.
|
||||||
|
static int TotalByteLength(const vector<BoxChar*>& boxes);
|
||||||
|
|
||||||
// Rotate the vector of boxes between start and end by the given rotation.
|
// Rotate the vector of boxes between start and end by the given rotation.
|
||||||
// The rotation is in radians clockwise about the given center.
|
// The rotation is in radians clockwise about the given center.
|
||||||
static void RotateBoxes(float rotation,
|
static void RotateBoxes(float rotation,
|
||||||
@ -79,6 +106,14 @@ class BoxChar {
|
|||||||
Box* box_;
|
Box* box_;
|
||||||
int page_;
|
int page_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Sort predicate to sort a vector of BoxChar*.
|
||||||
|
struct BoxCharPtrSort {
|
||||||
|
bool operator()(const BoxChar* box1, const BoxChar* box2) const {
|
||||||
|
return *box1 < *box2;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
} // namespace tesseract
|
} // namespace tesseract
|
||||||
|
|
||||||
#endif // TESSERACT_TRAINING_BOXCHAR_H_
|
#endif // TESSERACT_TRAINING_BOXCHAR_H_
|
||||||
|
@ -330,7 +330,8 @@ void StringRenderer::ClearBoxes() {
|
|||||||
boxaDestroy(&page_boxes_);
|
boxaDestroy(&page_boxes_);
|
||||||
}
|
}
|
||||||
|
|
||||||
void StringRenderer::WriteAllBoxes(const string& filename) const {
|
void StringRenderer::WriteAllBoxes(const string& filename) {
|
||||||
|
BoxChar::PrepareToWrite(&boxchars_);
|
||||||
BoxChar::WriteTesseractBoxFile(filename, page_height_, boxchars_);
|
BoxChar::WriteTesseractBoxFile(filename, page_height_, boxchars_);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -148,7 +148,7 @@ class StringRenderer {
|
|||||||
void RotatePageBoxes(float rotation);
|
void RotatePageBoxes(float rotation);
|
||||||
// Delete all boxes.
|
// Delete all boxes.
|
||||||
void ClearBoxes();
|
void ClearBoxes();
|
||||||
void WriteAllBoxes(const string& filename) const;
|
void WriteAllBoxes(const string& filename);
|
||||||
// Removes space-delimited words from the string that are not renderable by
|
// Removes space-delimited words from the string that are not renderable by
|
||||||
// the current font and returns the count of such words.
|
// the current font and returns the count of such words.
|
||||||
int StripUnrenderableWords(string* utf8_text) const;
|
int StripUnrenderableWords(string* utf8_text) const;
|
||||||
|
Loading…
Reference in New Issue
Block a user