diff --git a/ccutil/unichar.cpp b/ccutil/unichar.cpp index b75dff6d..0ceced13 100644 --- a/ccutil/unichar.cpp +++ b/ccutil/unichar.cpp @@ -206,12 +206,20 @@ UNICHAR::const_iterator UNICHAR::end(const char* utf8_str, const int len) { } // Converts a utf-8 string to a vector of unicodes. -void UNICHAR::UTF8ToUnicode(const char* utf8_str, +// Returns false if the input contains invalid UTF-8, and replaces +// the rest of the string with a single space. +bool UNICHAR::UTF8ToUnicode(const char* utf8_str, GenericVector* unicodes) { const int utf8_length = strlen(utf8_str); const_iterator end_it(end(utf8_str, utf8_length)); for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) { - unicodes->push_back(*it); + if (it.is_legal()) { + unicodes->push_back(*it); + } else { + unicodes->push_back(' '); + return false; + } } + return true; } diff --git a/ccutil/unichar.h b/ccutil/unichar.h index 7e5cd9fb..b2a1e013 100644 --- a/ccutil/unichar.h +++ b/ccutil/unichar.h @@ -151,7 +151,9 @@ class UNICHAR { static const_iterator end(const char* utf8_str, const int byte_length); // Converts a utf-8 string to a vector of unicodes. - static void UTF8ToUnicode(const char* utf8_str, GenericVector* unicodes); + // Returns false if the input contains invalid UTF-8, and replaces + // the rest of the string with a single space. + static bool UTF8ToUnicode(const char* utf8_str, GenericVector* unicodes); private: // A UTF-8 representation of 1 or more Unicode characters. diff --git a/training/boxchar.cpp b/training/boxchar.cpp index b3b71732..276d8af6 100644 --- a/training/boxchar.cpp +++ b/training/boxchar.cpp @@ -23,9 +23,18 @@ #include "boxchar.h" #include +#include #include "fileio.h" +#include "genericvector.h" #include "ndminx.h" +#include "normstrngs.h" +#include "tprintf.h" +#include "unicharset.h" +#include "unicode/uchar.h" // from libicu + +// Absolute Ratio of dx:dy or dy:dx to be a newline. +const int kMinNewlineRatio = 5; namespace tesseract { @@ -33,17 +42,14 @@ BoxChar::BoxChar(const char* utf8_str, int len) : ch_(utf8_str, len) { box_ = NULL; } -BoxChar::~BoxChar() { - boxDestroy(&box_); -} +BoxChar::~BoxChar() { boxDestroy(&box_); } void BoxChar::AddBox(int x, int y, int width, int height) { box_ = boxCreate(x, y, width, height); } /* static */ -void BoxChar::TranslateBoxes(int xshift, int yshift, - vector* boxes) { +void BoxChar::TranslateBoxes(int xshift, int yshift, vector* boxes) { for (int i = 0; i < boxes->size(); ++i) { BOX* box = (*boxes)[i]->box_; if (box != NULL) { @@ -53,15 +59,218 @@ void BoxChar::TranslateBoxes(int xshift, int yshift, } } +// Prepares for writing the boxes to a file by inserting newlines, spaces, +// and re-ordering so the boxes are strictly left-to-right. +/* static */ +void BoxChar::PrepareToWrite(vector* boxes) { + bool rtl_rules = ContainsMostlyRTL(*boxes); + bool vertical_rules = MostlyVertical(*boxes); + InsertNewlines(rtl_rules, vertical_rules, boxes); + InsertSpaces(rtl_rules, vertical_rules, boxes); + for (int i = 0; i < boxes->size(); ++i) { + if ((*boxes)[i]->box_ == NULL) tprintf("Null box at index %d\n", i); + } + if (rtl_rules) { + ReorderRTLText(boxes); + } + tprintf("Rtl = %d ,vertical=%d\n", rtl_rules, vertical_rules); +} + +// Inserts newline (tab) characters into the vector at newline positions. +/* static */ +void BoxChar::InsertNewlines(bool rtl_rules, bool vertical_rules, + vector* boxes) { + int prev_i = -1; + int max_shift = 0; + for (int i = 0; i < boxes->size(); ++i) { + Box* box = (*boxes)[i]->box_; + if (box == NULL) { + if (prev_i < 0 || prev_i < i - 1 || i + 1 == boxes->size()) { + // Erase null boxes at the start of a line and after another null box. + do { + delete (*boxes)[i]; + boxes->erase(boxes->begin() + i); + --i; + } while (i >= 0 && i + 1 == boxes->size() && (*boxes)[i]->box_ == NULL); + } + continue; + } + if (prev_i >= 0) { + Box* prev_box = (*boxes)[prev_i]->box_; + int shift = box->x - prev_box->x; + if (vertical_rules) { + shift = box->y - prev_box->y; + } else if (rtl_rules) { + shift = -shift; + } + if (-shift > max_shift) { + // This is a newline. + int width = prev_box->w; + int height = prev_box->h; + int x = prev_box->x + width; + int y = prev_box->y; + if (vertical_rules) { + x = prev_box->x; + y = prev_box->y + height; + } else if (rtl_rules) { + x = prev_box->x - width; + if (x < 0) { + tprintf("prev x = %d, width=%d\n", prev_box->x, width); + x = 0; + } + } + if (prev_i == i - 1) { + // New character needed. + BoxChar* new_box = new BoxChar("\t", 1); + new_box->AddBox(x, y, width, height); + new_box->page_ = (*boxes)[i]->page_; + boxes->insert(boxes->begin() + i, new_box); + ++i; + } else { + (*boxes)[i - 1]->AddBox(x, y, width, height); + (*boxes)[i - 1]->ch_ = "\t"; + } + max_shift = 0; + } else if (shift > max_shift) { + max_shift = shift; + } + } + prev_i = i; + } +} + +// Converts NULL boxes to space characters, with appropriate bounding boxes. +/* static */ +void BoxChar::InsertSpaces(bool rtl_rules, bool vertical_rules, + vector* boxes) { + // After InsertNewlines, any remaining null boxes are not newlines, and are + // singletons, so add a box to each remaining null box. + for (int i = 1; i + 1 < boxes->size(); ++i) { + Box* box = (*boxes)[i]->box_; + if (box == NULL) { + Box* prev = (*boxes)[i - 1]->box_; + Box* next = (*boxes)[i + 1]->box_; + ASSERT_HOST(prev != NULL && next != NULL); + int top = MIN(prev->y, next->y); + int bottom = MAX(prev->y + prev->h, next->y + next->h); + int left = prev->x + prev->w; + int right = next->x; + if (vertical_rules) { + top = prev->y + prev->h; + bottom = next->y; + left = MIN(prev->x, next->x); + right = MAX(prev->x + prev->w, next->x + next->w); + } else if (rtl_rules) { + // With RTL we have to account for BiDi. + // Right becomes the min left of all prior boxes back to the first + // space or newline. + right = prev->x; + left = next->x + next->w; + for (int j = i - 2; + j >= 0 && (*boxes)[j]->ch_ != " " && (*boxes)[j]->ch_ != "\t"; + --j) { + prev = (*boxes)[j]->box_; + ASSERT_HOST(prev != NULL); + if (prev->x < right) { + right = prev->x; + } + } + // Left becomes the max right of all next boxes foward to the first + // space or newline. + for (int j = i + 2; j < boxes->size() && (*boxes)[j]->box_ != NULL && + (*boxes)[j]->ch_ != "\t"; + ++j) { + next = (*boxes)[j]->box_; + if (next->x + next->w > left) { + left = next->x + next->w; + } + } + } + // Italic and stylized characters can produce negative spaces, which + // Leptonica doesn't like, so clip to a positive size. + if (right <= left) right = left + 1; + if (bottom <= top) bottom = top + 1; + (*boxes)[i]->AddBox(left, top, right - left, bottom - top); + (*boxes)[i]->ch_ = " "; + } + } +} + +// Reorders text in a right-to-left script in left-to-right order. +/* static */ +void BoxChar::ReorderRTLText(vector* boxes) { + // After adding newlines and spaces, this task is simply a matter of sorting + // by left each group of boxes between newlines. + BoxCharPtrSort sorter; + int end = 0; + for (int start = 0; start < boxes->size(); start = end + 1) { + end = start + 1; + while (end < boxes->size() && (*boxes)[end]->ch_ != "\t") ++end; + sort(boxes->begin() + start, boxes->begin() + end, sorter); + } +} + +// Returns true if the vector contains mostly RTL characters. +/* static */ +bool BoxChar::ContainsMostlyRTL(const vector& boxes) { + int num_rtl = 0, num_ltr = 0; + for (int i = 0; i < boxes.size(); ++i) { + // Convert the unichar to UTF32 representation + GenericVector uni_vector; + if (!UNICHAR::UTF8ToUnicode(boxes[i]->ch_.c_str(), &uni_vector)) { + tprintf("Illegal utf8 in boxchar %d string:%s = ", i, + boxes[i]->ch_.c_str()); + for (int c = 0; c < boxes[i]->ch_.size(); ++c) { + tprintf(" 0x%x", boxes[i]->ch_[c]); + } + tprintf("\n"); + continue; + } + for (int j = 0; j < uni_vector.size(); ++j) { + UCharDirection dir = u_charDirection(uni_vector[j]); + if (dir == U_RIGHT_TO_LEFT || dir == U_RIGHT_TO_LEFT_ARABIC || + dir == U_ARABIC_NUMBER) { + ++num_rtl; + } else { + ++num_ltr; + } + } + } + return num_rtl > num_ltr; +} + +// Returns true if the text is mostly laid out vertically. +/* static */ +bool BoxChar::MostlyVertical(const vector& boxes) { + inT64 total_dx = 0, total_dy = 0; + for (int i = 1; i < boxes.size(); ++i) { + if (boxes[i - 1]->box_ != NULL && boxes[i]->box_ != NULL && + boxes[i - 1]->page_ == boxes[i]->page_) { + int dx = boxes[i]->box_->x - boxes[i - 1]->box_->x; + int dy = boxes[i]->box_->y - boxes[i - 1]->box_->y; + if (abs(dx) > abs(dy) * kMinNewlineRatio || + abs(dy) > abs(dx) * kMinNewlineRatio) { + total_dx += dx * dx; + total_dy += dy * dy; + } + } + } + return total_dy > total_dx; +} + +// Returns the total length of all the strings in the boxes. +/* static */ +int BoxChar::TotalByteLength(const vector& boxes) { + int total_length = 0; + for (int i = 0; i < boxes.size(); ++i) total_length += boxes[i]->ch_.size(); + return total_length; +} + // Rotate the boxes in [start_box, end_box) by the given rotation. // The rotation is in radians clockwise about the given center. /* static */ -void BoxChar::RotateBoxes(float rotation, - int xcenter, - int ycenter, - int start_box, - int end_box, - vector* boxes) { +void BoxChar::RotateBoxes(float rotation, int xcenter, int ycenter, + int start_box, int end_box, vector* boxes) { Boxa* orig = boxaCreate(0); for (int i = start_box; i < end_box; ++i) { BOX* box = (*boxes)[i]->box_; @@ -79,16 +288,6 @@ void BoxChar::RotateBoxes(float rotation, } const int kMaxLineLength = 1024; -// Helper appends a tab box to the string to indicate a newline. We can't use -// an actual newline as the file format is line-based text. -static void AppendTabBox(const Box* box, int height, int page, string* output) { - char buffer[kMaxLineLength]; - int nbytes = snprintf(buffer, kMaxLineLength, "\t %d %d %d %d %d\n", - box->x + box->w, height - box->y - box->h, - box->x + box->w + 10, height - box->y, page); - output->append(buffer, nbytes); -} - /* static */ void BoxChar::WriteTesseractBoxFile(const string& filename, int height, const vector& boxes) { @@ -96,43 +295,15 @@ void BoxChar::WriteTesseractBoxFile(const string& filename, int height, char buffer[kMaxLineLength]; for (int i = 0; i < boxes.size(); ++i) { const Box* box = boxes[i]->box_; - if (box != NULL) { - if (i > 0 && boxes[i - 1]->box_ != NULL && - boxes[i - 1]->page_ == boxes[i]->page_ && - box->x + box->w < boxes[i - 1]->box_->x) { - // We are on a newline. Output a tab character to indicate the newline. - AppendTabBox(boxes[i - 1]->box_, height, boxes[i]->page_, &output); - } - int nbytes = snprintf(buffer, kMaxLineLength, - "%s %d %d %d %d %d\n", - boxes[i]->ch_.c_str(), - box->x, height - box->y - box->h, - box->x + box->w, height - box->y, - boxes[i]->page_); - output.append(buffer, nbytes); - } else if (i > 0 && boxes[i - 1]->box_ != NULL) { - int j = i + 1; - // Find the next non-null box, as there may be multiple spaces. - while (j < boxes.size() && boxes[j]->box_ == NULL) ++j; - if (j < boxes.size() && boxes[i - 1]->page_ == boxes[j]->page_) { - const Box* prev = boxes[i - 1]->box_; - const Box* next = boxes[j]->box_; - if (next->x + next->w < prev->x) { - // We are on a newline. Output a tab character to indicate it. - AppendTabBox(prev, height, boxes[j]->page_, &output); - } else { - // Space between words. - int nbytes = snprintf(buffer, kMaxLineLength, - " %d %d %d %d %d\n", - prev->x + prev->w, - height - MAX(prev->y + prev->h, - next->y + next->h), - next->x, height - MIN(prev->y, next->y), - boxes[i - 1]->page_); - output.append(buffer, nbytes); - } - } + if (box == NULL) { + tprintf("Error: Call PrepareToWrite before WriteTesseractBoxFile!!\n"); + return; } + int nbytes = + snprintf(buffer, kMaxLineLength, "%s %d %d %d %d %d\n", + boxes[i]->ch_.c_str(), box->x, height - box->y - box->h, + box->x + box->w, height - box->y, boxes[i]->page_); + output.append(buffer, nbytes); } File::WriteStringToFileOrDie(output, filename); } diff --git a/training/boxchar.h b/training/boxchar.h index bcb102ce..27b568a1 100644 --- a/training/boxchar.h +++ b/training/boxchar.h @@ -57,9 +57,36 @@ class BoxChar { string* mutable_ch() { return &ch_; } Box* mutable_box() { return box_; } + // Sort function for sorting by left edge of box. Note that this will not + // work properly until after InsertNewlines and InsertSpaces. + bool operator<(const BoxChar& other) const { + if (box_ == NULL) return true; + if (other.box_ == NULL) return false; + return box_->x < other.box_->x; + } + static void TranslateBoxes(int xshift, int yshift, vector* boxes); + // Prepares for writing the boxes to a file by inserting newlines, spaces, + // and re-ordering so the boxes are strictly left-to-right. + static void PrepareToWrite(vector* boxes); + // Inserts newline (tab) characters into the vector at newline positions. + static void InsertNewlines(bool rtl_rules, bool vertical_rules, + vector* boxes); + // Converts NULL boxes to space characters, with appropriate bounding boxes. + static void InsertSpaces(bool rtl_rules, bool vertical_rules, + vector* boxes); + // Reorders text in a right-to-left script in left-to-right order. + static void ReorderRTLText(vector* boxes); + // Returns true if the vector contains mostly RTL characters. + static bool ContainsMostlyRTL(const vector& boxes); + // Returns true if the text is mostly laid out vertically. + static bool MostlyVertical(const vector& boxes); + + // Returns the total length of all the strings in the boxes. + static int TotalByteLength(const vector& boxes); + // Rotate the vector of boxes between start and end by the given rotation. // The rotation is in radians clockwise about the given center. static void RotateBoxes(float rotation, @@ -79,6 +106,14 @@ class BoxChar { Box* box_; int page_; }; + +// Sort predicate to sort a vector of BoxChar*. +struct BoxCharPtrSort { + bool operator()(const BoxChar* box1, const BoxChar* box2) const { + return *box1 < *box2; + } +}; + } // namespace tesseract #endif // TESSERACT_TRAINING_BOXCHAR_H_ diff --git a/training/stringrenderer.cpp b/training/stringrenderer.cpp index a448b92e..8bef6699 100644 --- a/training/stringrenderer.cpp +++ b/training/stringrenderer.cpp @@ -330,7 +330,8 @@ void StringRenderer::ClearBoxes() { boxaDestroy(&page_boxes_); } -void StringRenderer::WriteAllBoxes(const string& filename) const { +void StringRenderer::WriteAllBoxes(const string& filename) { + BoxChar::PrepareToWrite(&boxchars_); BoxChar::WriteTesseractBoxFile(filename, page_height_, boxchars_); } diff --git a/training/stringrenderer.h b/training/stringrenderer.h index 1fd62d4d..d96e572e 100644 --- a/training/stringrenderer.h +++ b/training/stringrenderer.h @@ -148,7 +148,7 @@ class StringRenderer { void RotatePageBoxes(float rotation); // Delete all boxes. void ClearBoxes(); - void WriteAllBoxes(const string& filename) const; + void WriteAllBoxes(const string& filename); // Removes space-delimited words from the string that are not renderable by // the current font and returns the count of such words. int StripUnrenderableWords(string* utf8_text) const;