/**********************************************************************
 * File:        boxchar.h
 * Description: Simple class to associate a Tesseract classification unit with
 *              its bounding box so that the boxes can be rotated as the image
 *              is rotated for degradation. Also includes routines to output
 *              the character-tagged boxes to a boxfile.
 * Author:      Ray Smith
 * Created:     Mon Nov 18 2013
 *
 * (C) Copyright 2013, Google Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **********************************************************************/
|
|
|
|
|
|
|
|
#ifndef TESSERACT_TRAINING_BOXCHAR_H_
|
|
|
|
#define TESSERACT_TRAINING_BOXCHAR_H_
|
|
|
|
|
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include "allheaders.h" // from Leptonica
|
2017-07-20 07:42:48 +08:00
|
|
|
#include "platform.h"
|
2014-01-10 02:04:20 +08:00
|
|
|
|
|
|
|
struct Box;
|
|
|
|
|
|
|
|
namespace tesseract {
|
|
|
|
|
|
|
|
class BoxChar {
|
|
|
|
public:
|
|
|
|
BoxChar(const char* utf8_str, int len);
|
|
|
|
|
|
|
|
~BoxChar();
|
|
|
|
|
|
|
|
// Accessors.
|
2018-03-03 21:36:28 +08:00
|
|
|
const std::string& ch() const { return ch_; }
|
2014-01-10 02:04:20 +08:00
|
|
|
const Box* box() const { return box_; }
|
|
|
|
const int& page() const { return page_; }
|
2017-09-08 18:55:11 +08:00
|
|
|
void set_rtl_index(int index) { rtl_index_ = index; }
|
|
|
|
const int& rtl_index() const { return rtl_index_; }
|
2014-01-10 02:04:20 +08:00
|
|
|
|
|
|
|
// Set the box_ member.
|
|
|
|
void AddBox(int x, int y, int width, int height);
|
|
|
|
|
|
|
|
void set_page(int page) { page_ = page; }
|
|
|
|
|
2018-03-03 21:36:28 +08:00
|
|
|
std::string* mutable_ch() { return &ch_; }
|
2014-01-10 02:04:20 +08:00
|
|
|
Box* mutable_box() { return box_; }
|
|
|
|
|
2015-05-13 08:51:03 +08:00
|
|
|
// Sort function for sorting by left edge of box. Note that this will not
|
|
|
|
// work properly until after InsertNewlines and InsertSpaces.
|
|
|
|
bool operator<(const BoxChar& other) const {
|
2016-12-13 15:08:01 +08:00
|
|
|
if (box_ == nullptr) return true;
|
|
|
|
if (other.box_ == nullptr) return false;
|
2015-05-13 08:51:03 +08:00
|
|
|
return box_->x < other.box_->x;
|
|
|
|
}
|
2017-09-08 18:55:11 +08:00
|
|
|
// Increments *num_rtl and *num_ltr according to the directionality of
|
|
|
|
// characters in the box.
|
|
|
|
void GetDirection(int* num_rtl, int* num_ltr) const;
|
|
|
|
// Reverses the order of unicodes within the box. If Pango generates a
|
|
|
|
// ligature, these will get reversed on output, so reverse now.
|
|
|
|
void ReverseUnicodesInBox();
|
2015-05-13 08:51:03 +08:00
|
|
|
|
2014-01-10 02:04:20 +08:00
|
|
|
static void TranslateBoxes(int xshift, int yshift,
|
2017-04-28 08:15:35 +08:00
|
|
|
std::vector<BoxChar*>* boxes);
|
2014-01-10 02:04:20 +08:00
|
|
|
|
2015-05-13 08:51:03 +08:00
|
|
|
// Prepares for writing the boxes to a file by inserting newlines, spaces,
|
|
|
|
// and re-ordering so the boxes are strictly left-to-right.
|
2017-04-28 08:15:35 +08:00
|
|
|
static void PrepareToWrite(std::vector<BoxChar*>* boxes);
|
2015-05-13 08:51:03 +08:00
|
|
|
// Inserts newline (tab) characters into the vector at newline positions.
|
|
|
|
static void InsertNewlines(bool rtl_rules, bool vertical_rules,
|
2017-04-28 08:15:35 +08:00
|
|
|
std::vector<BoxChar*>* boxes);
|
2017-01-26 08:20:19 +08:00
|
|
|
// Converts nullptr boxes to space characters, with appropriate bounding
|
|
|
|
// boxes.
|
2015-05-13 08:51:03 +08:00
|
|
|
static void InsertSpaces(bool rtl_rules, bool vertical_rules,
|
2017-04-28 08:15:35 +08:00
|
|
|
std::vector<BoxChar*>* boxes);
|
2015-05-13 08:51:03 +08:00
|
|
|
// Reorders text in a right-to-left script in left-to-right order.
|
2017-04-28 08:15:35 +08:00
|
|
|
static void ReorderRTLText(std::vector<BoxChar*>* boxes);
|
2015-05-13 08:51:03 +08:00
|
|
|
// Returns true if the vector contains mostly RTL characters.
|
2017-04-28 08:15:35 +08:00
|
|
|
static bool ContainsMostlyRTL(const std::vector<BoxChar*>& boxes);
|
2015-05-13 08:51:03 +08:00
|
|
|
// Returns true if the text is mostly laid out vertically.
|
2017-04-28 08:15:35 +08:00
|
|
|
static bool MostlyVertical(const std::vector<BoxChar*>& boxes);
|
2015-05-13 08:51:03 +08:00
|
|
|
|
|
|
|
// Returns the total length of all the strings in the boxes.
|
2017-04-28 08:15:35 +08:00
|
|
|
static int TotalByteLength(const std::vector<BoxChar*>& boxes);
|
2015-05-13 08:51:03 +08:00
|
|
|
|
2014-01-10 02:04:20 +08:00
|
|
|
// Rotate the vector of boxes between start and end by the given rotation.
|
|
|
|
// The rotation is in radians clockwise about the given center.
|
|
|
|
static void RotateBoxes(float rotation,
|
|
|
|
int xcenter,
|
|
|
|
int ycenter,
|
|
|
|
int start_box,
|
|
|
|
int end_box,
|
2017-04-28 08:15:35 +08:00
|
|
|
std::vector<BoxChar*>* boxes);
|
2014-01-10 02:04:20 +08:00
|
|
|
|
|
|
|
// Create a tesseract box file from the vector of boxes. The image height
|
|
|
|
// is needed to convert to tesseract coordinates.
|
2018-03-03 21:36:28 +08:00
|
|
|
static void WriteTesseractBoxFile(const std::string& name, int height,
|
2017-04-28 08:15:35 +08:00
|
|
|
const std::vector<BoxChar*>& boxes);
|
2016-11-08 03:56:07 +08:00
|
|
|
// Gets the tesseract box file as a string from the vector of boxes.
|
|
|
|
// The image height is needed to convert to tesseract coordinates.
|
2018-03-03 21:36:28 +08:00
|
|
|
static std::string GetTesseractBoxStr(int height,
|
|
|
|
const std::vector<BoxChar*>& boxes);
|
2014-01-10 02:04:20 +08:00
|
|
|
|
|
|
|
private:
|
2018-03-03 21:36:28 +08:00
|
|
|
std::string ch_;
|
2014-01-10 02:04:20 +08:00
|
|
|
Box* box_;
|
|
|
|
int page_;
|
2017-09-08 18:55:11 +08:00
|
|
|
// If the box is an RTL character, contains the original position in the
|
|
|
|
// array of boxes (before reversal), otherwise -1.
|
|
|
|
int rtl_index_;
|
2014-01-10 02:04:20 +08:00
|
|
|
};
|
2015-05-13 08:51:03 +08:00
|
|
|
|
|
|
|
// Sort predicate to sort a vector of BoxChar*.
|
|
|
|
struct BoxCharPtrSort {
|
|
|
|
bool operator()(const BoxChar* box1, const BoxChar* box2) const {
|
2017-09-08 18:55:11 +08:00
|
|
|
if (box1->rtl_index() >= 0 && box2->rtl_index() >= 0)
|
|
|
|
return box2->rtl_index() < box1->rtl_index();
|
2015-05-13 08:51:03 +08:00
|
|
|
return *box1 < *box2;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2014-01-10 02:04:20 +08:00
|
|
|
} // namespace tesseract
|
|
|
|
|
|
|
|
#endif // TESSERACT_TRAINING_BOXCHAR_H_
|