mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-23 10:56:19 +08:00
ccmain/paragraphs: Make local function UnicodeFor and fix signed/unsigned
Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
parent
4c36e2e29a
commit
a9c3f6d87f
@ -73,7 +73,7 @@ static int Epsilon(int space_pix) {
|
||||
static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *function_name,
|
||||
const std::vector<RowScratchRegisters> *rows, int row_start,
|
||||
int row_end) {
|
||||
if (row_start < 0 || row_end > rows->size() || row_start > row_end) {
|
||||
if (row_start < 0 || static_cast<size_t>(row_end) > rows->size() || row_start > row_end) {
|
||||
tprintf("Invalid arguments rows[%d, %d) while rows is of size %zu.\n", row_start, row_end,
|
||||
rows->size());
|
||||
return false;
|
||||
@ -94,8 +94,8 @@ static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *fun
|
||||
static void PrintTable(const std::vector<std::vector<std::string>> &rows, const char *colsep) {
|
||||
std::vector<int> max_col_widths;
|
||||
for (const auto &row : rows) {
|
||||
int num_columns = row.size();
|
||||
for (int c = 0; c < num_columns; c++) {
|
||||
auto num_columns = row.size();
|
||||
for (size_t c = 0; c < num_columns; c++) {
|
||||
int num_unicodes = 0;
|
||||
for (char i : row[c]) {
|
||||
if ((i & 0xC0) != 0x80) {
|
||||
@ -285,7 +285,7 @@ bool AsciiLikelyListItem(const std::string &word) {
|
||||
// ========== Brain Dead Language Model (Tesseract Version) ================
|
||||
|
||||
// Return the first Unicode Codepoint from werd[pos].
|
||||
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) {
|
||||
static int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, unsigned pos) {
|
||||
if (!u || !werd || pos > werd->length()) {
|
||||
return 0;
|
||||
}
|
||||
@ -297,33 +297,32 @@ int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) {
|
||||
class UnicodeSpanSkipper {
|
||||
public:
|
||||
UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
|
||||
: u_(unicharset), word_(word) {
|
||||
wordlen_ = word->length();
|
||||
: u_(unicharset), word_(word), wordlen_(word->length()) {
|
||||
}
|
||||
|
||||
// Given an input position, return the first position >= pos not punc.
|
||||
int SkipPunc(int pos);
|
||||
unsigned SkipPunc(unsigned pos);
|
||||
// Given an input position, return the first position >= pos not digit.
|
||||
int SkipDigits(int pos);
|
||||
unsigned SkipDigits(unsigned pos);
|
||||
// Given an input position, return the first position >= pos not roman.
|
||||
int SkipRomans(int pos);
|
||||
unsigned SkipRomans(unsigned pos);
|
||||
// Given an input position, return the first position >= pos not alpha.
|
||||
int SkipAlpha(int pos);
|
||||
unsigned SkipAlpha(unsigned pos);
|
||||
|
||||
private:
|
||||
const UNICHARSET *u_;
|
||||
const WERD_CHOICE *word_;
|
||||
int wordlen_;
|
||||
unsigned wordlen_;
|
||||
};
|
||||
|
||||
int UnicodeSpanSkipper::SkipPunc(int pos) {
|
||||
unsigned UnicodeSpanSkipper::SkipPunc(unsigned pos) {
|
||||
while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) {
|
||||
pos++;
|
||||
}
|
||||
return pos;
|
||||
}
|
||||
|
||||
int UnicodeSpanSkipper::SkipDigits(int pos) {
|
||||
unsigned UnicodeSpanSkipper::SkipDigits(unsigned pos) {
|
||||
while (pos < wordlen_ &&
|
||||
(u_->get_isdigit(word_->unichar_id(pos)) || IsDigitLike(UnicodeFor(u_, word_, pos)))) {
|
||||
pos++;
|
||||
@ -331,7 +330,7 @@ int UnicodeSpanSkipper::SkipDigits(int pos) {
|
||||
return pos;
|
||||
}
|
||||
|
||||
int UnicodeSpanSkipper::SkipRomans(int pos) {
|
||||
unsigned UnicodeSpanSkipper::SkipRomans(unsigned pos) {
|
||||
const char *kRomans = "ivxlmdIVXLMD";
|
||||
while (pos < wordlen_) {
|
||||
int ch = UnicodeFor(u_, word_, pos);
|
||||
@ -343,7 +342,7 @@ int UnicodeSpanSkipper::SkipRomans(int pos) {
|
||||
return pos;
|
||||
}
|
||||
|
||||
int UnicodeSpanSkipper::SkipAlpha(int pos) {
|
||||
unsigned UnicodeSpanSkipper::SkipAlpha(unsigned pos) {
|
||||
while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) {
|
||||
pos++;
|
||||
}
|
||||
@ -386,13 +385,13 @@ static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
|
||||
|
||||
UnicodeSpanSkipper m(u, werd);
|
||||
int num_segments = 0;
|
||||
int pos = 0;
|
||||
unsigned pos = 0;
|
||||
while (pos < werd->length() && num_segments < 3) {
|
||||
int numeral_start = m.SkipPunc(pos);
|
||||
auto numeral_start = m.SkipPunc(pos);
|
||||
if (numeral_start > pos + 1) {
|
||||
break;
|
||||
}
|
||||
int numeral_end = m.SkipRomans(numeral_start);
|
||||
auto numeral_end = m.SkipRomans(numeral_start);
|
||||
if (numeral_end == numeral_start) {
|
||||
numeral_end = m.SkipDigits(numeral_start);
|
||||
if (numeral_end == numeral_start) {
|
||||
@ -2353,7 +2352,7 @@ void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
|
||||
LeftoverSegments(rows, &leftovers2, leftover.begin, leftover.end);
|
||||
bool pass2a_was_useful =
|
||||
leftovers2.size() > 1 ||
|
||||
(leftovers2.size() == 1 && (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size()));
|
||||
(leftovers2.size() == 1 && (leftovers2[0].begin != 0 || static_cast<size_t>(leftovers2[0].end) != rows.size()));
|
||||
if (pass2a_was_useful) {
|
||||
for (auto &leftover2 : leftovers2) {
|
||||
StrongEvidenceClassify(debug_level, &rows, leftover2.begin, leftover2.end, &theory);
|
||||
|
@ -34,9 +34,6 @@ class WERD_CHOICE;
|
||||
TESS_API
|
||||
bool AsciiLikelyListItem(const std::string &word);
|
||||
|
||||
// Return the first Unicode Codepoint from werd[pos].
|
||||
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
|
||||
|
||||
// Set right word attributes given either a unicharset and werd or a utf8
|
||||
// string.
|
||||
TESS_API
|
||||
|
Loading…
Reference in New Issue
Block a user