ccmain/paragraphs: Make UnicodeFor a file-local (static) function and fix signed/unsigned comparisons

Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
Stefan Weil 2021-10-09 20:32:53 +02:00
parent 4c36e2e29a
commit a9c3f6d87f
2 changed files with 18 additions and 22 deletions

View File

@ -73,7 +73,7 @@ static int Epsilon(int space_pix) {
static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *function_name,
const std::vector<RowScratchRegisters> *rows, int row_start,
int row_end) {
if (row_start < 0 || row_end > rows->size() || row_start > row_end) {
if (row_start < 0 || static_cast<size_t>(row_end) > rows->size() || row_start > row_end) {
tprintf("Invalid arguments rows[%d, %d) while rows is of size %zu.\n", row_start, row_end,
rows->size());
return false;
@ -94,8 +94,8 @@ static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *fun
static void PrintTable(const std::vector<std::vector<std::string>> &rows, const char *colsep) {
std::vector<int> max_col_widths;
for (const auto &row : rows) {
int num_columns = row.size();
for (int c = 0; c < num_columns; c++) {
auto num_columns = row.size();
for (size_t c = 0; c < num_columns; c++) {
int num_unicodes = 0;
for (char i : row[c]) {
if ((i & 0xC0) != 0x80) {
@ -285,7 +285,7 @@ bool AsciiLikelyListItem(const std::string &word) {
// ========== Brain Dead Language Model (Tesseract Version) ================
// Return the first Unicode Codepoint from werd[pos].
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) {
static int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, unsigned pos) {
if (!u || !werd || pos > werd->length()) {
return 0;
}
@ -297,33 +297,32 @@ int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) {
class UnicodeSpanSkipper {
public:
UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
: u_(unicharset), word_(word) {
wordlen_ = word->length();
: u_(unicharset), word_(word), wordlen_(word->length()) {
}
// Given an input position, return the first position >= pos not punc.
int SkipPunc(int pos);
unsigned SkipPunc(unsigned pos);
// Given an input position, return the first position >= pos not digit.
int SkipDigits(int pos);
unsigned SkipDigits(unsigned pos);
// Given an input position, return the first position >= pos not roman.
int SkipRomans(int pos);
unsigned SkipRomans(unsigned pos);
// Given an input position, return the first position >= pos not alpha.
int SkipAlpha(int pos);
unsigned SkipAlpha(unsigned pos);
private:
const UNICHARSET *u_;
const WERD_CHOICE *word_;
int wordlen_;
unsigned wordlen_;
};
int UnicodeSpanSkipper::SkipPunc(int pos) {
unsigned UnicodeSpanSkipper::SkipPunc(unsigned pos) {
while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) {
pos++;
}
return pos;
}
int UnicodeSpanSkipper::SkipDigits(int pos) {
unsigned UnicodeSpanSkipper::SkipDigits(unsigned pos) {
while (pos < wordlen_ &&
(u_->get_isdigit(word_->unichar_id(pos)) || IsDigitLike(UnicodeFor(u_, word_, pos)))) {
pos++;
@ -331,7 +330,7 @@ int UnicodeSpanSkipper::SkipDigits(int pos) {
return pos;
}
int UnicodeSpanSkipper::SkipRomans(int pos) {
unsigned UnicodeSpanSkipper::SkipRomans(unsigned pos) {
const char *kRomans = "ivxlmdIVXLMD";
while (pos < wordlen_) {
int ch = UnicodeFor(u_, word_, pos);
@ -343,7 +342,7 @@ int UnicodeSpanSkipper::SkipRomans(int pos) {
return pos;
}
int UnicodeSpanSkipper::SkipAlpha(int pos) {
unsigned UnicodeSpanSkipper::SkipAlpha(unsigned pos) {
while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) {
pos++;
}
@ -386,13 +385,13 @@ static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
UnicodeSpanSkipper m(u, werd);
int num_segments = 0;
int pos = 0;
unsigned pos = 0;
while (pos < werd->length() && num_segments < 3) {
int numeral_start = m.SkipPunc(pos);
auto numeral_start = m.SkipPunc(pos);
if (numeral_start > pos + 1) {
break;
}
int numeral_end = m.SkipRomans(numeral_start);
auto numeral_end = m.SkipRomans(numeral_start);
if (numeral_end == numeral_start) {
numeral_end = m.SkipDigits(numeral_start);
if (numeral_end == numeral_start) {
@ -2353,7 +2352,7 @@ void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
LeftoverSegments(rows, &leftovers2, leftover.begin, leftover.end);
bool pass2a_was_useful =
leftovers2.size() > 1 ||
(leftovers2.size() == 1 && (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size()));
(leftovers2.size() == 1 && (leftovers2[0].begin != 0 || static_cast<size_t>(leftovers2[0].end) != rows.size()));
if (pass2a_was_useful) {
for (auto &leftover2 : leftovers2) {
StrongEvidenceClassify(debug_level, &rows, leftover2.begin, leftover2.end, &theory);

View File

@ -34,9 +34,6 @@ class WERD_CHOICE;
TESS_API
bool AsciiLikelyListItem(const std::string &word);
// Return the first Unicode Codepoint from werd[pos].
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
// Set right word attributes given either a unicharset and werd or a utf8
// string.
TESS_API