ccmain/paragraphs: Make UnicodeFor a local (static) function and fix signed/unsigned comparisons

Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
Stefan Weil 2021-10-09 20:32:53 +02:00
parent 4c36e2e29a
commit a9c3f6d87f
2 changed files with 18 additions and 22 deletions

View File

@ -73,7 +73,7 @@ static int Epsilon(int space_pix) {
static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *function_name, static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *function_name,
const std::vector<RowScratchRegisters> *rows, int row_start, const std::vector<RowScratchRegisters> *rows, int row_start,
int row_end) { int row_end) {
if (row_start < 0 || row_end > rows->size() || row_start > row_end) { if (row_start < 0 || static_cast<size_t>(row_end) > rows->size() || row_start > row_end) {
tprintf("Invalid arguments rows[%d, %d) while rows is of size %zu.\n", row_start, row_end, tprintf("Invalid arguments rows[%d, %d) while rows is of size %zu.\n", row_start, row_end,
rows->size()); rows->size());
return false; return false;
@ -94,8 +94,8 @@ static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *fun
static void PrintTable(const std::vector<std::vector<std::string>> &rows, const char *colsep) { static void PrintTable(const std::vector<std::vector<std::string>> &rows, const char *colsep) {
std::vector<int> max_col_widths; std::vector<int> max_col_widths;
for (const auto &row : rows) { for (const auto &row : rows) {
int num_columns = row.size(); auto num_columns = row.size();
for (int c = 0; c < num_columns; c++) { for (size_t c = 0; c < num_columns; c++) {
int num_unicodes = 0; int num_unicodes = 0;
for (char i : row[c]) { for (char i : row[c]) {
if ((i & 0xC0) != 0x80) { if ((i & 0xC0) != 0x80) {
@ -285,7 +285,7 @@ bool AsciiLikelyListItem(const std::string &word) {
// ========== Brain Dead Language Model (Tesseract Version) ================ // ========== Brain Dead Language Model (Tesseract Version) ================
// Return the first Unicode Codepoint from werd[pos]. // Return the first Unicode Codepoint from werd[pos].
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) { static int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, unsigned pos) {
if (!u || !werd || pos > werd->length()) { if (!u || !werd || pos > werd->length()) {
return 0; return 0;
} }
@ -297,33 +297,32 @@ int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) {
class UnicodeSpanSkipper { class UnicodeSpanSkipper {
public: public:
UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word) UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
: u_(unicharset), word_(word) { : u_(unicharset), word_(word), wordlen_(word->length()) {
wordlen_ = word->length();
} }
// Given an input position, return the first position >= pos not punc. // Given an input position, return the first position >= pos not punc.
int SkipPunc(int pos); unsigned SkipPunc(unsigned pos);
// Given an input position, return the first position >= pos not digit. // Given an input position, return the first position >= pos not digit.
int SkipDigits(int pos); unsigned SkipDigits(unsigned pos);
// Given an input position, return the first position >= pos not roman. // Given an input position, return the first position >= pos not roman.
int SkipRomans(int pos); unsigned SkipRomans(unsigned pos);
// Given an input position, return the first position >= pos not alpha. // Given an input position, return the first position >= pos not alpha.
int SkipAlpha(int pos); unsigned SkipAlpha(unsigned pos);
private: private:
const UNICHARSET *u_; const UNICHARSET *u_;
const WERD_CHOICE *word_; const WERD_CHOICE *word_;
int wordlen_; unsigned wordlen_;
}; };
int UnicodeSpanSkipper::SkipPunc(int pos) { unsigned UnicodeSpanSkipper::SkipPunc(unsigned pos) {
while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) { while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) {
pos++; pos++;
} }
return pos; return pos;
} }
int UnicodeSpanSkipper::SkipDigits(int pos) { unsigned UnicodeSpanSkipper::SkipDigits(unsigned pos) {
while (pos < wordlen_ && while (pos < wordlen_ &&
(u_->get_isdigit(word_->unichar_id(pos)) || IsDigitLike(UnicodeFor(u_, word_, pos)))) { (u_->get_isdigit(word_->unichar_id(pos)) || IsDigitLike(UnicodeFor(u_, word_, pos)))) {
pos++; pos++;
@ -331,7 +330,7 @@ int UnicodeSpanSkipper::SkipDigits(int pos) {
return pos; return pos;
} }
int UnicodeSpanSkipper::SkipRomans(int pos) { unsigned UnicodeSpanSkipper::SkipRomans(unsigned pos) {
const char *kRomans = "ivxlmdIVXLMD"; const char *kRomans = "ivxlmdIVXLMD";
while (pos < wordlen_) { while (pos < wordlen_) {
int ch = UnicodeFor(u_, word_, pos); int ch = UnicodeFor(u_, word_, pos);
@ -343,7 +342,7 @@ int UnicodeSpanSkipper::SkipRomans(int pos) {
return pos; return pos;
} }
int UnicodeSpanSkipper::SkipAlpha(int pos) { unsigned UnicodeSpanSkipper::SkipAlpha(unsigned pos) {
while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) { while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) {
pos++; pos++;
} }
@ -386,13 +385,13 @@ static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
UnicodeSpanSkipper m(u, werd); UnicodeSpanSkipper m(u, werd);
int num_segments = 0; int num_segments = 0;
int pos = 0; unsigned pos = 0;
while (pos < werd->length() && num_segments < 3) { while (pos < werd->length() && num_segments < 3) {
int numeral_start = m.SkipPunc(pos); auto numeral_start = m.SkipPunc(pos);
if (numeral_start > pos + 1) { if (numeral_start > pos + 1) {
break; break;
} }
int numeral_end = m.SkipRomans(numeral_start); auto numeral_end = m.SkipRomans(numeral_start);
if (numeral_end == numeral_start) { if (numeral_end == numeral_start) {
numeral_end = m.SkipDigits(numeral_start); numeral_end = m.SkipDigits(numeral_start);
if (numeral_end == numeral_start) { if (numeral_end == numeral_start) {
@ -2353,7 +2352,7 @@ void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
LeftoverSegments(rows, &leftovers2, leftover.begin, leftover.end); LeftoverSegments(rows, &leftovers2, leftover.begin, leftover.end);
bool pass2a_was_useful = bool pass2a_was_useful =
leftovers2.size() > 1 || leftovers2.size() > 1 ||
(leftovers2.size() == 1 && (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size())); (leftovers2.size() == 1 && (leftovers2[0].begin != 0 || static_cast<size_t>(leftovers2[0].end) != rows.size()));
if (pass2a_was_useful) { if (pass2a_was_useful) {
for (auto &leftover2 : leftovers2) { for (auto &leftover2 : leftovers2) {
StrongEvidenceClassify(debug_level, &rows, leftover2.begin, leftover2.end, &theory); StrongEvidenceClassify(debug_level, &rows, leftover2.begin, leftover2.end, &theory);

View File

@ -34,9 +34,6 @@ class WERD_CHOICE;
TESS_API TESS_API
bool AsciiLikelyListItem(const std::string &word); bool AsciiLikelyListItem(const std::string &word);
// Return the first Unicode Codepoint from werd[pos].
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
// Set right word attributes given either a unicharset and werd or a utf8 // Set right word attributes given either a unicharset and werd or a utf8
// string. // string.
TESS_API TESS_API