ccmain/paragraphs: Make UnicodeFor a local function and fix signed/unsigned comparisons

Signed-off-by: Stefan Weil <sw@weilnetz.de>
2025-01-23 10:56:19 +08:00 · 2021-10-09 20:32:53 +02:00 · 2021-10-09 20:32:53 +02:00 · a9c3f6d87f
commit a9c3f6d87f
parent 4c36e2e29a
2 changed files with 18 additions and 22 deletions
--- a/src/ccmain/paragraphs.cpp
+++ b/src/ccmain/paragraphs.cpp
@ -73,7 +73,7 @@ static int Epsilon(int space_pix) {
 static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *function_name,
                              const std::vector<RowScratchRegisters> *rows, int row_start,
                              int row_end) {
-  if (row_start < 0 || row_end > rows->size() || row_start > row_end) {
+  if (row_start < 0 || static_cast<size_t>(row_end) > rows->size() || row_start > row_end) {
    tprintf("Invalid arguments rows[%d, %d) while rows is of size %zu.\n", row_start, row_end,
            rows->size());
    return false;
@ -94,8 +94,8 @@ static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *fun
 static void PrintTable(const std::vector<std::vector<std::string>> &rows, const char *colsep) {
  std::vector<int> max_col_widths;
  for (const auto &row : rows) {
-    int num_columns = row.size();
-    for (int c = 0; c < num_columns; c++) {
+    auto num_columns = row.size();
+    for (size_t c = 0; c < num_columns; c++) {
      int num_unicodes = 0;
      for (char i : row[c]) {
        if ((i & 0xC0) != 0x80) {
@ -285,7 +285,7 @@ bool AsciiLikelyListItem(const std::string &word) {
 // ========== Brain Dead Language Model (Tesseract Version) ================

 // Return the first Unicode Codepoint from werd[pos].
-int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) {
+static int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, unsigned pos) {
  if (!u || !werd || pos > werd->length()) {
    return 0;
  }
@ -297,33 +297,32 @@ int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) {
 class UnicodeSpanSkipper {
 public:
  UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
-      : u_(unicharset), word_(word) {
-    wordlen_ = word->length();
+      : u_(unicharset), word_(word), wordlen_(word->length()) {
  }

  // Given an input position, return the first position >= pos not punc.
-  int SkipPunc(int pos);
+  unsigned SkipPunc(unsigned pos);
  // Given an input position, return the first position >= pos not digit.
-  int SkipDigits(int pos);
+  unsigned SkipDigits(unsigned pos);
  // Given an input position, return the first position >= pos not roman.
-  int SkipRomans(int pos);
+  unsigned SkipRomans(unsigned pos);
  // Given an input position, return the first position >= pos not alpha.
-  int SkipAlpha(int pos);
+  unsigned SkipAlpha(unsigned pos);

 private:
  const UNICHARSET *u_;
  const WERD_CHOICE *word_;
-  int wordlen_;
+  unsigned wordlen_;
 };

-int UnicodeSpanSkipper::SkipPunc(int pos) {
+unsigned UnicodeSpanSkipper::SkipPunc(unsigned pos) {
  while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) {
    pos++;
  }
  return pos;
 }

-int UnicodeSpanSkipper::SkipDigits(int pos) {
+unsigned UnicodeSpanSkipper::SkipDigits(unsigned pos) {
  while (pos < wordlen_ &&
         (u_->get_isdigit(word_->unichar_id(pos)) || IsDigitLike(UnicodeFor(u_, word_, pos)))) {
    pos++;
@ -331,7 +330,7 @@ int UnicodeSpanSkipper::SkipDigits(int pos) {
  return pos;
 }

-int UnicodeSpanSkipper::SkipRomans(int pos) {
+unsigned UnicodeSpanSkipper::SkipRomans(unsigned pos) {
  const char *kRomans = "ivxlmdIVXLMD";
  while (pos < wordlen_) {
    int ch = UnicodeFor(u_, word_, pos);
@ -343,7 +342,7 @@ int UnicodeSpanSkipper::SkipRomans(int pos) {
  return pos;
 }

-int UnicodeSpanSkipper::SkipAlpha(int pos) {
+unsigned UnicodeSpanSkipper::SkipAlpha(unsigned pos) {
  while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) {
    pos++;
  }
@ -386,13 +385,13 @@ static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {

  UnicodeSpanSkipper m(u, werd);
  int num_segments = 0;
-  int pos = 0;
+  unsigned pos = 0;
  while (pos < werd->length() && num_segments < 3) {
-    int numeral_start = m.SkipPunc(pos);
+    auto numeral_start = m.SkipPunc(pos);
    if (numeral_start > pos + 1) {
      break;
    }
-    int numeral_end = m.SkipRomans(numeral_start);
+    auto numeral_end = m.SkipRomans(numeral_start);
    if (numeral_end == numeral_start) {
      numeral_end = m.SkipDigits(numeral_start);
      if (numeral_end == numeral_start) {
@ -2353,7 +2352,7 @@ void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
    LeftoverSegments(rows, &leftovers2, leftover.begin, leftover.end);
    bool pass2a_was_useful =
        leftovers2.size() > 1 ||
-        (leftovers2.size() == 1 && (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size()));
+        (leftovers2.size() == 1 && (leftovers2[0].begin != 0 || static_cast<size_t>(leftovers2[0].end) != rows.size()));
    if (pass2a_was_useful) {
      for (auto &leftover2 : leftovers2) {
        StrongEvidenceClassify(debug_level, &rows, leftover2.begin, leftover2.end, &theory);
--- a/src/ccmain/paragraphs_internal.h
+++ b/src/ccmain/paragraphs_internal.h
@ -34,9 +34,6 @@ class WERD_CHOICE;
 TESS_API
 bool AsciiLikelyListItem(const std::string &word);

-// Return the first Unicode Codepoint from werd[pos].
-int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
-
 // Set right word attributes given either a unicharset and werd or a utf8
 // string.
 TESS_API