Better utf8/32 conversion

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@941 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2025-01-18 14:41:36 +08:00 · 2014-01-09 17:27:17 +00:00 · 2014-01-09 17:27:17 +00:00 · 086c8d50a8
commit 086c8d50a8
parent 7dc5296fe9
2 changed files with 120 additions and 0 deletions
--- a/ccutil/unichar.cpp
+++ b/ccutil/unichar.cpp
@ -18,6 +18,8 @@
 ///////////////////////////////////////////////////////////////////////

 #include "unichar.h"
+#include "errcode.h"
+#include "tprintf.h"

 #define UNI_MAX_LEGAL_UTF32 0x0010FFFF

@ -142,3 +144,58 @@ int UNICHAR::utf8_step(const char* utf8_str) {

  return utf8_bytes[static_cast<unsigned char>(*utf8_str)];
 }
+
+// Advances the iterator to the start of the next UTF-8 character and returns
+// a reference to itself. The step size is whatever utf8_step() reports for
+// the byte at the current position. A step of 0 marks an illegal UTF-8 lead
+// byte: an error is printed together with a dump of up to 5 bytes starting at
+// the current position (stopping early at a NUL), and the iterator then moves
+// forward by exactly one byte so that iteration can continue.
+UNICHAR::const_iterator& UNICHAR::const_iterator::operator++() {
+  ASSERT_HOST(it_ != NULL);
+  int step = utf8_step(it_);
+  if (step == 0) {
+    tprintf("ERROR: Illegal UTF8 encountered.\n");
+    for (int i = 0; i < 5 && it_[i] != '\0'; ++i) {
+      // Go through unsigned char so that bytes >= 0x80 are not sign-extended
+      // (e.g. printed as 0xffffffe2) where plain char is signed.
+      const unsigned int byte_value = static_cast<unsigned char>(it_[i]);
+      // One dump entry per line; without the newline the entries run together.
+      tprintf("Index %d char = 0x%x\n", i, byte_value);
+    }
+    step = 1;
+  }
+  it_ += step;
+  return *this;
+}
+
+// Decodes the character at the current position and returns the value of
+// UNICHAR::first_uni() for it. If utf8_step() reports 0 (illegal UTF-8), a
+// warning is printed and a space character is returned instead.
+int UNICHAR::const_iterator::operator*() const {
+  ASSERT_HOST(it_ != NULL);
+  const int num_bytes = utf8_step(it_);
+  if (num_bytes != 0) {
+    UNICHAR decoded(it_, num_bytes);
+    return decoded.first_uni();
+  }
+  tprintf("WARNING: Illegal UTF8 encountered\n");
+  return ' ';
+}
+
+// Copies the bytes of the character at the current position into utf8_output
+// and returns how many bytes were written. The output is not terminated here.
+// If utf8_step() reports 0 (illegal UTF-8), a warning is printed, a single
+// space is written and 1 is returned.
+int UNICHAR::const_iterator::get_utf8(char* utf8_output) const {
+  ASSERT_HOST(it_ != NULL);
+  const int num_bytes = utf8_step(it_);
+  if (num_bytes != 0) {
+    strncpy(utf8_output, it_, num_bytes);
+    return num_bytes;
+  }
+  tprintf("WARNING: Illegal UTF8 encountered\n");
+  utf8_output[0] = ' ';
+  return 1;
+}
+
+// Returns the byte length that utf8_step() reports for the character at the
+// current position. If that length is 0 (illegal UTF-8), a warning is printed
+// and 1 is returned.
+int UNICHAR::const_iterator::utf8_len() const {
+  ASSERT_HOST(it_ != NULL);
+  const int num_bytes = utf8_step(it_);
+  if (num_bytes != 0) {
+    return num_bytes;
+  }
+  tprintf("WARNING: Illegal UTF8 encountered\n");
+  return 1;
+}
+
+// Returns an iterator positioned on the first byte of utf8_str. The len
+// argument is accepted but not used here.
+UNICHAR::const_iterator UNICHAR::begin(const char* utf8_str, const int len) {
+  const char* const first_byte = utf8_str;
+  return UNICHAR::const_iterator(first_byte);
+}
+
+// Returns an iterator positioned len bytes after the start of utf8_str.
+UNICHAR::const_iterator UNICHAR::end(const char* utf8_str, const int len) {
+  const char* const past_last_byte = utf8_str + len;
+  return UNICHAR::const_iterator(past_last_byte);
+}
--- a/ccutil/unichar.h
+++ b/ccutil/unichar.h
@ -83,6 +83,69 @@ class UNICHAR {
  // Get the number of bytes in the first character of the given utf8 string.
  static int utf8_step(const char* utf8_str);

+  // A class to simplify iterating over and accessing elements of a UTF8
+  // string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or
+  // take ownership of the underlying byte array. It also does not permit
+  // modification of the array (as the name suggests).
+  //
+  // Example:
+  //   for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len);
+  //        it != UNICHAR::end(str, str_len);
+  //        ++it) {
+  //     tprintf("UCS-4 symbol code = %d\n", *it);
+  //     char buf[5];  // 4 bytes for get_utf8 plus 1 for the terminator.
+  //     int char_len = it.get_utf8(buf); buf[char_len] = '\0';
+  //     tprintf("Char = %s\n", buf);
+  //   }
+  class const_iterator {
+    typedef const_iterator CI;
+
+   public:
+    // Step to the next UTF8 character.
+    // If the current position is at an illegal UTF8 character, then print an
+    // error message and step by one byte. If the current position is at a NULL
+    // value, don't step past it.
+    // NOTE(review): operator++ has no explicit check for a NUL byte; the step
+    // it takes there is whatever utf8_step() returns for that byte, so the
+    // "don't step past it" guarantee needs to be confirmed.
+    const_iterator& operator++();
+
+    // Return the UCS-4 value at the current position.
+    // If the current position is at an illegal UTF8 value, return a single
+    // space character.
+    int operator*() const;
+
+    // Store the UTF-8 encoding of the current codepoint into buf, which must be
+    // at least 4 bytes long. Return the number of bytes written.
+    // If the current position is at an illegal UTF8 value, writes a single
+    // space character and returns 1.
+    // Note that this method does not null-terminate the buffer.
+    int get_utf8(char* buf) const;
+    // Returns the number of bytes of the current codepoint. Returns 1 if the
+    // current position is at an illegal UTF8 value.
+    int utf8_len() const;
+
+    // Return the pointer into the string at the current position.
+    const char* utf8_data() const { return it_; }
+
+    // Iterator equality operators. Two iterators are equal when they point at
+    // the same byte address.
+    friend bool operator==(const CI& lhs, const CI& rhs) {
+      return lhs.it_ == rhs.it_;
+    }
+    friend bool operator!=(const CI& lhs, const CI& rhs) {
+      return !(lhs == rhs);
+    }
+
+   private:
+    // The constructor is private; UNICHAR is a friend so that it can build
+    // iterators (see UNICHAR::begin and UNICHAR::end).
+    friend class UNICHAR;
+    explicit const_iterator(const char* it) : it_(it) {}
+
+    const char* it_;  // Pointer into the string.
+  };
+
+  // Create a start/end iterator pointing to a string. Note that these methods
+  // are static and do NOT create a copy or take ownership of the underlying
+  // array.
+  // begin() points at utf8_str itself and does not use byte_length; end()
+  // points at utf8_str + byte_length.
+  static const_iterator begin(const char* utf8_str, const int byte_length);
+  static const_iterator end(const char* utf8_str, const int byte_length);
+
 private:
  // A UTF-8 representation of 1 or more Unicode characters.
  // The last element (chars[UNICHAR_LEN - 1]) is a length if