Better utf8/32 conversion

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@941 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
theraysmith@gmail.com 2014-01-09 17:27:17 +00:00
parent 7dc5296fe9
commit 086c8d50a8
2 changed files with 120 additions and 0 deletions

View File

@ -18,6 +18,8 @@
///////////////////////////////////////////////////////////////////////
#include "unichar.h"
#include "errcode.h"
#include "tprintf.h"
#define UNI_MAX_LEGAL_UTF32 0x0010FFFF
@ -142,3 +144,58 @@ int UNICHAR::utf8_step(const char* utf8_str) {
return utf8_bytes[static_cast<unsigned char>(*utf8_str)];
}
// Advances the iterator to the start of the next UTF-8 character and returns
// *this.
// The step size is whatever utf8_step() reports for the byte at the current
// position. A step of 0 is treated as illegal UTF-8: an error plus a dump of
// up to 5 following bytes (stopping early at a NUL) is printed, and the
// iterator moves forward by a single byte so that iteration always makes
// progress.
UNICHAR::const_iterator& UNICHAR::const_iterator::operator++() {
  ASSERT_HOST(it_ != NULL);
  int step = utf8_step(it_);
  if (step == 0) {
    tprintf("ERROR: Illegal UTF8 encountered.\n");
    for (int i = 0; i < 5 && it_[i] != '\0'; ++i) {
      // Go through unsigned char so that bytes >= 0x80 are printed as 0xNN
      // instead of a sign-extended 0xffffffNN where plain char is signed, and
      // terminate each entry with a newline so the dump is one byte per line.
      tprintf("Index %d char = 0x%x\n", i,
              static_cast<unsigned int>(static_cast<unsigned char>(it_[i])));
    }
    step = 1;
  }
  it_ += step;
  return *this;
}
// Decodes the UTF-8 character at the current position and returns its code
// point as an int.
// When utf8_step() reports 0 bytes for the current position (illegal UTF-8),
// a warning is printed and a space character is returned instead.
int UNICHAR::const_iterator::operator*() const {
  ASSERT_HOST(it_ != NULL);
  const int num_bytes = utf8_step(it_);
  if (num_bytes != 0) {
    // Build a temporary UNICHAR over just this character and decode it.
    return UNICHAR(it_, num_bytes).first_uni();
  }
  tprintf("WARNING: Illegal UTF8 encountered\n");
  return ' ';
}
// Copies the bytes of the current UTF-8 character into utf8_output and
// returns how many bytes were written. The output is NOT null-terminated.
// When utf8_step() reports 0 bytes for the current position (illegal UTF-8),
// a warning is printed, a single space is written and 1 is returned.
int UNICHAR::const_iterator::get_utf8(char* utf8_output) const {
  ASSERT_HOST(it_ != NULL);
  const int num_bytes = utf8_step(it_);
  if (num_bytes != 0) {
    strncpy(utf8_output, it_, num_bytes);
    return num_bytes;
  }
  tprintf("WARNING: Illegal UTF8 encountered\n");
  utf8_output[0] = ' ';
  return 1;
}
// Returns the byte length of the UTF-8 character at the current position.
// When utf8_step() reports 0 bytes (illegal UTF-8), a warning is printed and
// 1 is returned, matching the single-byte step taken by operator++.
int UNICHAR::const_iterator::utf8_len() const {
  ASSERT_HOST(it_ != NULL);
  const int num_bytes = utf8_step(it_);
  if (num_bytes != 0) {
    return num_bytes;
  }
  tprintf("WARNING: Illegal UTF8 encountered\n");
  return 1;
}
// Returns an iterator positioned on the first byte of utf8_str. The string is
// neither copied nor owned. len is not used to compute the start position.
UNICHAR::const_iterator UNICHAR::begin(const char* utf8_str, const int len) {
  const_iterator first(utf8_str);
  return first;
}
// Returns an iterator positioned len bytes past the start of utf8_str, i.e.
// one past the last byte of the string. The string is neither copied nor
// owned.
UNICHAR::const_iterator UNICHAR::end(const char* utf8_str, const int len) {
  const char* const past_last = utf8_str + len;
  return const_iterator(past_last);
}

View File

@ -83,6 +83,69 @@ class UNICHAR {
// Get the number of bytes in the first character of the given utf8 string.
// The result is looked up from the first byte of utf8_str only. A result of 0
// is treated by const_iterator as an illegal UTF8 byte.
static int utf8_step(const char* utf8_str);
// A class to simplify iterating over and accessing elements of a UTF8
// string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or
// take ownership of the underlying byte array. It also does not permit
// modification of the array (as the name suggests).
//
// Example:
// for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len);
// it != UNICHAR::end(str, str_len);
// ++it) {
// tprintf("UCS-4 symbol code = %d\n", *it);
// char buf[5];
// int char_len = it.get_utf8(buf); buf[char_len] = '\0';
// tprintf("Char = %s\n", buf);
// }
class const_iterator {
  typedef const_iterator CI;

 public:
  // Advance to the next UTF8 character.
  // At an illegal UTF8 byte an error message is printed and the iterator
  // moves forward by exactly one byte.
  // NOTE(review): the implementation has no special case for a NUL byte, so
  // incrementing while positioned on the terminator moves past it. Callers
  // should stop on reaching end().
  const_iterator& operator++();

  // The UCS-4 value of the character at the current position.
  // At an illegal UTF8 byte a single space character is returned.
  int operator*() const;

  // Write the UTF-8 encoding of the current codepoint into buf and return the
  // number of bytes written. buf must have room for at least 4 bytes and is
  // NOT null-terminated by this call.
  // At an illegal UTF8 byte a single space is written and 1 is returned.
  int get_utf8(char* buf) const;

  // Length in bytes of the current codepoint, or 1 at an illegal UTF8 byte.
  int utf8_len() const;

  // Raw pointer to the current position inside the underlying string.
  const char* utf8_data() const {
    return it_;
  }

  // Two iterators compare equal when they point at the same byte.
  friend bool operator==(const CI& lhs, const CI& rhs) {
    return lhs.it_ == rhs.it_;
  }
  friend bool operator!=(const CI& lhs, const CI& rhs) {
    return lhs.it_ != rhs.it_;
  }

 private:
  // Only UNICHAR (via begin()/end()) may construct iterators.
  friend class UNICHAR;
  explicit const_iterator(const char* pos) : it_(pos) {}

  const char* it_;  // Current position in the (non-owned) string.
};
// Create a start/end iterator pointing to a string. Note that these methods
// are static and do NOT create a copy or take ownership of the underlying
// array.
// begin() returns an iterator at utf8_str; end() returns an iterator at
// utf8_str + byte_length, i.e. one past the last byte. byte_length is a count
// of bytes, not of characters, and begin() does not use it.
static const_iterator begin(const char* utf8_str, const int byte_length);
static const_iterator end(const char* utf8_str, const int byte_length);
private:
// A UTF-8 representation of 1 or more Unicode characters.
// The last element (chars[UNICHAR_LEN - 1]) is a length if