mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 14:41:36 +08:00
Better utf8/32 conversion
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@941 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
7dc5296fe9
commit
086c8d50a8
@ -18,6 +18,8 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "unichar.h"
|
||||
#include "errcode.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
#define UNI_MAX_LEGAL_UTF32 0x0010FFFF
|
||||
|
||||
@ -142,3 +144,58 @@ int UNICHAR::utf8_step(const char* utf8_str) {
|
||||
|
||||
return utf8_bytes[static_cast<unsigned char>(*utf8_str)];
|
||||
}
|
||||
|
||||
// Advances the iterator to the next UTF-8 character and returns *this.
// The distance moved is whatever utf8_step() reports for the byte at the
// current position. When utf8_step() reports 0 (illegal UTF-8), an error and
// a dump of up to five bytes (stopping early at a NUL) are printed, and the
// iterator moves forward by exactly one byte so that iteration still makes
// progress.
UNICHAR::const_iterator& UNICHAR::const_iterator::operator++() {
  ASSERT_HOST(it_ != NULL);
  int step = utf8_step(it_);
  if (step == 0) {
    tprintf("ERROR: Illegal UTF8 encountered.\n");
    for (int i = 0; i < 5 && it_[i] != '\0'; ++i) {
      // Convert through unsigned char so bytes >= 0x80 print as two hex
      // digits instead of a sign-extended value where plain char is signed.
      // Each byte gets its own line; without the newline the dump runs
      // together.
      tprintf("Index %d char = 0x%x\n", i,
              static_cast<unsigned char>(it_[i]));
    }
    step = 1;
  }
  it_ += step;
  return *this;
}
|
||||
|
||||
// Decodes the character at the current position.
// Builds a temporary UNICHAR from the utf8_step() bytes starting at the
// current position and returns its first_uni() value. If utf8_step() reports
// 0 (illegal UTF-8), a warning is printed and a space character is returned
// instead.
int UNICHAR::const_iterator::operator*() const {
  ASSERT_HOST(it_ != NULL);
  const int num_bytes = utf8_step(it_);
  if (num_bytes != 0) {
    return UNICHAR(it_, num_bytes).first_uni();
  }
  tprintf("WARNING: Illegal UTF8 encountered\n");
  return ' ';
}
|
||||
|
||||
int UNICHAR::const_iterator::get_utf8(char* utf8_output) const {
|
||||
ASSERT_HOST(it_ != NULL);
|
||||
const int len = utf8_step(it_);
|
||||
if (len == 0) {
|
||||
tprintf("WARNING: Illegal UTF8 encountered\n");
|
||||
utf8_output[0] = ' ';
|
||||
return 1;
|
||||
}
|
||||
strncpy(utf8_output, it_, len);
|
||||
return len;
|
||||
}
|
||||
|
||||
int UNICHAR::const_iterator::utf8_len() const {
|
||||
ASSERT_HOST(it_ != NULL);
|
||||
const int len = utf8_step(it_);
|
||||
if (len == 0) {
|
||||
tprintf("WARNING: Illegal UTF8 encountered\n");
|
||||
return 1;
|
||||
}
|
||||
return len;
|
||||
}
|
||||
|
||||
// Returns an iterator positioned on the first byte of utf8_str.
// The string is neither copied nor owned. The len argument is not used here;
// it only mirrors the signature of end().
UNICHAR::const_iterator UNICHAR::begin(const char* utf8_str, const int len) {
  const_iterator first(utf8_str);
  return first;
}
|
||||
|
||||
// Returns an iterator positioned len bytes past the start of utf8_str, i.e.
// the one-past-the-end position to compare against while iterating.
// The string is neither copied nor owned.
UNICHAR::const_iterator UNICHAR::end(const char* utf8_str, const int len) {
  const char* const past_last = utf8_str + len;
  return const_iterator(past_last);
}
|
||||
|
@ -83,6 +83,69 @@ class UNICHAR {
|
||||
// Get the number of bytes in the first character of the given utf8 string.
|
||||
static int utf8_step(const char* utf8_str);
|
||||
|
||||
// A class to simplify iterating over and accessing elements of a UTF8
|
||||
// string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or
|
||||
// take ownership of the underlying byte array. It also does not permit
|
||||
// modification of the array (as the name suggests).
|
||||
//
|
||||
// Example:
|
||||
// for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len);
|
||||
// it != UNICHAR::end(str, str_len);
|
||||
// ++it) {
|
||||
// tprintf("UCS-4 symbol code = %d\n", *it);
|
||||
// char buf[5];
|
||||
// int char_len = it.get_utf8(buf); buf[char_len] = '\0';
|
||||
// tprintf("Char = %s\n", buf);
|
||||
// }
|
||||
class const_iterator {
  typedef const_iterator CI;

 public:
  // Moves to the next UTF8 character and returns *this.
  // At an illegal UTF8 byte an error message is printed and the iterator
  // moves forward by a single byte.
  // NOTE(review): operator++ performs no explicit check for a NUL byte, so
  // terminate loops by comparing against UNICHAR::end() rather than relying
  // on ++ to halt at a terminator.
  const_iterator& operator++();

  // Returns the UCS-4 value of the character at the current position, or a
  // single space character if the position holds illegal UTF8.
  int operator*() const;

  // Writes the UTF-8 bytes of the current codepoint into buf and returns the
  // number of bytes written. buf must have room for at least 4 bytes and is
  // NOT null-terminated by this call. At an illegal UTF8 value a single
  // space is written and 1 is returned.
  int get_utf8(char* buf) const;

  // Returns how many bytes the current codepoint occupies, or 1 if the
  // current position holds an illegal UTF8 value.
  int utf8_len() const;

  // Returns the raw pointer into the underlying string at the current
  // position.
  const char* utf8_data() const { return it_; }

  // Two iterators are equal exactly when they point at the same byte.
  friend bool operator==(const CI& lhs, const CI& rhs) {
    return lhs.it_ == rhs.it_;
  }
  friend bool operator!=(const CI& lhs, const CI& rhs) {
    return lhs.it_ != rhs.it_;
  }

 private:
  // Only UNICHAR may construct iterators (see UNICHAR::begin / UNICHAR::end).
  friend class UNICHAR;
  explicit const_iterator(const char* it) : it_(it) {}

  const char* it_;  // Current position in the (non-owned) string.
};
|
||||
|
||||
// Create a start/end iterator pointing to a string. Note that these methods
|
||||
// are static and do NOT create a copy or take ownership of the underlying
|
||||
// array.
|
||||
static const_iterator begin(const char* utf8_str, const int byte_length);
|
||||
static const_iterator end(const char* utf8_str, const int byte_length);
|
||||
|
||||
private:
|
||||
// A UTF-8 representation of 1 or more Unicode characters.
|
||||
// The last element (chars[UNICHAR_LEN - 1]) is a length if
|
||||
|
Loading…
Reference in New Issue
Block a user