mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-07 18:02:40 +08:00
replace CubeUtils::UTF8ToUTF32 in pdfrenderer
This commit is contained in:
parent
7cb196d80d
commit
300b5246f3
@ -413,13 +413,25 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
|
||||
do {
|
||||
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
|
||||
if (grapheme && grapheme[0] != '\0') {
|
||||
// TODO(jbreiden) Do a real UTF-16BE conversion
|
||||
// http://en.wikipedia.org/wiki/UTF-16#Example_UTF-16_encoding_procedure
|
||||
string_32 utf32;
|
||||
CubeUtils::UTF8ToUTF32(grapheme, &utf32);
|
||||
GenericVector<int> unicodes;
|
||||
UNICHAR::UTF8ToUnicode(grapheme, &unicodes);
|
||||
char utf16[20];
|
||||
for (int i = 0; i < static_cast<int>(utf32.length()); i++) {
|
||||
snprintf(utf16, sizeof(utf16), "<%04X>", utf32[i]);
|
||||
for (int i = 0; i < unicodes.length(); i++) {
|
||||
int code = unicodes[i];
|
||||
// Convert to UTF-16BE https://en.wikipedia.org/wiki/UTF-16
|
||||
if ((code > 0xD7FF && code < 0xE0000) || code > 0x10FFFF) {
|
||||
tprintf("Dropping invalid codepoint %d\n", code);
|
||||
continue;
|
||||
}
|
||||
if (code < 0x10000) {
|
||||
snprintf(utf16, sizeof(utf16), "<%04X>", code);
|
||||
} else {
|
||||
int a = code - 0x010000;
|
||||
int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
|
||||
int low_surrogate = (0x03FF & a) + 0xDC00;
|
||||
snprintf(utf16, sizeof(utf16), "<%04X%04X>",
|
||||
high_surrogate, low_surrogate);
|
||||
}
|
||||
pdf_word += utf16;
|
||||
pdf_word_len++;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user