tesseract/ccutil/unichar.cpp

///////////////////////////////////////////////////////////////////////
// File:        unichar.cpp
// Description: Unicode character/ligature class.
// Author:      Ray Smith
// Created:     Wed Jun 28 17:05:01 PDT 2006
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#include "unichar.h"
#include "errcode.h"
#include "genericvector.h"
#include "tprintf.h"

#define UNI_MAX_LEGAL_UTF32 0x0010FFFF

namespace tesseract {

// Construct from a utf8 string. If len<0 then the string is null terminated.
// If the string is too long to fit in the UNICHAR then it takes only what
// will fit. Checks for illegal input and stops at an illegal sequence.
// A character whose encoding would extend past len counts as truncated and
// is not copied, so with an explicit len no byte at or beyond utf8_str + len
// is ever read.
// The resulting UNICHAR may be empty.
UNICHAR::UNICHAR(const char* utf8_str, int len) {
  int total_len = 0;
  int step = 0;
  if (len < 0) {
    // Measure the terminated input, looking at no more than UNICHAR_LEN bytes.
    for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len) {
    }
  }
  // Accept whole characters one at a time; total_len only ever advances past
  // a character that has been fully validated.
  for (total_len = 0; total_len < len; total_len += step) {
    step = utf8_step(utf8_str + total_len);
    if (total_len + step > UNICHAR_LEN)
      break;  // Too long.
    if (step == 0)
      break;  // Illegal first byte.
    if (total_len + step > len)
      break;  // Truncated: the trailing bytes lie beyond the input.
    int i;
    for (i = 1; i < step; ++i)
      if ((utf8_str[total_len + i] & 0xc0) != 0x80)
        break;
    if (i < step)
      break;  // Illegal trailing byte (not of the form 10xxxxxx).
  }
  memcpy(chars, utf8_str, total_len);
  if (total_len < UNICHAR_LEN) {
    // The last byte of the buffer stores the length, and everything between
    // the data and that byte is zeroed. When the data fills the whole buffer
    // no length is stored.
    chars[UNICHAR_LEN - 1] = total_len;
    while (total_len < UNICHAR_LEN - 1)
      chars[total_len++] = 0;
  }
}

// Construct from a single UCS4 character. Illegal values (negative, or
// greater than UNI_MAX_LEGAL_UTF32) are ignored, resulting in an empty
// UNICHAR.
UNICHAR::UNICHAR(int unicode) {
  const int bytemask = 0xBF;
  const int bytemark = 0x80;

  // Start from an all-zero buffer: this is the empty result for illegal
  // input, and it leaves every byte after the encoded character zeroed
  // instead of indeterminate.
  memset(chars, 0, UNICHAR_LEN);
  if (unicode < 0) {
    return;  // Negative values are not code points; without this check they
             // would be taken by the single-byte branch below.
  }

  // In each branch the last byte of the buffer receives the encoded length.
  // Trailing bytes are written last-to-first, consuming 6 bits of the code
  // point each time; the lead byte gets whatever bits remain plus its marker.
  if (unicode < 0x80) {
    chars[UNICHAR_LEN - 1] = 1;
    chars[0] = static_cast<char>(unicode);
  } else if (unicode < 0x800) {
    chars[UNICHAR_LEN - 1] = 2;
    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
    unicode >>= 6;
    chars[0] = static_cast<char>(unicode | 0xc0);
  } else if (unicode < 0x10000) {
    chars[UNICHAR_LEN - 1] = 3;
    chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
    unicode >>= 6;
    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
    unicode >>= 6;
    chars[0] = static_cast<char>(unicode | 0xe0);
  } else if (unicode <= UNI_MAX_LEGAL_UTF32) {
    chars[UNICHAR_LEN - 1] = 4;
    chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
    unicode >>= 6;
    chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
    unicode >>= 6;
    chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
    unicode >>= 6;
    chars[0] = static_cast<char>(unicode | 0xf0);
  }
  // Values above UNI_MAX_LEGAL_UTF32 fall through with the buffer still
  // zeroed, i.e. an empty UNICHAR.
}

// Decode the first stored character and return it as a UCS-4 value.
int UNICHAR::first_uni() const {
  // Indexed by encoded length. Each entry is the sum of the lead-byte and
  // trailing-byte marker bits after shifting, so subtracting it leaves only
  // the payload bits (e.g. for 2 bytes: (0xC0 << 6) + 0x80 == 0x3080).
  static const int utf8_offsets[5] = {
    0, 0, 0x3080, 0xE2080, 0x3C82080
  };
  const int num_bytes = utf8_step(chars);
  int code = 0;
  // Fold the bytes in, most significant first, 6 bits per byte. A length of
  // zero (illegal lead byte) skips the loop and yields 0.
  for (int index = 0; index < num_bytes; ++index) {
    code = (code << 6) + static_cast<unsigned char>(chars[index]);
  }
  return code - utf8_offsets[num_bytes];
}

// Returns a freshly allocated, null-terminated copy of the stored UTF-8
// bytes (utf8_len() of them). The caller must delete[] the result.
char* UNICHAR::utf8_str() const {
  const int num_bytes = utf8_len();
  char* copy = new char[num_bytes + 1];
  copy[num_bytes] = 0;  // Terminator goes right after the copied bytes.
  memcpy(copy, chars, num_bytes);
  return copy;
}

// Returns the number of bytes in the character that starts at utf8_str,
// judged from its first byte alone, or 0 if that byte cannot start a
// character.
int UNICHAR::utf8_step(const char* utf8_str) {
  const unsigned char lead = static_cast<unsigned char>(*utf8_str);
  if (lead < 0x80) {
    return 1;  // 0x00-0x7F: single byte.
  }
  if (lead < 0xc0) {
    return 0;  // 0x80-0xBF: trailing byte, not a legal start.
  }
  if (lead < 0xe0) {
    return 2;  // 0xC0-0xDF.
  }
  if (lead < 0xf0) {
    return 3;  // 0xE0-0xEF.
  }
  if (lead < 0xf8) {
    return 4;  // 0xF0-0xF7.
  }
  return 0;  // 0xF8-0xFF: not a legal start.
}

// Advances the iterator by one character, as sized by utf8_step on the
// current byte. If that byte is not a legal first byte, an error and up to
// five of the following bytes (stopping at a null) are printed, and the
// iterator moves forward by a single byte.
UNICHAR::const_iterator& UNICHAR::const_iterator::operator++() {
  ASSERT_HOST(it_ != NULL);
  int step = utf8_step(it_);
  if (step == 0) {
    tprintf("ERROR: Illegal UTF8 encountered.\n");
    for (int i = 0; i < 5 && it_[i] != '\0'; ++i) {
      // Convert through unsigned char so that bytes >= 0x80 print as two hex
      // digits rather than a sign-extended value where char is signed.
      tprintf("Index %d char = 0x%x\n", i,
              static_cast<unsigned int>(static_cast<unsigned char>(it_[i])));
    }
    step = 1;
  }
  it_ += step;
  return *this;
}

// Returns the character at the current position as a UCS-4 value. If the
// current byte is not a legal first byte, prints a warning and returns a
// space instead.
int UNICHAR::const_iterator::operator*() const {
  ASSERT_HOST(it_ != NULL);
  const int num_bytes = utf8_step(it_);
  if (num_bytes != 0) {
    // Build a UNICHAR from exactly this character's bytes and decode it.
    UNICHAR current(it_, num_bytes);
    return current.first_uni();
  }
  tprintf("WARNING: Illegal UTF8 encountered\n");
  return ' ';
}

// Copies the bytes of the current character into utf8_output and returns
// how many were written. No terminator is appended. If the current byte is
// not a legal first byte, prints a warning, writes a single space and
// returns 1.
int UNICHAR::const_iterator::get_utf8(char* utf8_output) const {
  ASSERT_HOST(it_ != NULL);
  const int num_bytes = utf8_step(it_);
  if (num_bytes != 0) {
    strncpy(utf8_output, it_, num_bytes);
    return num_bytes;
  }
  tprintf("WARNING: Illegal UTF8 encountered\n");
  utf8_output[0] = ' ';
  return 1;
}

// Returns the byte length of the current character. If the current byte is
// not a legal first byte, prints a warning and reports a length of 1.
int UNICHAR::const_iterator::utf8_len() const {
  ASSERT_HOST(it_ != NULL);
  const int num_bytes = utf8_step(it_);
  if (num_bytes != 0) {
    return num_bytes;
  }
  tprintf("WARNING: Illegal UTF8 encountered\n");
  return 1;
}

// True when utf8_step accepts the byte at the current position as the first
// byte of a character. Only that one byte is examined.
bool UNICHAR::const_iterator::is_legal() const {
  const int num_bytes = utf8_step(it_);
  return num_bytes > 0;
}

// Returns an iterator positioned at the first byte of utf8_str.
// The len argument is not used here.
UNICHAR::const_iterator UNICHAR::begin(const char* utf8_str, const int len) {
  const_iterator first(utf8_str);
  return first;
}

// Returns an iterator positioned len bytes past the start of utf8_str.
UNICHAR::const_iterator UNICHAR::end(const char* utf8_str, const int len) {
  const char* const past_last = utf8_str + len;
  const_iterator last(past_last);
  return last;
}

// Converts a utf-8 string to a vector of unicodes.
// Returns an empty vector if the input contains invalid UTF-8.
/* static */
std::vector<char32> UNICHAR::UTF8ToUTF32(const char* utf8_str) {
  const int utf8_length = strlen(utf8_str);
  std::vector<char32> unicodes;
  unicodes.reserve(utf8_length);
  const_iterator end_it(end(utf8_str, utf8_length));
  for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) {
    if (it.is_legal()) {
      unicodes.push_back(*it);
    } else {
      unicodes.clear();
      return unicodes;
    }
  }
  return unicodes;
}

// Encodes each element of str32 as UTF-8 and concatenates the results.
// Returns an empty string as soon as any element produces an empty UNICHAR
// or one whose first byte utf8_step rejects.
std::string UNICHAR::UTF32ToUTF8(const std::vector<char32>& str32) {
  std::string encoded;
  for (char32 code_point : str32) {
    UNICHAR one_char(code_point);
    if (uni_char_is_empty_guard: false) {
    }
    if (one_char.utf8_len() <= 0) {
      return std::string();
    }
    const int num_bytes = utf8_step(one_char.utf8());
    if (num_bytes <= 0) {
      return std::string();
    }
    encoded.append(one_char.utf8(), num_bytes);
  }
  return encoded;
}

}  // namespace tesseract
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`///////////////////////////////////////////////////////////////////////`
			`// File: unichar.cpp`
			`// Description: Unicode character/ligature class.`
			`// Author: Ray Smith`
			`// Created: Wed Jun 28 17:05:01 PDT 2006`
			`//`
			`// (C) Copyright 2006, Google Inc.`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`
			`//`
			`///////////////////////////////////////////////////////////////////////`

			`#include "unichar.h"`
Better utf8/32 conversion git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@941 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2014-01-10 01:27:17 +08:00			`#include "errcode.h"`
Started TFile conversion to remove fmemopen git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1139 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2014-08-12 07:09:25 +08:00			`#include "genericvector.h"`
Better utf8/32 conversion git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@941 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2014-01-10 01:27:17 +08:00			`#include "tprintf.h"`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00
			`#define UNI_MAX_LEGAL_UTF32 0x0010FFFF`

Fixes from pull of cleanups: clang tidied, reviewed, fixed new bugs, undeleted needed code. Probably breaks the build, due to some inclusion of changes in utf8/32 conversion 2017-07-15 00:30:14 +08:00			`namespace tesseract {`

top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`// Construct from a utf8 string. If len<0 then the string is null terminated.`
			`// If the string is too long to fit in the UNICHAR then it takes only what`
			`// will fit. Checks for illegal input and stops at an illegal sequence.`
			`// The resulting UNICHAR may be empty.`
			`UNICHAR::UNICHAR(const char* utf8_str, int len) {`
			`int total_len = 0;`
			`int step = 0;`
			`if (len < 0) {`
More minor fixes from issues and cleanup git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@974 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2014-01-10 09:38:00 +08:00			`for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len);`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`}`
			`for (total_len = 0; total_len < len; total_len += step) {`
			`step = utf8_step(utf8_str + total_len);`
			`if (total_len + step > UNICHAR_LEN)`
			`break; // Too long.`
			`if (step == 0)`
			`break; // Illegal first byte.`
			`int i;`
			`for (i = 1; i < step; ++i)`
			`if ((utf8_str[total_len + i] & 0xc0) != 0x80)`
			`break;`
			`if (i < step)`
			`break; // Illegal surrogate`
			`}`
			`memcpy(chars, utf8_str, total_len);`
			`if (total_len < UNICHAR_LEN) {`
			`chars[UNICHAR_LEN - 1] = total_len;`
			`while (total_len < UNICHAR_LEN - 1)`
			`chars[total_len++] = 0;`
			`}`
			`}`

			`// Construct from a single UCS4 character. Illegal values are ignored,`
			`// resulting in an empty UNICHAR.`
			`UNICHAR::UNICHAR(int unicode) {`
			`const int bytemask = 0xBF;`
			`const int bytemark = 0x80;`

			`if (unicode < 0x80) {`
			`chars[UNICHAR_LEN - 1] = 1;`
			`chars[2] = 0;`
			`chars[1] = 0;`
			`chars[0] = static_cast<char>(unicode);`
			`} else if (unicode < 0x800) {`
			`chars[UNICHAR_LEN - 1] = 2;`
			`chars[2] = 0;`
			`chars[1] = static_cast<char>((unicode \| bytemark) & bytemask);`
			`unicode >>= 6;`
			`chars[0] = static_cast<char>(unicode \| 0xc0);`
			`} else if (unicode < 0x10000) {`
			`chars[UNICHAR_LEN - 1] = 3;`
			`chars[2] = static_cast<char>((unicode \| bytemark) & bytemask);`
			`unicode >>= 6;`
			`chars[1] = static_cast<char>((unicode \| bytemark) & bytemask);`
			`unicode >>= 6;`
			`chars[0] = static_cast<char>(unicode \| 0xe0);`
			`} else if (unicode <= UNI_MAX_LEGAL_UTF32) {`
			`chars[UNICHAR_LEN - 1] = 4;`
			`chars[3] = static_cast<char>((unicode \| bytemark) & bytemask);`
			`unicode >>= 6;`
			`chars[2] = static_cast<char>((unicode \| bytemark) & bytemask);`
			`unicode >>= 6;`
			`chars[1] = static_cast<char>((unicode \| bytemark) & bytemask);`
			`unicode >>= 6;`
			`chars[0] = static_cast<char>(unicode \| 0xf0);`
			`} else {`
			`memset(chars, 0, UNICHAR_LEN);`
			`}`
			`}`

			`// Get the first character as UCS-4.`
			`int UNICHAR::first_uni() const {`
			`static const int utf8_offsets[5] = {`
			`0, 0, 0x3080, 0xE2080, 0x3C82080`
			`};`
			`int uni = 0;`
			`int len = utf8_step(chars);`
			`const char* src = chars;`

			`switch (len) {`
			`default:`
			`break;`
			`case 4:`
Fixed problems with signed characters. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@85 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:05:40 +08:00			`uni += static_cast<unsigned char>(*src++);`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`uni <<= 6;`
			`case 3:`
Fixed problems with signed characters. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@85 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:05:40 +08:00			`uni += static_cast<unsigned char>(*src++);`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`uni <<= 6;`
			`case 2:`
Fixed problems with signed characters. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@85 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:05:40 +08:00			`uni += static_cast<unsigned char>(*src++);`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`uni <<= 6;`
			`case 1:`
Fixed problems with signed characters. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@85 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:05:40 +08:00			`uni += static_cast<unsigned char>(*src++);`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`}`
			`uni -= utf8_offsets[len];`
			`return uni;`
			`}`

			`// Get a terminated UTF8 string: Must delete[] it after use.`
			`char* UNICHAR::utf8_str() const {`
			`int len = utf8_len();`
			`char* str = new char[len + 1];`
			`memcpy(str, chars, len);`
			`str[len] = 0;`
			`return str;`
			`}`

			`// Get the number of bytes in the first character of the given utf8 string.`
			`int UNICHAR::utf8_step(const char* utf8_str) {`
			`static const char utf8_bytes[256] = {`
			`1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,`
			`1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,`
			`1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,`
			`1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,`
Fixed various internationalization issues, mostly for training git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@106 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-08-31 02:18:35 +08:00			`0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,`
			`0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,`
			`3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0`
			`};`

Fixed problems with signed characters. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@85 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 09:05:40 +08:00			`return utf8_bytes[static_cast<unsigned char>(*utf8_str)];`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`}`
Better utf8/32 conversion git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@941 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2014-01-10 01:27:17 +08:00
			`UNICHAR::const_iterator& UNICHAR::const_iterator::operator++() {`
			`ASSERT_HOST(it_ != NULL);`
			`int step = utf8_step(it_);`
			`if (step == 0) {`
			`tprintf("ERROR: Illegal UTF8 encountered.\n");`
			`for (int i = 0; i < 5 && it_[i] != '\0'; ++i) {`
Fixed issue 1133 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1080 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2014-04-25 05:18:00 +08:00			`tprintf("Index %d char = 0x%x\n", i, it_[i]);`
Better utf8/32 conversion git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@941 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2014-01-10 01:27:17 +08:00			`}`
			`step = 1;`
			`}`
			`it_ += step;`
			`return *this;`
			`}`

			`int UNICHAR::const_iterator::operator*() const {`
			`ASSERT_HOST(it_ != NULL);`
			`const int len = utf8_step(it_);`
			`if (len == 0) {`
			`tprintf("WARNING: Illegal UTF8 encountered\n");`
			`return ' ';`
			`}`
			`UNICHAR uch(it_, len);`
			`return uch.first_uni();`
			`}`

			`int UNICHAR::const_iterator::get_utf8(char* utf8_output) const {`
			`ASSERT_HOST(it_ != NULL);`
			`const int len = utf8_step(it_);`
			`if (len == 0) {`
			`tprintf("WARNING: Illegal UTF8 encountered\n");`
			`utf8_output[0] = ' ';`
			`return 1;`
			`}`
			`strncpy(utf8_output, it_, len);`
			`return len;`
			`}`

			`int UNICHAR::const_iterator::utf8_len() const {`
			`ASSERT_HOST(it_ != NULL);`
			`const int len = utf8_step(it_);`
			`if (len == 0) {`
			`tprintf("WARNING: Illegal UTF8 encountered\n");`
			`return 1;`
			`}`
			`return len;`
			`}`

Fixed issue 1133 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1080 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2014-04-25 05:18:00 +08:00			`bool UNICHAR::const_iterator::is_legal() const {`
			`return utf8_step(it_) > 0;`
			`}`

Better utf8/32 conversion git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@941 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2014-01-10 01:27:17 +08:00			`UNICHAR::const_iterator UNICHAR::begin(const char* utf8_str, const int len) {`
			`return UNICHAR::const_iterator(utf8_str);`
			`}`

			`UNICHAR::const_iterator UNICHAR::end(const char* utf8_str, const int len) {`
			`return UNICHAR::const_iterator(utf8_str + len);`
			`}`
Started TFile conversion to remove fmemopen git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1139 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2014-08-12 07:09:25 +08:00
			`// Converts a utf-8 string to a vector of unicodes.`
Fixes from pull of cleanups: clang tidied, reviewed, fixed new bugs, undeleted needed code. Probably breaks the build, due to some inclusion of changes in utf8/32 conversion 2017-07-15 00:30:14 +08:00			`// Returns an empty vector if the input contains invalid UTF-8.`
			`/* static */`
			`std::vector<char32> UNICHAR::UTF8ToUTF32(const char* utf8_str) {`
Started TFile conversion to remove fmemopen git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1139 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2014-08-12 07:09:25 +08:00			`const int utf8_length = strlen(utf8_str);`
Fixes from pull of cleanups: clang tidied, reviewed, fixed new bugs, undeleted needed code. Probably breaks the build, due to some inclusion of changes in utf8/32 conversion 2017-07-15 00:30:14 +08:00			`std::vector<char32> unicodes;`
			`unicodes.reserve(utf8_length);`
Started TFile conversion to remove fmemopen git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1139 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2014-08-12 07:09:25 +08:00			`const_iterator end_it(end(utf8_str, utf8_length));`
			`for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) {`
Improved newlines and spaces in a box file so it works better with RTL languages. 2015-05-13 08:51:03 +08:00			`if (it.is_legal()) {`
Fixes from pull of cleanups: clang tidied, reviewed, fixed new bugs, undeleted needed code. Probably breaks the build, due to some inclusion of changes in utf8/32 conversion 2017-07-15 00:30:14 +08:00			`unicodes.push_back(*it);`
Improved newlines and spaces in a box file so it works better with RTL languages. 2015-05-13 08:51:03 +08:00			`} else {`
Fixes from pull of cleanups: clang tidied, reviewed, fixed new bugs, undeleted needed code. Probably breaks the build, due to some inclusion of changes in utf8/32 conversion 2017-07-15 00:30:14 +08:00			`unicodes.clear();`
			`return unicodes;`
Improved newlines and spaces in a box file so it works better with RTL languages. 2015-05-13 08:51:03 +08:00			`}`
Started TFile conversion to remove fmemopen git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1139 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2014-08-12 07:09:25 +08:00			`}`
Fixes from pull of cleanups: clang tidied, reviewed, fixed new bugs, undeleted needed code. Probably breaks the build, due to some inclusion of changes in utf8/32 conversion 2017-07-15 00:30:14 +08:00			`return unicodes;`
Started TFile conversion to remove fmemopen git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1139 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2014-08-12 07:09:25 +08:00			`}`

Added script-specific validation and normalization for virama-using scripts and updated normalization for others 2017-07-15 01:05:05 +08:00			`// Returns an empty string if the input contains an invalid unicode.`
Remove old code for string class (no longer needed) (#1354) * Remove old code for string class (no longer needed) Signed-off-by: Stefan Weil <sw@weilnetz.de> * Add std namespace to string class Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-03-03 21:36:28 +08:00			`std::string UNICHAR::UTF32ToUTF8(const std::vector<char32>& str32) {`
			`std::string utf8_str;`
Added script-specific validation and normalization for virama-using scripts and updated normalization for others 2017-07-15 01:05:05 +08:00			`for (char32 ch : str32) {`
			`UNICHAR uni_ch(ch);`
			`int step;`
			`if (uni_ch.utf8_len() > 0 && (step = utf8_step(uni_ch.utf8())) > 0) {`
			`utf8_str.append(uni_ch.utf8(), step);`
			`} else {`
			`return "";`
			`}`
			`}`
			`return utf8_str;`
			`}`

			`} // namespace tesseract`