tesseract/ccutil/unicharset.cpp


///////////////////////////////////////////////////////////////////////
// File:        unicharset.cpp
// Description: Unicode character/ligature set class.
// Author:      Thomas Kielbus
// Created:     Wed Jun 28 17:05:01 PDT 2006
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#include <assert.h>
#include <stdio.h>
#include <string.h>

#include "tprintf.h"
#include "unichar.h"
#include "unicharset.h"
#include "varable.h"

static const int ISALPHA_MASK = 0x1;
static const int ISLOWER_MASK = 0x2;
static const int ISUPPER_MASK = 0x4;
static const int ISDIGIT_MASK = 0x8;
static const int ISPUNCTUATION_MASK = 0x10;

UNICHARSET::UNICHARSET() :
    unichars(NULL),
    ids(),
    size_used(0),
    size_reserved(0),
    script_table(0),
    script_table_size_used(0),
    script_table_size_reserved(0),
    null_script("NULL"),
    null_sid_(0),
    common_sid_(0),
    latin_sid_(0),
    cyrillic_sid_(0),
    greek_sid_(0),
    han_sid_(0) {}

UNICHARSET::~UNICHARSET() {
  if (size_reserved > 0) {
    for (int i = 0; i < script_table_size_used; ++i)
      delete[] script_table[i];
    delete[] script_table;
    delete_pointers_in_unichars();
    delete[] unichars;
  }
}

void UNICHARSET::reserve(int unichars_number) {
  if (unichars_number > size_reserved) {
    UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
    for (int i = 0; i < size_used; ++i)
      memcpy(&unichars_new[i], &unichars[i], sizeof(UNICHAR_SLOT));
    for (int j = size_used; j < unichars_number; ++j) {
      unichars_new[j].properties.script_id = add_script(null_script);
      unichars_new[j].properties.fragment = NULL;
    }
    delete[] unichars;
    unichars = unichars_new;
    size_reserved = unichars_number;
  }
}

const UNICHAR_ID
UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
  return ids.contains(unichar_repr) ?
    ids.unichar_to_id(unichar_repr) : INVALID_UNICHAR_ID;
}

const UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
                                           int length) const {
  assert(length > 0 && length <= UNICHAR_LEN);
  return ids.contains(unichar_repr, length) ?
    ids.unichar_to_id(unichar_repr, length) : INVALID_UNICHAR_ID;
}

// Return the minimum number of bytes that matches a legal UNICHAR_ID,
// while leaving a legal UNICHAR_ID afterwards. In other words, if there
// is both a short and a long match to the string, return the length that
// ensures there is a legal match after it.
int UNICHARSET::step(const char* str) const {
  // Find the length of the first matching unicharset member.
  int minlength = ids.minmatch(str);
  if (minlength == 0)
    return 0;  // Empty string or illegal char.

  int goodlength = minlength;
  while (goodlength <= UNICHAR_LEN) {
    if (str[goodlength] == '\0' || ids.minmatch(str + goodlength) > 0)
      return goodlength;  // This length works!
    // The next char is illegal so find the next usable length.
    do {
      ++goodlength;
    } while (str[goodlength] != '\0' && goodlength <= UNICHAR_LEN &&
             !ids.contains(str, goodlength));
  }
  // Search to find a subsequent legal char failed so return the minlength.
  return minlength;
}

const char* const UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
  if (id == INVALID_UNICHAR_ID) {
    return INVALID_UNICHAR;
  }
  assert(id < this->size());
  return unichars[id].representation;
}

// Return a STRING that reformats the utf8 str into the str followed
// by its hex unicodes.
STRING UNICHARSET::debug_utf8_str(const char* str) {
  STRING result = str;
  result += " [";
  int step = 1;
  // Chop into unicodes and code each as hex.
  for (int i = 0; str[i] != '\0'; i += step) {
    char hex[sizeof(int) * 2 + 1];
    step = UNICHAR::utf8_step(str + i);
    if (step == 0) {
      step = 1;
      sprintf(hex, "%x", str[i]);
    } else {
      UNICHAR ch(str + i, step);
      sprintf(hex, "%x", ch.first_uni());
    }
    result += hex;
    result += " ";
  }
  result += "]";
  return result;
}

// Return a STRING containing debug information on the unichar, including
// the id_to_unichar, its hex unicodes and the properties.
STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
  const CHAR_FRAGMENT *fragment = this->get_fragment(id);
  if (fragment) {
    STRING base = debug_str(fragment->get_unichar());
    return CHAR_FRAGMENT::to_string(base.string(), fragment->get_pos(),
                                    fragment->get_total());
  }
  const char* str = id_to_unichar(id);
  if (id == INVALID_UNICHAR_ID) return STRING(str);
  STRING result = debug_utf8_str(str);
  // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
  if (get_isalpha(id)) {
    if (get_islower(id))
      result += "a";
    else if (get_isupper(id))
      result += "A";
    else
      result += "x";
  }
  // Append 0 if a digit.
  if (get_isdigit(id)) {
    result += "0";
  }
  // Append p is a punctuation symbol.
  if (get_ispunctuation(id)) {
    result += "p";
  }
  return result;
}


void UNICHARSET::unichar_insert(const char* const unichar_repr) {
  if (!ids.contains(unichar_repr)) {
    if (strlen(unichar_repr) > UNICHAR_LEN) {
      fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
              int(strlen(unichar_repr)), unichar_repr);
      return;
    }
    if (size_used == size_reserved) {
      if (size_used == 0)
        reserve(8);
      else
        reserve(2 * size_used);
    }

    strcpy(unichars[size_used].representation, unichar_repr);
    this->set_isalpha(size_used, false);
    this->set_islower(size_used, false);
    this->set_isupper(size_used, false);
    this->set_isdigit(size_used, false);
    this->set_ispunctuation(size_used, false);
    this->set_isngram(size_used, false);
    this->set_script(size_used, null_script);
    // If the given unichar_repr represents a fragmented character, set
    // fragment property to a pointer to CHAR_FRAGMENT class instance with
    // information parsed from the unichar representation. Use the script
    // of the base unichar for the fragmented character if possible.
    CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar_repr);
    this->unichars[size_used].properties.fragment = frag;
    if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
      this->unichars[size_used].properties.script_id =
        this->get_script(frag->get_unichar());
    }
    this->unichars[size_used].properties.enabled = true;
    ids.insert(unichar_repr, size_used);
    ++size_used;
  }
}

bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
  return ids.contains(unichar_repr);
}

bool UNICHARSET::contains_unichar(const char* const unichar_repr,
                                  int length) const {
  if (length == 0) {
    return false;
  }
  return ids.contains(unichar_repr, length);
}

bool UNICHARSET::eq(UNICHAR_ID unichar_id,
                    const char* const unichar_repr) const {
  return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
}

bool UNICHARSET::save_to_file(FILE *file) const {
  fprintf(file, "%d\n", this->size());
  for (UNICHAR_ID id = 0; id < this->size(); ++id) {
    unsigned int properties = 0;

    if (this->get_isalpha(id))
      properties |= ISALPHA_MASK;
    if (this->get_islower(id))
      properties |= ISLOWER_MASK;
    if (this->get_isupper(id))
      properties |= ISUPPER_MASK;
    if (this->get_isdigit(id))
      properties |= ISDIGIT_MASK;
    if (this->get_ispunctuation(id))
      properties |= ISPUNCTUATION_MASK;

    if (strcmp(this->id_to_unichar(id), " ") == 0)
      fprintf(file, "%s %x %s %d\n", "NULL", properties,
              this->get_script_from_script_id(this->get_script(id)),
              this->get_other_case(id));
    else
      fprintf(file, "%s %x %s %d\n", this->id_to_unichar(id), properties,
              this->get_script_from_script_id(this->get_script(id)),
              this->get_other_case(id));
  }
  return true;
}

bool UNICHARSET::load_from_file(FILE *file) {
  int unicharset_size;
  char buffer[256];

  this->clear();
  if (fgets(buffer, sizeof (buffer), file) == NULL ||
      sscanf(buffer, "%d", &unicharset_size) != 1) {
    return false;
  }
  this->reserve(unicharset_size);
  for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {
    char unichar[256];
    unsigned int properties;
    char script[64];

    strcpy(script, null_script);
    this->unichars[id].properties.other_case = id;
    if (fgets(buffer, sizeof (buffer), file) == NULL ||
        (sscanf(buffer, "%s %x %63s %d", unichar, &properties,
                script, &(this->unichars[id].properties.other_case)) != 4 &&
         sscanf(buffer, "%s %x %63s", unichar, &properties, script) != 3 &&
         sscanf(buffer, "%s %x", unichar, &properties) != 2)) {
      return false;
    }
    if (strcmp(unichar, "NULL") == 0)
      this->unichar_insert(" ");
    else
      this->unichar_insert(unichar);

    this->set_isalpha(id, (properties & ISALPHA_MASK) != 0);
    this->set_islower(id, (properties & ISLOWER_MASK) != 0);
    this->set_isupper(id, (properties & ISUPPER_MASK) != 0);
    this->set_isdigit(id, (properties & ISDIGIT_MASK) != 0);
    this->set_ispunctuation(id, (properties & ISPUNCTUATION_MASK) != 0);
    this->set_isngram(id, false);
    this->set_script(id, script);
    this->unichars[id].properties.enabled = true;
  }

  null_sid_ = get_script_id_from_name(null_script);
  ASSERT_HOST(null_sid_ == 0);
  common_sid_ = get_script_id_from_name("Common");
  latin_sid_ = get_script_id_from_name("Latin");
  cyrillic_sid_ = get_script_id_from_name("Cyrillic");
  greek_sid_ = get_script_id_from_name("Greek");
  han_sid_ = get_script_id_from_name("Han");
  return true;
}

// Set a whitelist and/or blacklist of characters to recognize.
// An empty or NULL whitelist enables everything (minus any blacklist).
// An empty or NULL blacklist disables nothing.
void UNICHARSET::set_black_and_whitelist(const char* blacklist,
                                         const char* whitelist) {
  bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
  // Set everything to default
  for (int ch = 0; ch < size_used; ++ch)
    unichars[ch].properties.enabled = def_enabled;
  int ch_step;
  if (!def_enabled) {
    // Enable the whitelist.
    for (int w_ind = 0; whitelist[w_ind] != '\0'; w_ind += ch_step) {
      ch_step = step(whitelist + w_ind);
      if (ch_step > 0) {
        UNICHAR_ID u_id = unichar_to_id(whitelist + w_ind, ch_step);
        unichars[u_id].properties.enabled = true;
      } else {
        ch_step = 1;
      }
    }
  }
  if (blacklist != NULL && blacklist[0] != '\0') {
    // Disable the blacklist.
    for (int b_ind = 0; blacklist[b_ind] != '\0'; b_ind += ch_step) {
      ch_step = step(blacklist + b_ind);
      if (ch_step > 0) {
        UNICHAR_ID u_id = unichar_to_id(blacklist + b_ind, ch_step);
        unichars[u_id].properties.enabled = false;
      } else {
        ch_step = 1;
      }
    }
  }
}

int UNICHARSET::add_script(const char* script) {
  for (int i = 0; i < script_table_size_used; ++i) {
    if (strcmp(script, script_table[i]) == 0)
      return i;
  }
  if (script_table_size_reserved == 0) {
    script_table_size_reserved = 8;
    script_table = new char*[script_table_size_reserved];
  }
  if (script_table_size_used + 1 >= script_table_size_reserved) {
    char** new_script_table = new char*[script_table_size_reserved * 2];
    memcpy(new_script_table, script_table, script_table_size_reserved * sizeof(char*));
    delete[] script_table;
    script_table = new_script_table;
      script_table_size_reserved = 2 * script_table_size_reserved;
  }
  script_table[script_table_size_used] = new char[strlen(script) + 1];
  strcpy(script_table[script_table_size_used], script);
  return script_table_size_used++;
}

CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {
  const char *ptr = string;
  int len = strlen(string);
  if (len < kMinLen || *ptr != kSeparator) {
    return NULL;  // this string can not represent a fragment
  }
  ptr++;  // move to the next character
  int step = 0;
  while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
    step += UNICHAR::utf8_step(ptr + step);
  }
  if (step == 0 || step > UNICHAR_LEN) {
    return NULL;  // no character for unichar or the character is too long
  }
  char unichar[UNICHAR_LEN + 1];
  strncpy(unichar, ptr, step);
  unichar[step] = '\0';  // null terminate unichar
  ptr += step;  // move to the next fragment separator
  int pos = 0;
  int total = 0;
  char *end_ptr = NULL;
  for (int i = 0; i < 2; i++) {
    if (ptr > string + len || *ptr != kSeparator) {
      return NULL;  // failed to parse fragment representation
    }
    ptr++;  // move to the next character
    i == 0 ? pos = static_cast<int>(strtol(ptr, &end_ptr, 10))
      : total = static_cast<int>(strtol(ptr, &end_ptr, 10));
    ptr = end_ptr;
  }
  if (ptr != string + len) {
    return NULL;  // malformed fragment representation
  }
  CHAR_FRAGMENT *fragment = new CHAR_FRAGMENT();
  fragment->set_all(unichar, pos, total);
  return fragment;
}

int UNICHARSET::get_script_id_from_name(const char* script_name) const {
  for (int i = 0; i < script_table_size_used; ++i) {
    if (strcmp(script_name, script_table[i]) == 0)
      return i;
  }
  return 0;  // 0 is always the null_script
}