tesseract/ccutil/unicharmap.cpp
theraysmith@gmail.com d71045fa3a Fixed issue 736
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@742 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2012-09-21 15:19:44 +00:00

173 lines
5.8 KiB
C++

///////////////////////////////////////////////////////////////////////
// File: unicharmap.cpp
// Description: Unicode character/ligature to integer id class.
// Author: Thomas Kielbus
// Created: Wed Jun 28 17:05:01 PDT 2006
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
#include <assert.h>
#include "unichar.h"
#include "host.h"
#include "unicharmap.h"
// Builds an empty map: the root node array pointer starts out null.
UNICHARMAP::UNICHARMAP()
    : nodes(0) {}
// Releases the root node array. Destroying each node in turn frees that
// node's children array, so the whole tree is torn down.
UNICHARMAP::~UNICHARMAP() {
  // delete[] on a null pointer is a no-op, so no explicit check is needed.
  delete[] nodes;
}
// Looks up the id stored for the null-terminated string unichar_repr.
// Each byte of the string, taken as an unsigned char, selects one entry in
// the node array of the current tree level; the id is read from the entry
// selected by the last byte.
// Returns -1 (the id a node holds until one is assigned to it) when the
// string is empty or when the tree has no node array somewhere along the
// path, rather than dereferencing a null node array.
UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr) const {
  const UNICHAR_ID kNotFound = -1;

  assert(*unichar_repr != '\0');
  // Still reject the empty string when assert() is compiled out; otherwise
  // the loop below would look one byte past the terminator.
  if (*unichar_repr == '\0') return kNotFound;

  const char* current_char = unichar_repr;
  const UNICHARMAP_NODE* current_nodes = nodes;
  // A null node array means nothing was ever inserted below this prefix.
  while (current_nodes != 0) {
    const UNICHARMAP_NODE& node =
        current_nodes[static_cast<unsigned char>(*current_char)];
    if (*(current_char + 1) == '\0')
      return node.id;
    current_nodes = node.children;
    ++current_char;
  }
  return kNotFound;
}
// Looks up the id stored for unichar_repr, using at most length bytes of it
// (fewer if the null terminator comes first). Each byte, taken as an
// unsigned char, selects one entry in the node array of the current tree
// level; the id is read from the entry selected by the last byte used.
// length is asserted to be in the range [1, UNICHAR_LEN].
// Returns -1 (the id a node holds until one is assigned to it) when the
// string is empty or when the tree has no node array somewhere along the
// path, rather than dereferencing a null node array.
UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
                                     int length) const {
  const UNICHAR_ID kNotFound = -1;

  assert(*unichar_repr != '\0');
  assert(length > 0 && length <= UNICHAR_LEN);
  // Still reject the empty string when assert() is compiled out; otherwise
  // the loop below could look one byte past the terminator.
  if (*unichar_repr == '\0') return kNotFound;

  const char* current_char = unichar_repr;
  const UNICHARMAP_NODE* current_nodes = nodes;
  // A null node array means nothing was ever inserted below this prefix.
  while (current_nodes != 0) {
    const UNICHARMAP_NODE& node =
        current_nodes[static_cast<unsigned char>(*current_char)];
    // Stop at the length limit or at the end of the string, whichever is
    // reached first. The next byte is not read once the limit is hit.
    if (length == 1 || *(current_char + 1) == '\0')
      return node.id;
    current_nodes = node.children;
    ++current_char;
    --length;
  }
  return kNotFound;
}
// Search the given unichar representation in the tree, creating the possibly
// missing nodes. Once the right place has been found, insert the given id and
// update the inserted flag to keep track of the insert. Each character in the
// string is interpreted as an index in an array of nodes.
void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
const char* current_char = unichar_repr;
UNICHARMAP_NODE** current_nodes_pointer = &nodes;
assert(*unichar_repr != '\0');
assert(id >= 0);
do {
if (*current_nodes_pointer == 0)
*current_nodes_pointer = new UNICHARMAP_NODE[256];
if (*(current_char + 1) == '\0') {
(*current_nodes_pointer)
[static_cast<unsigned char>(*current_char)].id = id;
return;
}
current_nodes_pointer =
&((*current_nodes_pointer)
[static_cast<unsigned char>(*current_char)].children);
++current_char;
} while (true);
}
// Search the given unichar representation in the tree. Each character in the
// string is interpreted as an index in an array of nodes. Stop once the tree
// does not have anymore nodes or once we found the right unichar_repr.
bool UNICHARMAP::contains(const char* const unichar_repr) const {
if (unichar_repr == NULL || *unichar_repr == '\0') return false;
const char* current_char = unichar_repr;
UNICHARMAP_NODE* current_nodes = nodes;
while (current_nodes != 0 && *(current_char + 1) != '\0') {
current_nodes =
current_nodes[static_cast<unsigned char>(*current_char)].children;
++current_char;
}
return current_nodes != 0 && *(current_char + 1) == '\0' &&
current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
}
// Search the given unichar representation in the tree, using length characters
// from it maximum. Each character in the string is interpreted as an index in
// an array of nodes. Stop once the tree does not have anymore nodes or once we
// found the right unichar_repr.
bool UNICHARMAP::contains(const char* const unichar_repr,
int length) const {
if (unichar_repr == NULL || *unichar_repr == '\0') return false;
if (length <= 0 || length > UNICHAR_LEN) return false;
const char* current_char = unichar_repr;
UNICHARMAP_NODE* current_nodes = nodes;
while (current_nodes != 0 && (length > 1 && *(current_char + 1) != '\0')) {
current_nodes =
current_nodes[static_cast<unsigned char>(*current_char)].children;
--length;
++current_char;
}
return current_nodes != 0 && (length == 1 || *(current_char + 1) == '\0') &&
current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
}
// Return the minimum number of characters that must be used from this string
// to obtain a match in the UNICHARMAP.
int UNICHARMAP::minmatch(const char* const unichar_repr) const {
const char* current_char = unichar_repr;
UNICHARMAP_NODE* current_nodes = nodes;
while (current_nodes != NULL && *current_char != '\0') {
if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0)
return current_char + 1 - unichar_repr;
current_nodes =
current_nodes[static_cast<unsigned char>(*current_char)].children;
++current_char;
}
return 0;
}
// Frees the whole tree and returns the map to its empty state.
void UNICHARMAP::clear() {
  // delete[] on a null pointer is a no-op, so an already-empty map is fine.
  delete[] nodes;
  nodes = 0;
}
// A fresh node has no children array and an id of -1, i.e. no id assigned.
UNICHARMAP::UNICHARMAP_NODE::UNICHARMAP_NODE()
    : children(0), id(-1) {}
// Frees this node's children array. Destroying each child runs this same
// destructor, so the entire subtree is released recursively.
UNICHARMAP::UNICHARMAP_NODE::~UNICHARMAP_NODE() {
  // delete[] on a null pointer is a no-op, so leaf nodes need no check.
  delete[] children;
}