mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-04 07:47:48 +08:00
570af48b8b
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@87 d0cd1f9f-072b-0410-8dd7-cf729c803f20
167 lines
4.6 KiB
C++
167 lines
4.6 KiB
C++
|
|
///////////////////////////////////////////////////////////////////////
|
|
// File: unicharset.cpp
|
|
// Description: Unicode character/ligature set class.
|
|
// Author: Thomas Kielbus
|
|
// Created: Wed Jun 28 17:05:01 PDT 2006
|
|
//
|
|
// (C) Copyright 2006, Google Inc.
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
///////////////////////////////////////////////////////////////////////
|
|
|
|
#include <assert.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
|
|
#include "unichar.h"
|
|
#include "unicharset.h"
|
|
|
|
// Bit flags used to pack the boolean character properties of a unichar into
// the single hexadecimal field that save_to_file() writes and
// load_from_file() parses.
static const int ISALPHA_MASK = 1 << 0;
static const int ISLOWER_MASK = 1 << 1;
static const int ISUPPER_MASK = 1 << 2;
static const int ISDIGIT_MASK = 1 << 3;
|
|
|
|
// Builds an empty set: no slot array is allocated yet, the id lookup
// structure is default-constructed, and both the used and the reserved slot
// counts start at zero.
UNICHARSET::UNICHARSET()
    : unichars(NULL), ids(), size_used(0), size_reserved(0) {}
|
|
|
|
// Frees the slot array, but only when some capacity was actually reserved.
UNICHARSET::~UNICHARSET() {
  if (size_reserved <= 0)
    return;

  delete[] unichars;
}
|
|
|
|
void UNICHARSET::reserve(int unichars_number) {
|
|
if (unichars_number > size_reserved)
|
|
{
|
|
UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
|
|
for (int i = 0; i < size_used; ++i)
|
|
memcpy(&unichars_new[i], &unichars[i], sizeof (UNICHAR_SLOT));
|
|
delete[] unichars;
|
|
unichars = unichars_new;
|
|
size_reserved = unichars_number;
|
|
}
|
|
}
|
|
|
|
// Looks up the id registered for the null-terminated representation
// unichar_repr. The representation must already be part of the set; this is
// checked with an assertion.
const UNICHAR_ID UNICHARSET::unichar_to_id(
    const char* const unichar_repr) const {
  assert(ids.contains(unichar_repr));

  const UNICHAR_ID found = ids.unichar_to_id(unichar_repr);
  return found;
}
|
|
|
|
// Looks up the id registered for the first `length` bytes of unichar_repr.
// `length` must lie in [1, UNICHAR_LEN] and that prefix must already be part
// of the set; both conditions are checked with assertions.
const UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
                                           int length) const {
  assert(length > 0 && length <= UNICHAR_LEN);
  assert(ids.contains(unichar_repr, length));

  const UNICHAR_ID found = ids.unichar_to_id(unichar_repr, length);
  return found;
}
|
|
|
|
// Returns the representation stored in the slot for `id`. The id must be
// below the current size of the set (asserted). The returned pointer refers
// to storage inside the slot array.
const char* const UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
  assert(id < this->size());

  const UNICHAR_SLOT& slot = unichars[id];
  return slot.representation;
}
|
|
|
|
void UNICHARSET::unichar_insert(const char* const unichar_repr) {
|
|
if (!ids.contains(unichar_repr)) {
|
|
if (size_used == size_reserved)
|
|
{
|
|
if (size_used == 0)
|
|
reserve(8);
|
|
else
|
|
reserve(2 * size_used);
|
|
}
|
|
|
|
strcpy(unichars[size_used].representation, unichar_repr);
|
|
ids.insert(unichar_repr, size_used);
|
|
++size_used;
|
|
}
|
|
}
|
|
|
|
bool UNICHARSET::contains_unichar(const char* const unichar_repr) {
|
|
return ids.contains(unichar_repr);
|
|
}
|
|
|
|
// Returns true when the representation stored for unichar_id compares equal
// (strcmp) to the null-terminated string unichar_repr.
bool UNICHARSET::eq(UNICHAR_ID unichar_id, const char* const unichar_repr) {
  const char* const stored = this->id_to_unichar(unichar_id);
  return strcmp(stored, unichar_repr) == 0;
}
|
|
|
|
bool UNICHARSET::save_to_file(const char* filename) const {
|
|
FILE* file = fopen(filename, "w+");
|
|
|
|
if (file == NULL)
|
|
return false;
|
|
|
|
fprintf(file, "%d\n", this->size());
|
|
for (UNICHAR_ID id = 0; id < this->size(); ++id) {
|
|
unsigned int properties = 0;
|
|
|
|
if (this->get_isalpha(id))
|
|
properties |= ISALPHA_MASK;
|
|
if (this->get_islower(id))
|
|
properties |= ISLOWER_MASK;
|
|
if (this->get_isupper(id))
|
|
properties |= ISUPPER_MASK;
|
|
if (this->get_isdigit(id))
|
|
properties |= ISDIGIT_MASK;
|
|
|
|
if (strcmp(this->id_to_unichar(id), " ") == 0)
|
|
fprintf(file, "%s %x\n", "NULL", properties);
|
|
else
|
|
fprintf(file, "%s %x\n", this->id_to_unichar(id), properties);
|
|
}
|
|
fclose(file);
|
|
return true;
|
|
}
|
|
|
|
bool UNICHARSET::load_from_file(const char* filename) {
|
|
FILE* file = fopen(filename, "r");
|
|
int unicharset_size;
|
|
char buffer[256];
|
|
|
|
if (file == NULL)
|
|
return false;
|
|
|
|
this->clear();
|
|
if (fgets(buffer, sizeof (buffer), file) == NULL ||
|
|
sscanf(buffer, "%d", &unicharset_size) != 1)
|
|
{
|
|
fclose(file);
|
|
return false;
|
|
}
|
|
this->reserve(unicharset_size);
|
|
for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {
|
|
char unichar[256];
|
|
unsigned int properties;
|
|
|
|
if (fgets(buffer, sizeof (buffer), file) == NULL ||
|
|
sscanf(buffer, "%s %x", unichar, &properties) != 2)
|
|
{
|
|
fclose(file);
|
|
return false;
|
|
}
|
|
if (strcmp(unichar, "NULL") == 0)
|
|
this->unichar_insert(" ");
|
|
else
|
|
this->unichar_insert(unichar);
|
|
|
|
this->set_isalpha(id, properties & ISALPHA_MASK);
|
|
this->set_islower(id, properties & ISLOWER_MASK);
|
|
this->set_isupper(id, properties & ISUPPER_MASK);
|
|
this->set_isdigit(id, properties & ISDIGIT_MASK);
|
|
}
|
|
fclose(file);
|
|
return true;
|
|
}
|