/********************************************************************** * File: char_samp_enum.cpp * Description: Implementation of a Character Set Class * Author: Ahmad Abdulkader * Created: 2007 * * (C) Copyright 2008, Google Inc. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at ** http://www.apache.org/licenses/LICENSE-2.0 ** Unless required by applicable law or agreed to in writing, software ** distributed under the License is distributed on an "AS IS" BASIS, ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** See the License for the specific language governing permissions and ** limitations under the License. * **********************************************************************/ #include <string> #include "char_set.h" #include "cube_utils.h" #include "tessdatamanager.h" namespace tesseract { CharSet::CharSet() { class_cnt_ = 0; class_strings_ = NULL; unicharset_map_ = NULL; init_ = false; // init hash table memset(hash_bin_size_, 0, sizeof(hash_bin_size_)); } CharSet::~CharSet() { if (class_strings_ != NULL) { for (int cls = 0; cls < class_cnt_; cls++) { if (class_strings_[cls] != NULL) { delete class_strings_[cls]; } } delete []class_strings_; class_strings_ = NULL; } delete []unicharset_map_; } // Creates CharSet object by reading the unicharset from the // TessDatamanager, and mapping Cube's unicharset to Tesseract's if // they differ. CharSet *CharSet::Create(TessdataManager *tessdata_manager, UNICHARSET *tess_unicharset) { CharSet *char_set = new CharSet(); if (char_set == NULL) { return NULL; } // First look for Cube's unicharset; if not there, use tesseract's bool cube_unicharset_exists; if (!(cube_unicharset_exists = tessdata_manager->SeekToStart(TESSDATA_CUBE_UNICHARSET)) && !tessdata_manager->SeekToStart(TESSDATA_UNICHARSET)) { fprintf(stderr, "Cube ERROR (CharSet::Create): could not find " "either cube or tesseract unicharset\n"); return NULL; } FILE *charset_fp = tessdata_manager->GetDataFilePtr(); if (!charset_fp) { fprintf(stderr, "Cube ERROR (CharSet::Create): could not load " "a unicharset\n"); return NULL; } // If we found a cube unicharset separate from tesseract's, load it and // map its unichars to tesseract's; if only one unicharset exists, // just load it. bool loaded; if (cube_unicharset_exists) { char_set->cube_unicharset_.load_from_file(charset_fp); loaded = tessdata_manager->SeekToStart(TESSDATA_CUBE_UNICHARSET); loaded = loaded && char_set->LoadSupportedCharList( tessdata_manager->GetDataFilePtr(), tess_unicharset); char_set->unicharset_ = &char_set->cube_unicharset_; } else { loaded = char_set->LoadSupportedCharList(charset_fp, NULL); char_set->unicharset_ = tess_unicharset; } if (!loaded) { delete char_set; return NULL; } char_set->init_ = true; return char_set; } // Load the list of supported chars from the given data file pointer. bool CharSet::LoadSupportedCharList(FILE *fp, UNICHARSET *tess_unicharset) { if (init_) return true; char str_line[256]; // init hash table memset(hash_bin_size_, 0, sizeof(hash_bin_size_)); // read the char count if (fgets(str_line, sizeof(str_line), fp) == NULL) { fprintf(stderr, "Cube ERROR (CharSet::InitMemory): could not " "read char count.\n"); return false; } class_cnt_ = atoi(str_line); if (class_cnt_ < 2) { fprintf(stderr, "Cube ERROR (CharSet::InitMemory): invalid " "class count: %d\n", class_cnt_); return false; } // memory for class strings class_strings_ = new string_32*[class_cnt_]; if (class_strings_ == NULL) { fprintf(stderr, "Cube ERROR (CharSet::InitMemory): could not " "allocate memory for class strings.\n"); return false; } // memory for unicharset map if (tess_unicharset) { unicharset_map_ = new int[class_cnt_]; if (unicharset_map_ == NULL) { fprintf(stderr, "Cube ERROR (CharSet::InitMemory): could not " "allocate memory for unicharset map.\n"); return false; } } // Read in character strings and add to hash table for (int class_id = 0; class_id < class_cnt_; class_id++) { // Read the class string if (fgets(str_line, sizeof(str_line), fp) == NULL) { fprintf(stderr, "Cube ERROR (CharSet::ReadAndHashStrings): " "could not read class string with class_id=%d.\n", class_id); return false; } // Terminate at space if any char *p = strchr(str_line, ' '); if (p != NULL) *p = '\0'; // Convert to UTF32 and store string_32 str32; // Convert NULL to a space if (strcmp(str_line, "NULL") == 0) { strcpy(str_line, " "); } CubeUtils::UTF8ToUTF32(str_line, &str32); class_strings_[class_id] = new string_32(str32); if (class_strings_[class_id] == NULL) { fprintf(stderr, "Cube ERROR (CharSet::ReadAndHashStrings): could not " "allocate memory for class string with class_id=%d.\n", class_id); return false; } // Add to hash-table int hash_val = Hash(reinterpret_cast<const char_32 *>(str32.c_str())); if (hash_bin_size_[hash_val] >= kMaxHashSize) { fprintf(stderr, "Cube ERROR (CharSet::LoadSupportedCharList): hash " "table is full.\n"); return false; } hash_bins_[hash_val][hash_bin_size_[hash_val]++] = class_id; if (tess_unicharset != NULL) { // Add class id to unicharset map UNICHAR_ID tess_id = tess_unicharset->unichar_to_id(str_line); if (tess_id == INVALID_UNICHAR_ID) { tess_unicharset->unichar_insert(str_line); tess_id = tess_unicharset->unichar_to_id(str_line); } ASSERT_HOST(tess_id != INVALID_UNICHAR_ID); unicharset_map_[class_id] = tess_id; } } return true; } } // tesseract