/////////////////////////////////////////////////////////////////////// // File: tessdatamanager.cpp // Description: Functions to handle loading/combining tesseract data files. // Author: Daria Antonova // Created: Wed Jun 03 11:26:43 PST 2009 // // (C) Copyright 2009, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // /////////////////////////////////////////////////////////////////////// #ifdef _MSC_VER #pragma warning(disable:4244) // Conversion warnings #endif #include "tessdatamanager.h" #include #include "serialis.h" #include "strngs.h" #include "tprintf.h" #include "varable.h" BOOL_VAR(global_load_system_dawg, true, "Load system word dawg."); BOOL_VAR(global_load_freq_dawg, true, "Load frequent word dawg."); BOOL_VAR(global_load_punc_dawg, true, "Load dawg with punctuation patterns."); BOOL_VAR(global_load_number_dawg, true, "Load dawg with number patterns."); INT_VAR(global_tessdata_manager_debug_level, 0, "Debug level for TessdataManager functions."); namespace tesseract { void TessdataManager::Init(const char *data_file_name) { int i; data_file_ = fopen(data_file_name, "rb"); if (data_file_ == NULL) { tprintf("Error openning data file %s\n", data_file_name); exit(1); } fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_); bool swap = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries); if (swap) { actual_tessdata_num_entries_ = reverse32(actual_tessdata_num_entries_); } ASSERT_HOST(actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES); fread(offset_table_, sizeof(inT64), actual_tessdata_num_entries_, data_file_); if (swap) { for (i = 0 ; i < actual_tessdata_num_entries_; ++i) { offset_table_[i] = reverse64(offset_table_[i]); } } if (global_tessdata_manager_debug_level) { tprintf("TessdataManager loaded %d types of tesseract data files.\n", actual_tessdata_num_entries_); for (i = 0; i < actual_tessdata_num_entries_; ++i) { tprintf("Offset for type %d is %lld\n", i, offset_table_[i]); } } } void TessdataManager::CopyFile(FILE *input_file, FILE *output_file, bool newline_end, inT64 num_bytes_to_copy) { if (num_bytes_to_copy == 0) return; int buffer_size = 1024; if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) { buffer_size = num_bytes_to_copy; } inT64 num_bytes_copied = 0; char *chunk = new char[buffer_size]; int bytes_read; char last_char = 0x0; while ((bytes_read = fread(chunk, sizeof(char), buffer_size, input_file))) { fwrite(chunk, sizeof(char), bytes_read, output_file); last_char = chunk[bytes_read-1]; if (num_bytes_to_copy > 0) { num_bytes_copied += bytes_read; if (num_bytes_copied == num_bytes_to_copy) break; if (num_bytes_copied + buffer_size > num_bytes_to_copy) { buffer_size = num_bytes_to_copy - num_bytes_copied; } } } if (newline_end) ASSERT_HOST(last_char == '\n'); delete[] chunk; } void TessdataManager::WriteMetadata(inT64 *offset_table, FILE *output_file) { fseek(output_file, 0, SEEK_SET); inT32 num_entries = TESSDATA_NUM_ENTRIES; fwrite(&num_entries, sizeof(inT32), 1, output_file); fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES, output_file); fclose(output_file); tprintf("TessdataManager combined tesseract data files.\n"); for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { tprintf("Offset for type %d is %lld\n", i, offset_table[i]); } } bool TessdataManager::CombineDataFiles( const char *language_data_path_prefix, const char *output_filename) { int i; inT64 offset_table[TESSDATA_NUM_ENTRIES]; for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1; FILE *output_file = fopen(output_filename, "wb"); if (output_file == NULL) { tprintf("Error opening %s for writing\n", output_filename); return false; } // Leave some space for recording the offset_table. fseek(output_file, sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET); TessdataType type; bool text_file; FILE *file_ptr[TESSDATA_NUM_ENTRIES]; // Load individual tessdata components from files. for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { ASSERT_HOST(TessdataTypeFromFileSuffix( kTessdataFileSuffixes[i], &type, &text_file)); STRING filename = language_data_path_prefix; filename += kTessdataFileSuffixes[i]; file_ptr[i] = fopen(filename.string(), text_file ? "r" : "rb"); if (file_ptr[i] != NULL) { offset_table[type] = ftell(output_file); CopyFile(file_ptr[i], output_file, text_file, -1); fclose(file_ptr[i]); } } // Make sure that the required components are present. if (file_ptr[TESSDATA_UNICHARSET] == NULL) { tprintf("Error opening unicharset file\n"); fclose(output_file); return false; } if (file_ptr[TESSDATA_INTTEMP] != NULL && (file_ptr[TESSDATA_PFFMTABLE] == NULL || file_ptr[TESSDATA_NORMPROTO] == NULL)) { tprintf("Error opening pffmtable and/or normproto files" " while inttemp file was present\n"); fclose(output_file); return false; } WriteMetadata(offset_table, output_file); return true; } bool TessdataManager::OverwriteComponents( const char *new_traineddata_filename, char **component_filenames, int num_new_components) { int i; inT64 offset_table[TESSDATA_NUM_ENTRIES]; TessdataType type; bool text_file; FILE *file_ptr[TESSDATA_NUM_ENTRIES]; for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { offset_table[i] = -1; file_ptr[i] = NULL; } FILE *output_file = fopen(new_traineddata_filename, "wb"); if (output_file == NULL) { tprintf("Error opening %s for writing\n", new_traineddata_filename); return false; } // Leave some space for recording the offset_table. fseek(output_file, sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET); // Open the files with the new components. for (i = 0; i < num_new_components; ++i) { TessdataTypeFromFileName(component_filenames[i], &type, &text_file); file_ptr[type] = fopen(component_filenames[i], text_file ? "r" : "rb"); } // Write updated data to the output traineddata file. for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { if (file_ptr[i] != NULL) { // Get the data from the opened component file. offset_table[i] = ftell(output_file); CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1); fclose(file_ptr[i]); } else { // Get this data component from the loaded data file. if (SeekToStart(static_cast(i))) { offset_table[i] = ftell(output_file); CopyFile(data_file_, output_file, kTessdataFileIsText[i], GetEndOffset(static_cast(i)) - ftell(data_file_) + 1); } } } WriteMetadata(offset_table, output_file); return true; } bool TessdataManager::TessdataTypeFromFileSuffix( const char *suffix, TessdataType *type, bool *text_file) { for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) { *type = static_cast(i); *text_file = kTessdataFileIsText[i]; return true; } } printf("TessdataManager can't determine which tessdata" " component is represented by %s\n", suffix); return false; } bool TessdataManager::TessdataTypeFromFileName( const char *filename, TessdataType *type, bool *text_file) { // Get the file suffix (extension) const char *suffix = strrchr(filename, '.'); if (suffix == NULL || *(++suffix) == '\0') return false; return TessdataTypeFromFileSuffix(suffix, type, text_file); } bool TessdataManager::ExtractToFile(const char *filename) { TessdataType type; bool text_file; ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName( filename, &type, &text_file)); if (!SeekToStart(type)) return false; FILE *output_file = fopen(filename, "wb"); if (output_file == NULL) { printf("Error openning %s\n", filename); exit(1); } inT64 begin_offset = ftell(GetDataFilePtr()); inT64 end_offset = GetEndOffset(type); tesseract::TessdataManager::CopyFile( GetDataFilePtr(), output_file, text_file, end_offset - begin_offset + 1); fclose(output_file); return true; } } // namespace tesseract