diff --git a/ccutil/tessdatamanager.cpp b/ccutil/tessdatamanager.cpp index 1c8dc61c2..b1542a25b 100644 --- a/ccutil/tessdatamanager.cpp +++ b/ccutil/tessdatamanager.cpp @@ -65,22 +65,14 @@ void TessdataManager::Init(const char *data_file_name) { } } -FILE *TessdataManager::GetFilePtr(const char *language_data_path_prefix, - const char *file_suffix, bool required_file, - bool text_file) { - STRING file_name = language_data_path_prefix; - file_name += file_suffix; - FILE *file_ptr = fopen(file_name.string(), text_file ? "r" : "rb"); - if (required_file && (file_ptr == NULL)) { - tprintf("Error openning required file %s\n", file_name.string()); - exit(1); - } - return file_ptr; -} - void TessdataManager::CopyFile(FILE *input_file, FILE *output_file, - bool newline_end) { + bool newline_end, inT64 num_bytes_to_copy) { + if (num_bytes_to_copy == 0) return; int buffer_size = 1024; + if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) { + buffer_size = num_bytes_to_copy; + } + inT64 num_bytes_copied = 0; char *chunk = new char[buffer_size]; int bytes_read; char last_char = 0x0; @@ -88,106 +80,19 @@ void TessdataManager::CopyFile(FILE *input_file, FILE *output_file, buffer_size, input_file))) { fwrite(chunk, sizeof(char), bytes_read, output_file); last_char = chunk[bytes_read-1]; + if (num_bytes_to_copy > 0) { + num_bytes_copied += bytes_read; + if (num_bytes_copied == num_bytes_to_copy) break; + if (num_bytes_copied + buffer_size > num_bytes_to_copy) { + buffer_size = num_bytes_to_copy - num_bytes_copied; + } + } } if (newline_end) ASSERT_HOST(last_char == '\n'); delete[] chunk; } -void TessdataManager::CombineDataFiles( - const char *language_data_path_prefix, - const char *output_filename) { - FILE *file_ptr; - STRING file_name; - int i; - inT64 offset_table[TESSDATA_NUM_ENTRIES]; - for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1; - FILE *output_file = fopen(output_filename, "wb"); - // Leave some space for recording the offset_table. 
- fseek(output_file, - sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET); - - // Record language-specific tesseract config file. - file_ptr = GetFilePtr(language_data_path_prefix, - kLangConfigFileSuffix, false, true); - if (file_ptr != NULL) { - offset_table[TESSDATA_LANG_CONFIG] = ftell(output_file); - CopyFile(file_ptr, output_file, true); - fclose(file_ptr); - } - - // Record unicharset. - file_ptr = GetFilePtr(language_data_path_prefix, - kUnicharsetFileSuffix, true, true); - offset_table[TESSDATA_UNICHARSET] = ftell(output_file); - CopyFile(file_ptr, output_file, true); - fclose(file_ptr); - - // Record ambiguities. - file_ptr = GetFilePtr(language_data_path_prefix, - kAmbigsFileSuffix, false, true); - if (file_ptr != NULL) { - offset_table[TESSDATA_AMBIGS] = ftell(output_file); - CopyFile(file_ptr, output_file, true); - fclose(file_ptr); - } - - // Record inttemp. - file_ptr = - GetFilePtr(language_data_path_prefix, - kBuiltInTemplatesFileSuffix, false, false); - if (file_ptr != NULL) { - offset_table[TESSDATA_INTTEMP] = ftell(output_file); - CopyFile(file_ptr, output_file, false); - fclose(file_ptr); - - // Record pffmtable. - file_ptr = GetFilePtr(language_data_path_prefix, - kBuiltInCutoffsFileSuffix, true, true); - offset_table[TESSDATA_PFFMTABLE] = ftell(output_file); - CopyFile(file_ptr, output_file, true); - fclose(file_ptr); - - // Record normproto. - file_ptr = GetFilePtr(language_data_path_prefix, - kNormProtoFileSuffix, true, true); - offset_table[TESSDATA_NORMPROTO] = ftell(output_file); - CopyFile(file_ptr, output_file, true); - fclose(file_ptr); - } - - // Record dawgs. 
- file_ptr = GetFilePtr(language_data_path_prefix, - kPuncDawgFileSuffix, false, false); - if (file_ptr != NULL) { - offset_table[TESSDATA_PUNC_DAWG] = ftell(output_file); - CopyFile(file_ptr, output_file, false); - fclose(file_ptr); - } - - file_ptr = GetFilePtr(language_data_path_prefix, - kSystemDawgFileSuffix, false, false); - if (file_ptr != NULL) { - offset_table[TESSDATA_SYSTEM_DAWG] = ftell(output_file); - CopyFile(file_ptr, output_file, false); - fclose(file_ptr); - } - - file_ptr = GetFilePtr(language_data_path_prefix, - kNumberDawgFileSuffix, false, false); - if (file_ptr != NULL) { - offset_table[TESSDATA_NUMBER_DAWG] = ftell(output_file); - CopyFile(file_ptr, output_file, false); - fclose(file_ptr); - } - - file_ptr = GetFilePtr(language_data_path_prefix, - kFreqDawgFileSuffix, false, false); - if (file_ptr != NULL) { - offset_table[TESSDATA_FREQ_DAWG] = ftell(output_file); - CopyFile(file_ptr, output_file, false); - fclose(file_ptr); - } - +void TessdataManager::WriteMetadata(inT64 *offset_table, FILE *output_file) { fseek(output_file, 0, SEEK_SET); inT32 num_entries = TESSDATA_NUM_ENTRIES; fwrite(&num_entries, sizeof(inT32), 1, output_file); @@ -195,9 +100,155 @@ void TessdataManager::CombineDataFiles( fclose(output_file); tprintf("TessdataManager combined tesseract data files.\n"); - for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { tprintf("Offset for type %d is %lld\n", i, offset_table[i]); } } +bool TessdataManager::CombineDataFiles( + const char *language_data_path_prefix, + const char *output_filename) { + int i; + inT64 offset_table[TESSDATA_NUM_ENTRIES]; + for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1; + FILE *output_file = fopen(output_filename, "wb"); + if (output_file == NULL) { + tprintf("Error opening %s for writing\n", output_filename); + return false; + } + // Leave some space for recording the offset_table. 
+ fseek(output_file, + sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET); + + TessdataType type; + bool text_file; + FILE *file_ptr[TESSDATA_NUM_ENTRIES]; + + // Load individual tessdata components from files. + for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + ASSERT_HOST(TessdataTypeFromFileSuffix( + kTessdataFileSuffixes[i], &type, &text_file)); + STRING filename = language_data_path_prefix; + filename += kTessdataFileSuffixes[i]; + file_ptr[i] = fopen(filename.string(), text_file ? "r" : "rb"); + if (file_ptr[i] != NULL) { + offset_table[type] = ftell(output_file); + CopyFile(file_ptr[i], output_file, text_file, -1); + fclose(file_ptr[i]); + } + } + + // Make sure that the required components are present. + if (file_ptr[TESSDATA_UNICHARSET] == NULL) { + tprintf("Error opening unicharset file\n"); + fclose(output_file); + return false; + } + if (file_ptr[TESSDATA_INTTEMP] != NULL && + (file_ptr[TESSDATA_PFFMTABLE] == NULL || + file_ptr[TESSDATA_NORMPROTO] == NULL)) { + tprintf("Error opening pffmtable and/or normproto files" + " while inttemp file was present\n"); + fclose(output_file); + return false; + } + + WriteMetadata(offset_table, output_file); + return true; +} + +bool TessdataManager::OverwriteComponents( + const char *new_traineddata_filename, + char **component_filenames, + int num_new_components) { + int i; + inT64 offset_table[TESSDATA_NUM_ENTRIES]; + TessdataType type; + bool text_file; + FILE *file_ptr[TESSDATA_NUM_ENTRIES]; + for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + offset_table[i] = -1; + file_ptr[i] = NULL; + } + FILE *output_file = fopen(new_traineddata_filename, "wb"); + if (output_file == NULL) { + tprintf("Error opening %s for writing\n", new_traineddata_filename); + return false; + } + + // Leave some space for recording the offset_table. + fseek(output_file, + sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET); + + // Open the files with the new components. 
+ for (i = 0; i < num_new_components; ++i) { + TessdataTypeFromFileName(component_filenames[i], &type, &text_file); + file_ptr[type] = fopen(component_filenames[i], text_file ? "r" : "rb"); + } + + // Write updated data to the output traineddata file. + for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + if (file_ptr[i] != NULL) { + // Get the data from the opened component file. + offset_table[i] = ftell(output_file); + CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1); + fclose(file_ptr[i]); + } else { + // Get this data component from the loaded data file. + if (SeekToStart(static_cast(i))) { + offset_table[i] = ftell(output_file); + CopyFile(data_file_, output_file, kTessdataFileIsText[i], + GetEndOffset(static_cast(i)) - + ftell(data_file_) + 1); + } + } + } + + WriteMetadata(offset_table, output_file); + return true; +} + +bool TessdataManager::TessdataTypeFromFileSuffix( + const char *suffix, TessdataType *type, bool *text_file) { + for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) { + *type = static_cast(i); + *text_file = kTessdataFileIsText[i]; + return true; + } + } + printf("TessdataManager can't determine which tessdata" + " component is represented by %s\n", suffix); + return false; +} + +bool TessdataManager::TessdataTypeFromFileName( + const char *filename, TessdataType *type, bool *text_file) { + // Get the file suffix (extension) + const char *suffix = strrchr(filename, '.'); + if (suffix == NULL || *(++suffix) == '\0') return false; + return TessdataTypeFromFileSuffix(suffix, type, text_file); +} + +bool TessdataManager::ExtractToFile(const char *filename) { + TessdataType type; + bool text_file; + ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName( + filename, &type, &text_file)); + if (!SeekToStart(type)) return false; + + FILE *output_file = fopen(filename, "wb"); + if (output_file == NULL) { + printf("Error openning %s\n", filename); + exit(1); + } + inT64 begin_offset 
= ftell(GetDataFilePtr()); + inT64 end_offset = GetEndOffset(type); + tesseract::TessdataManager::CopyFile( + GetDataFilePtr(), output_file, text_file, + end_offset - begin_offset + 1); + fclose(output_file); + return true; +} + } // namespace tesseract diff --git a/ccutil/tessdatamanager.h b/ccutil/tessdatamanager.h index c6defbfd2..93ae43c24 100644 --- a/ccutil/tessdatamanager.h +++ b/ccutil/tessdatamanager.h @@ -37,6 +37,8 @@ extern INT_VAR_H(global_tessdata_manager_debug_level, 0, static const char kTrainedDataSuffix[] = "traineddata"; +// When adding new tessdata types and file suffixes, please make sure to +// update TessdataType enum, kTessdataFileSuffixes and kTessdataFileIsText. static const char kLangConfigFileSuffix[] = "config"; static const char kUnicharsetFileSuffix[] = "unicharset"; static const char kAmbigsFileSuffix[] = "unicharambigs"; @@ -65,6 +67,36 @@ enum TessdataType { TESSDATA_NUM_ENTRIES }; +// kTessdataFileSuffixes[i] indicates the file suffix for +// tessdata of type i (from TessdataType enum). +static const char * const kTessdataFileSuffixes[] = { + kLangConfigFileSuffix, // 0 + kUnicharsetFileSuffix, // 1 + kAmbigsFileSuffix, // 2 + kBuiltInTemplatesFileSuffix, // 3 + kBuiltInCutoffsFileSuffix, // 4 + kNormProtoFileSuffix, // 5 + kPuncDawgFileSuffix, // 6 + kSystemDawgFileSuffix, // 7 + kNumberDawgFileSuffix, // 8 + kFreqDawgFileSuffix, // 9 +}; + +// If kTessdataFileIsText[i] is true - the tessdata component +// of type i (from TessdataType enum) is text, and is binary otherwise. +static const bool kTessdataFileIsText[] = { + true, // 0 + true, // 1 + true, // 2 + false, // 3 + true, // 4 + true, // 5 + false, // 6 + false, // 7 + false, // 8 + false, // 9 +}; + // TessdataType could be updated to contain more entries, however // we do not expect that number to be astronomically high. 
// In order to automatically detect endianness TessdataManager will @@ -102,7 +134,8 @@ class TessdataManager { return false; } else { ASSERT_HOST(fseek(data_file_, - offset_table_[tessdata_type], SEEK_SET) == 0); + static_cast(offset_table_[tessdata_type]), + SEEK_SET) == 0); return true; } } @@ -128,24 +161,55 @@ class TessdataManager { } } + // Writes the number of entries and the given offset table to output_file. + static void WriteMetadata(inT64 *offset_table, FILE *output_file); + // Reads all the standard tesseract config and data files for a language // at the given path and bundles them up into one binary data file. - static void CombineDataFiles(const char *language_data_path_prefix, + // Returns true if the combined traineddata file was successfully written. + static bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename); + // Gets the individual components from the data_file_ with which the class was + // initialized. Overwrites the components specified by component_filenames. + // Writes the updated traineddata file to new_traineddata_filename. + bool OverwriteComponents(const char *new_traineddata_filename, + char **component_filenames, + int num_new_components); + + // Extracts tessdata component implied by the name of the input file from + // the combined traineddata loaded into TessdataManager. + // Writes the extracted component to the file indicated by the file name. + // E.g. if the filename given is somepath/somelang.unicharset, unicharset + // will be extracted from the data loaded into the TessdataManager and will + // be written to somepath/somelang.unicharset. + // Returns true if the component was successfully extracted, false if the + // component was not present in the traineddata loaded into TessdataManager. + bool ExtractToFile(const char *filename); + + // Copies data from the given input file to the output_file provided. 
+ // If num_bytes_to_copy is >= 0, only num_bytes_to_copy is copied from + // the input file, otherwise all the data in the input file is copied. + static void CopyFile(FILE *input_file, FILE *output_file, + bool newline_end, inT64 num_bytes_to_copy); + + // Fills type with TessdataType of the tessdata component represented by the + // given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET. + // Sets *text_file to true if the component is in text format (e.g. + // unicharset, unichar ambigs, config, etc). + // Returns true if the tessdata component type could be determined + // from the given file name. + static bool TessdataTypeFromFileSuffix(const char *suffix, + TessdataType *type, + bool *text_file); + + // Tries to determine tessdata component file suffix from filename, + // returns true on success. + static bool TessdataTypeFromFileName(const char *filename, + TessdataType *type, + bool *text_file); + private: - - // Opens the file whose name is a concatentation of language_data_path_prefix - // and file_suffix. Terminates the program if required_file is set to true, - // but the file could not be found or opened for reading. - // Returns a file pointer to the opened file. - static FILE *GetFilePtr(const char *language_data_path_prefix, - const char *file_suffix, bool required_file, - bool text_file); - - // Copies all the bytes in the given input file to the output_file provided. - static void CopyFile(FILE *input_file, FILE *output_file, bool newline_end); - // Each offset_table_[i] contains a file offset in the combined data file // where the data of TessdataFileType i is stored. 
inT64 offset_table_[TESSDATA_NUM_ENTRIES]; diff --git a/training/combine_tessdata.cpp b/training/combine_tessdata.cpp index 3f939c5dd..2a4f3ed80 100644 --- a/training/combine_tessdata.cpp +++ b/training/combine_tessdata.cpp @@ -20,12 +20,113 @@ #include "tessdatamanager.h" +// Main program to combine/extract/overwrite tessdata components +// in [lang].traineddata files. +// +// To combine all the individual tessdata components (unicharset, DAWGs, +// classifier templates, ambiguities, language configs) located at, say, +// /home/$USER/temp/eng.* run: +// +// combine_tessdata /home/$USER/temp/eng. +// +// The result will be a combined tessdata file /home/$USER/temp/eng.traineddata +// +// Specify option -e if you would like to extract individual components +// from a combined traineddata file. For example, to extract language config +// file and the unicharset from tessdata/eng.traineddata run: +// +// combine_tessdata -e tessdata/eng.traineddata +// /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset +// +// The desired config file and unicharset will be written to +// /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset +// +// Specify option -o to overwrite individual components of the given +// [lang].traineddata file. For example, to overwrite language config +// and unichar ambiguities files in tessdata/eng.traineddata use: +// +// combine_tessdata -o tessdata/eng.traineddata +// /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs +// +// As a result, tessdata/eng.traineddata will contain the new language config +// and unichar ambigs, plus all the original DAWGs, classifier templates, etc. +// +// Note: the file names of the files to extract to and to overwrite from should +// have the appropriate file suffixes (extensions) indicating their tessdata +// component type (.unicharset for the unicharset, .unicharambigs for unichar +// ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h. 
+// +// Specify option -u to unpack all the components to the specified path: +// +// combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng. +// +// This will create /home/$USER/temp/eng.* files with individual tessdata +// components from tessdata/eng.traineddata. +// int main(int argc, char **argv) { - if (!(argc == 2)) { - printf("Usage: %s language_data_path_prefix (e.g. tessdata/eng.)", argv[0]); + int i; + if (argc == 2) { + printf("Combininig tessdata files\n"); + STRING output_file = argv[1]; + output_file += kTrainedDataSuffix; + if (!tesseract::TessdataManager::CombineDataFiles( + argv[1], output_file.string())) { + printf("Error combining tessdata files into %s\n", + output_file.string()); + } + } else if (argc >= 4 && (strcmp(argv[1], "-e") == 0 || + strcmp(argv[1], "-u") == 0)) { + // Initialize TessdataManager with the data in the given traineddata file. + tesseract::TessdataManager tm; + tm.Init(argv[2]); + printf("Extracting tessdata components from %s\n", argv[2]); + if (strcmp(argv[1], "-e") == 0) { + for (i = 3; i < argc; ++i) { + if (tm.ExtractToFile(argv[i])) { + printf("Wrote %s\n", argv[i]); + } else { + printf("Not extracting %s, since this component" + " is not present\n", argv[i]); + } + } + } else { // extract all the components + for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) { + STRING filename = argv[3]; + filename += tesseract::kTessdataFileSuffixes[i]; + if (tm.ExtractToFile(filename.string())) { + printf("Wrote %s\n", filename.string()); + } + } + } + tm.End(); + } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) { + // Rename the current traineddata file to a temporary name. 
+ const char *new_traineddata_filename = argv[2]; + STRING traineddata_filename = new_traineddata_filename; + traineddata_filename += ".__tmp__"; + if (rename(new_traineddata_filename, traineddata_filename.string()) != 0) { + tprintf("Failed to create a temporary file %s\n", + traineddata_filename.string()); + exit(1); + } + + // Initialize TessdataManager with the data in the given traineddata file. + tesseract::TessdataManager tm; + tm.Init(traineddata_filename.string()); + + // Write the updated traineddata file. + tm.OverwriteComponents(new_traineddata_filename, argv+3, argc-3); + tm.End(); + } else { + printf("Usage for combining tessdata components:\n" + "%s language_data_path_prefix (e.g. tessdata/eng.)\n", argv[0]); + printf("Usage for extracting tessdata components:\n" + "%s -e traineddata_file [output_component_file...]\n", argv[0]); + printf("Usage for overwriting tessdata components:\n" + "%s -o traineddata_file [input_component_file...]\n", argv[0]); + printf("Usage for unpacking all tessdata components:\n" + "%s -u traineddata_file output_path_prefix" + " (e.g. /tmp/eng.)\n", argv[0]); return 1; } - STRING output_file = argv[1]; - output_file += kTrainedDataSuffix; - tesseract::TessdataManager::CombineDataFiles(argv[1], output_file.string()); }