Updated tessdatamanager/combine_tessdata to give more functionality

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@353 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2025-01-18 06:30:14 +08:00 · 2010-05-20 23:07:24 +00:00 · 2010-05-20 23:07:24 +00:00 · 45aacc077e
commit 45aacc077e
parent a5b4570180
3 changed files with 345 additions and 129 deletions
--- a/ccutil/tessdatamanager.cpp
+++ b/ccutil/tessdatamanager.cpp
@ -65,22 +65,14 @@ void TessdataManager::Init(const char *data_file_name) {
  }
 }

-FILE *TessdataManager::GetFilePtr(const char *language_data_path_prefix,
-                                  const char *file_suffix, bool required_file,
-                                  bool text_file) {
-  STRING file_name = language_data_path_prefix;
-  file_name += file_suffix;
-  FILE *file_ptr = fopen(file_name.string(), text_file ? "r" : "rb");
-  if (required_file && (file_ptr == NULL)) {
-    tprintf("Error openning required file %s\n", file_name.string());
-    exit(1);
-  }
-  return file_ptr;
-}
-
 void TessdataManager::CopyFile(FILE *input_file, FILE *output_file,
-                               bool newline_end) {
+                               bool newline_end, inT64 num_bytes_to_copy) {
+  if (num_bytes_to_copy == 0) return;
  int buffer_size = 1024;
+  if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) {
+    buffer_size = num_bytes_to_copy;
+  }
+  inT64 num_bytes_copied = 0;
  char *chunk = new char[buffer_size];
  int bytes_read;
  char last_char = 0x0;
@ -88,106 +80,19 @@ void TessdataManager::CopyFile(FILE *input_file, FILE *output_file,
                             buffer_size, input_file))) {
    fwrite(chunk, sizeof(char), bytes_read, output_file);
    last_char = chunk[bytes_read-1];
+    if (num_bytes_to_copy > 0) {
+      num_bytes_copied += bytes_read;
+      if (num_bytes_copied == num_bytes_to_copy) break;
+      if (num_bytes_copied + buffer_size > num_bytes_to_copy) {
+        buffer_size = num_bytes_to_copy - num_bytes_copied;
+      }
+    }
  }
  if (newline_end) ASSERT_HOST(last_char == '\n');
  delete[] chunk;
 }

-void TessdataManager::CombineDataFiles(
-    const char *language_data_path_prefix,
-    const char *output_filename) {
-  FILE *file_ptr;
-  STRING file_name;
-  int i;
-  inT64 offset_table[TESSDATA_NUM_ENTRIES];
-  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1;
-  FILE *output_file = fopen(output_filename, "wb");
-  // Leave some space for recording the offset_table.
-  fseek(output_file,
-        sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);
-
-  // Record language-specific tesseract config file.
-  file_ptr = GetFilePtr(language_data_path_prefix,
-                        kLangConfigFileSuffix, false, true);
-  if (file_ptr != NULL) {
-    offset_table[TESSDATA_LANG_CONFIG] = ftell(output_file);
-    CopyFile(file_ptr, output_file, true);
-    fclose(file_ptr);
-  }
-
-  // Record unicharset.
-  file_ptr = GetFilePtr(language_data_path_prefix,
-                        kUnicharsetFileSuffix, true, true);
-  offset_table[TESSDATA_UNICHARSET] = ftell(output_file);
-  CopyFile(file_ptr, output_file, true);
-  fclose(file_ptr);
-
-  // Record ambiguities.
-  file_ptr = GetFilePtr(language_data_path_prefix,
-                        kAmbigsFileSuffix, false, true);
-  if (file_ptr != NULL) {
-    offset_table[TESSDATA_AMBIGS] = ftell(output_file);
-    CopyFile(file_ptr, output_file, true);
-    fclose(file_ptr);
-  }
-
-  // Record inttemp.
-  file_ptr =
-    GetFilePtr(language_data_path_prefix,
-               kBuiltInTemplatesFileSuffix, false, false);
-  if (file_ptr != NULL) {
-    offset_table[TESSDATA_INTTEMP] = ftell(output_file);
-    CopyFile(file_ptr, output_file, false);
-    fclose(file_ptr);
-
-    // Record pffmtable.
-    file_ptr = GetFilePtr(language_data_path_prefix,
-                          kBuiltInCutoffsFileSuffix, true, true);
-    offset_table[TESSDATA_PFFMTABLE] = ftell(output_file);
-    CopyFile(file_ptr, output_file, true);
-    fclose(file_ptr);
-
-    // Record normproto.
-    file_ptr = GetFilePtr(language_data_path_prefix,
-                          kNormProtoFileSuffix, true, true);
-    offset_table[TESSDATA_NORMPROTO] = ftell(output_file);
-    CopyFile(file_ptr, output_file, true);
-    fclose(file_ptr);
-  }
-
-  // Record dawgs.
-  file_ptr = GetFilePtr(language_data_path_prefix,
-                        kPuncDawgFileSuffix, false, false);
-  if (file_ptr != NULL) {
-    offset_table[TESSDATA_PUNC_DAWG] = ftell(output_file);
-    CopyFile(file_ptr, output_file, false);
-    fclose(file_ptr);
-  }
-
-  file_ptr = GetFilePtr(language_data_path_prefix,
-                        kSystemDawgFileSuffix, false, false);
-  if (file_ptr != NULL) {
-    offset_table[TESSDATA_SYSTEM_DAWG] = ftell(output_file);
-    CopyFile(file_ptr, output_file, false);
-    fclose(file_ptr);
-  }
-
-  file_ptr = GetFilePtr(language_data_path_prefix,
-                        kNumberDawgFileSuffix, false, false);
-  if (file_ptr != NULL) {
-    offset_table[TESSDATA_NUMBER_DAWG] = ftell(output_file);
-    CopyFile(file_ptr, output_file, false);
-    fclose(file_ptr);
-  }
-
-  file_ptr = GetFilePtr(language_data_path_prefix,
-                        kFreqDawgFileSuffix, false, false);
-  if (file_ptr != NULL) {
-    offset_table[TESSDATA_FREQ_DAWG] = ftell(output_file);
-    CopyFile(file_ptr, output_file, false);
-    fclose(file_ptr);
-  }
-
+void TessdataManager::WriteMetadata(inT64 *offset_table, FILE *output_file) {
  fseek(output_file, 0, SEEK_SET);
  inT32 num_entries = TESSDATA_NUM_ENTRIES;
  fwrite(&num_entries, sizeof(inT32), 1, output_file);
@ -195,9 +100,155 @@ void TessdataManager::CombineDataFiles(
  fclose(output_file);

  tprintf("TessdataManager combined tesseract data files.\n");
-  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
+  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
    tprintf("Offset for type %d is %lld\n", i, offset_table[i]);
  }
 }

+// Bundles the individual tessdata component files named
+// language_data_path_prefix + <suffix> (one suffix per entry of
+// kTessdataFileSuffixes) into a single file written to output_filename.
+// Components whose files cannot be opened are left out and keep an offset
+// of -1 in the offset table.
+// Returns false if output_filename cannot be opened for writing, if the
+// unicharset component is missing, or if inttemp is present without both
+// pffmtable and normproto. Returns true once WriteMetadata() has been called
+// on the output file.
+bool TessdataManager::CombineDataFiles(
+    const char *language_data_path_prefix,
+    const char *output_filename) {
+  int i;
+  inT64 offset_table[TESSDATA_NUM_ENTRIES];
+  // Presence of each component is recorded explicitly and indexed by type.
+  // The value of a FILE pointer is indeterminate after fclose(), so it must
+  // not be inspected later to find out whether the file had been opened.
+  bool component_present[TESSDATA_NUM_ENTRIES];
+  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
+    offset_table[i] = -1;
+    component_present[i] = false;
+  }
+  FILE *output_file = fopen(output_filename, "wb");
+  if (output_file == NULL) {
+    tprintf("Error opening %s for writing\n", output_filename);
+    return false;
+  }
+  // Leave some space for recording the offset_table.
+  fseek(output_file,
+        sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);
+
+  TessdataType type;
+  bool text_file;
+
+  // Load individual tessdata components from files.
+  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
+    ASSERT_HOST(TessdataTypeFromFileSuffix(
+        kTessdataFileSuffixes[i], &type, &text_file));
+    STRING filename = language_data_path_prefix;
+    filename += kTessdataFileSuffixes[i];
+    FILE *component_file = fopen(filename.string(), text_file ? "r" : "rb");
+    if (component_file != NULL) {
+      offset_table[type] = ftell(output_file);
+      CopyFile(component_file, output_file, text_file, -1);
+      fclose(component_file);
+      component_present[type] = true;
+    }
+  }
+
+  // Make sure that the required components are present.
+  if (!component_present[TESSDATA_UNICHARSET]) {
+    tprintf("Error opening unicharset file\n");
+    fclose(output_file);
+    return false;
+  }
+  if (component_present[TESSDATA_INTTEMP] &&
+      (!component_present[TESSDATA_PFFMTABLE] ||
+       !component_present[TESSDATA_NORMPROTO])) {
+    tprintf("Error opening pffmtable and/or normproto files"
+            " while inttemp file was present\n");
+    fclose(output_file);
+    return false;
+  }
+
+  // WriteMetadata() also closes output_file.
+  WriteMetadata(offset_table, output_file);
+  return true;
+}
+
+// Builds a new combined traineddata file at new_traineddata_filename from the
+// data file this TessdataManager was initialized with, replacing every
+// component named in component_filenames with the contents of that file.
+// The component type of each replacement file is derived from its file
+// suffix. Components without an opened replacement are copied from
+// data_file_ when SeekToStart() finds them there; otherwise their offset
+// stays -1.
+// Returns false, without creating the output file, if the type of any
+// replacement file cannot be determined from its name. Returns false if the
+// output file cannot be opened for writing. Returns true otherwise.
+bool TessdataManager::OverwriteComponents(
+    const char *new_traineddata_filename,
+    char **component_filenames,
+    int num_new_components) {
+  int i;
+  inT64 offset_table[TESSDATA_NUM_ENTRIES];
+  TessdataType type;
+  bool text_file;
+  FILE *file_ptr[TESSDATA_NUM_ENTRIES];
+  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
+    offset_table[i] = -1;
+    file_ptr[i] = NULL;
+  }
+
+  // Open the files with the new components. This happens before the output
+  // file is created, so a bad component name does not leave a truncated
+  // output file behind.
+  for (i = 0; i < num_new_components; ++i) {
+    if (!TessdataTypeFromFileName(component_filenames[i],
+                                  &type, &text_file)) {
+      // type is not set on failure, so it must not be used as an index.
+      tprintf("Error determining tessdata component type of %s\n",
+              component_filenames[i]);
+      for (int j = 0; j < TESSDATA_NUM_ENTRIES; ++j) {
+        if (file_ptr[j] != NULL) fclose(file_ptr[j]);
+      }
+      return false;
+    }
+    // If the same component type is given more than once the last file wins;
+    // close the earlier handle so that it is not leaked.
+    if (file_ptr[type] != NULL) fclose(file_ptr[type]);
+    file_ptr[type] = fopen(component_filenames[i], text_file ? "r" : "rb");
+    if (file_ptr[type] == NULL) {
+      tprintf("Error opening %s for reading\n", component_filenames[i]);
+    }
+  }
+
+  FILE *output_file = fopen(new_traineddata_filename, "wb");
+  if (output_file == NULL) {
+    tprintf("Error opening %s for writing\n", new_traineddata_filename);
+    for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
+      if (file_ptr[i] != NULL) fclose(file_ptr[i]);
+    }
+    return false;
+  }
+
+  // Leave some space for recording the offset_table.
+  fseek(output_file,
+        sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);
+
+  // Write updated data to the output traineddata file.
+  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
+    if (file_ptr[i] != NULL) {
+      // Get the data from the opened component file.
+      offset_table[i] = ftell(output_file);
+      CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1);
+      fclose(file_ptr[i]);
+    } else {
+      // Get this data component from the loaded data file.
+      if (SeekToStart(static_cast<TessdataType>(i))) {
+        offset_table[i] = ftell(output_file);
+        CopyFile(data_file_, output_file, kTessdataFileIsText[i],
+                 GetEndOffset(static_cast<TessdataType>(i)) -
+                 ftell(data_file_) + 1);
+      }
+    }
+  }
+
+  // WriteMetadata() also closes output_file.
+  WriteMetadata(offset_table, output_file);
+  return true;
+}
+
+// Looks suffix up in kTessdataFileSuffixes. On an exact match stores the
+// matching TessdataType in *type and the corresponding kTessdataFileIsText
+// entry in *text_file and returns true. Otherwise prints a message, leaves
+// both outputs untouched and returns false.
+bool TessdataManager::TessdataTypeFromFileSuffix(
+    const char *suffix, TessdataType *type, bool *text_file) {
+  // Advance to the first table entry equal to suffix, or past the end.
+  int index = 0;
+  while (index < TESSDATA_NUM_ENTRIES &&
+         strcmp(kTessdataFileSuffixes[index], suffix) != 0) {
+    ++index;
+  }
+  if (index == TESSDATA_NUM_ENTRIES) {
+    printf("TessdataManager can't determine which tessdata"
+           " component is represented by %s\n", suffix);
+    return false;
+  }
+  *type = static_cast<TessdataType>(index);
+  *text_file = kTessdataFileIsText[index];
+  return true;
+}
+
+// Takes the text after the last '.' in filename as the component suffix and
+// passes it to TessdataTypeFromFileSuffix(). Returns false without touching
+// the outputs when filename has no '.' or ends with one.
+bool TessdataManager::TessdataTypeFromFileName(
+    const char *filename, TessdataType *type, bool *text_file) {
+  const char *last_dot = strrchr(filename, '.');
+  if (last_dot == NULL) return false;
+  // The suffix (extension) starts right after the dot.
+  const char *suffix = last_dot + 1;
+  if (suffix[0] == '\0') return false;
+  return TessdataTypeFromFileSuffix(suffix, type, text_file);
+}
+
+// Writes the component whose type is implied by the suffix of filename from
+// the loaded data file to filename. Asserts that the type can be determined
+// from filename. Returns false if SeekToStart() reports that the component
+// is not available. Exits the program if filename cannot be opened for
+// writing. Returns true after the component has been copied.
+bool TessdataManager::ExtractToFile(const char *filename) {
+  TessdataType component_type;
+  bool is_text;
+  ASSERT_HOST(TessdataTypeFromFileName(
+      filename, &component_type, &is_text));
+  if (!SeekToStart(component_type)) return false;
+
+  FILE *destination = fopen(filename, "wb");
+  if (destination == NULL) {
+    printf("Error openning %s\n", filename);
+    exit(1);
+  }
+  // The data file is now positioned at the first byte of the component.
+  inT64 first_offset = ftell(GetDataFilePtr());
+  inT64 last_offset = GetEndOffset(component_type);
+  inT64 bytes_to_copy = last_offset - first_offset + 1;
+  CopyFile(GetDataFilePtr(), destination, is_text, bytes_to_copy);
+  fclose(destination);
+  return true;
+}
+
 }  // namespace tesseract
--- a/ccutil/tessdatamanager.h
+++ b/ccutil/tessdatamanager.h
@ -37,6 +37,8 @@ extern INT_VAR_H(global_tessdata_manager_debug_level, 0,

 static const char kTrainedDataSuffix[] = "traineddata";

+// When adding new tessdata types and file suffixes, please make sure to
+// update TessdataType enum, kTessdataFileSuffixes and kTessdataFileIsText.
 static const char kLangConfigFileSuffix[] = "config";
 static const char kUnicharsetFileSuffix[] = "unicharset";
 static const char kAmbigsFileSuffix[] = "unicharambigs";
@ -65,6 +67,36 @@ enum TessdataType {
  TESSDATA_NUM_ENTRIES
 };

+// kTessdataFileSuffixes[i] indicates the file suffix for
+// tessdata of type i (from TessdataType enum).
+static const char * const kTessdataFileSuffixes[] = {
+  kLangConfigFileSuffix,        // 0: TESSDATA_LANG_CONFIG
+  kUnicharsetFileSuffix,        // 1: TESSDATA_UNICHARSET
+  kAmbigsFileSuffix,            // 2: TESSDATA_AMBIGS
+  kBuiltInTemplatesFileSuffix,  // 3: TESSDATA_INTTEMP
+  kBuiltInCutoffsFileSuffix,    // 4: TESSDATA_PFFMTABLE
+  kNormProtoFileSuffix,         // 5: TESSDATA_NORMPROTO
+  kPuncDawgFileSuffix,          // 6: TESSDATA_PUNC_DAWG
+  kSystemDawgFileSuffix,        // 7: TESSDATA_SYSTEM_DAWG
+  kNumberDawgFileSuffix,        // 8: TESSDATA_NUMBER_DAWG
+  kFreqDawgFileSuffix,          // 9: TESSDATA_FREQ_DAWG
+};
+
+// If kTessdataFileIsText[i] is true - the tessdata component
+// of type i (from TessdataType enum) is text, and is binary otherwise.
+static const bool kTessdataFileIsText[] = {
+  true,                         // 0: kLangConfigFileSuffix
+  true,                         // 1: kUnicharsetFileSuffix
+  true,                         // 2: kAmbigsFileSuffix
+  false,                        // 3: kBuiltInTemplatesFileSuffix
+  true,                         // 4: kBuiltInCutoffsFileSuffix
+  true,                         // 5: kNormProtoFileSuffix
+  false,                        // 6: kPuncDawgFileSuffix
+  false,                        // 7: kSystemDawgFileSuffix
+  false,                        // 8: kNumberDawgFileSuffix
+  false,                        // 9: kFreqDawgFileSuffix
+};
+
 // TessdataType could be updated to contain more entries, however
 // we do not expect that number to be astronomically high.
 // In order to automatically detect endianness TessdataManager will
@ -102,7 +134,8 @@ class TessdataManager {
      return false;
    } else {
      ASSERT_HOST(fseek(data_file_,
-                        offset_table_[tessdata_type], SEEK_SET) == 0);
+                        static_cast<size_t>(offset_table_[tessdata_type]),
+                        SEEK_SET) == 0);
      return true;
    }
  }
@ -128,24 +161,55 @@ class TessdataManager {
    }
  }

+  // Writes the number of entries and the given offset table to output_file.
+  static void WriteMetadata(inT64 *offset_table, FILE *output_file);
+
  // Reads all the standard tesseract config and data files for a language
  // at the given path and bundles them up into one binary data file.
-  static void CombineDataFiles(const char *language_data_path_prefix,
+  // Returns true if the combined traineddata file was successfully written.
+  static bool CombineDataFiles(const char *language_data_path_prefix,
                               const char *output_filename);

+  // Gets the individual components from the data_file_ with which the class was
+  // initialized. Overwrites the components specified by component_filenames.
+  // Writes the updated traineddata file to new_traineddata_filename.
+  bool OverwriteComponents(const char *new_traineddata_filename,
+                            char **component_filenames,
+                            int num_new_components);
+
+  // Extracts tessdata component implied by the name of the input file from
+  // the combined traineddata loaded into TessdataManager.
+  // Writes the extracted component to the file indicated by the file name.
+  // E.g. if the filename given is somepath/somelang.unicharset, unicharset
+  // will be extracted from the data loaded into the TessdataManager and will
+  // be written to somepath/somelang.unicharset.
+  // Returns true if the component was successfully extracted, false if the
+  // component was not present in the traineddata loaded into TessdataManager.
+  bool ExtractToFile(const char *filename);
+
+  // Copies data from the given input file to the output_file provided.
+  // If num_bytes_to_copy is >= 0, only num_bytes_to_copy is copied from
+  // the input file, otherwise all the data in the input file is copied.
+  static void CopyFile(FILE *input_file, FILE *output_file,
+                       bool newline_end, inT64 num_bytes_to_copy);
+
+  // Fills type with TessdataType of the tessdata component represented by the
+  // given file suffix. E.g. unicharset -> TESSDATA_UNICHARSET.
+  // Sets *text_file to true if the component is in text format (e.g.
+  // unicharset, unichar ambigs, config, etc).
+  // Returns true if the tessdata component type could be determined
+  // from the given file suffix.
+  static bool TessdataTypeFromFileSuffix(const char *suffix,
+                                         TessdataType *type,
+                                         bool *text_file);
+
+  // Tries to determine the tessdata component type from the suffix of
+  // filename (the text after the last '.'), returns true on success.
+  static bool TessdataTypeFromFileName(const char *filename,
+                                       TessdataType *type,
+                                       bool *text_file);
+
 private:
-
-  // Opens the file whose name is a concatentation of language_data_path_prefix
-  // and file_suffix. Terminates the program if required_file is set to true,
-  // but the file could not be found or opened for reading.
-  // Returns a file pointer to the opened file.
-  static FILE *GetFilePtr(const char *language_data_path_prefix,
-                          const char *file_suffix, bool required_file,
-                          bool text_file);
-
-  // Copies all the bytes in the given input file to the output_file provided.
-  static void CopyFile(FILE *input_file, FILE *output_file, bool newline_end);
-
  // Each offset_table_[i] contains a file offset in the combined data file
  // where the data of TessdataFileType i is stored.
  inT64 offset_table_[TESSDATA_NUM_ENTRIES];
--- a/training/combine_tessdata.cpp
+++ b/training/combine_tessdata.cpp
@ -20,12 +20,113 @@

 #include "tessdatamanager.h"

+// Main program to combine/extract/overwrite tessdata components
+// in [lang].traineddata files.
+//
+// To combine all the individual tessdata components (unicharset, DAWGs,
+// classifier templates, ambiguities, language configs) located at, say,
+// /home/$USER/temp/eng.* run:
+//
+//   combine_tessdata /home/$USER/temp/eng.
+//
+// The result will be a combined tessdata file /home/$USER/temp/eng.traineddata
+//
+// Specify option -e if you would like to extract individual components
+// from a combined traineddata file. For example, to extract language config
+// file and the unicharset from tessdata/eng.traineddata run:
+//
+//   combine_tessdata -e tessdata/eng.traineddata
+//   /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
+//
+// The desired config file and unicharset will be written to
+// /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
+//
+// Specify option -o to overwrite individual components of the given
+// [lang].traineddata file. For example, to overwrite language config
+// and unichar ambiguities files in tessdata/eng.traineddata use:
+//
+//   combine_tessdata -o tessdata/eng.traineddata
+//   /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs
+//
+// As a result, tessdata/eng.traineddata will contain the new language config
+// and unichar ambigs, plus all the original DAWGs, classifier templates, etc.
+//
+// Note: the file names of the files to extract to and to overwrite from should
+// have the appropriate file suffixes (extensions) indicating their tessdata
+// component type (.unicharset for the unicharset, .unicharambigs for unichar
+// ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h.
+//
+// Specify option -u to unpack all the components to the specified path:
+//
+//   combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.
+//
+// This will create /home/$USER/temp/eng.* files with individual tessdata
+// components from tessdata/eng.traineddata.
+//
 int main(int argc, char **argv) {
-  if (!(argc == 2)) {
-    printf("Usage: %s language_data_path_prefix (e.g. tessdata/eng.)", argv[0]);
+  int i;
+  if (argc == 2) {
+    printf("Combininig tessdata files\n");
+    STRING output_file = argv[1];
+    output_file += kTrainedDataSuffix;
+    if (!tesseract::TessdataManager::CombineDataFiles(
+        argv[1], output_file.string())) {
+      printf("Error combining tessdata files into %s\n",
+             output_file.string());
+    }
+  } else if (argc >= 4 && (strcmp(argv[1], "-e") == 0 ||
+                           strcmp(argv[1], "-u") == 0)) {
+    // Initialize TessdataManager with the data in the given traineddata file.
+    tesseract::TessdataManager tm;
+    tm.Init(argv[2]);
+    printf("Extracting tessdata components from %s\n", argv[2]);
+    if (strcmp(argv[1], "-e") == 0) {
+      for (i = 3; i < argc; ++i) {
+        if (tm.ExtractToFile(argv[i])) {
+          printf("Wrote %s\n", argv[i]);
+        } else {
+          printf("Not extracting %s, since this component"
+                 " is not present\n", argv[i]);
+        }
+      }
+    } else {  // extract all the components
+      for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) {
+        STRING filename = argv[3];
+        filename += tesseract::kTessdataFileSuffixes[i];
+        if (tm.ExtractToFile(filename.string())) {
+          printf("Wrote %s\n", filename.string());
+        }
+      }
+    }
+    tm.End();
+  } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) {
+    // Rename the current traineddata file to a temporary name.
+    const char *new_traineddata_filename = argv[2];
+    STRING traineddata_filename = new_traineddata_filename;
+    traineddata_filename += ".__tmp__";
+    if (rename(new_traineddata_filename, traineddata_filename.string()) != 0) {
+      tprintf("Failed to create a temporary file %s\n",
+              traineddata_filename.string());
+      exit(1);
+    }
+
+    // Initialize TessdataManager with the data in the given traineddata file.
+    tesseract::TessdataManager tm;
+    tm.Init(traineddata_filename.string());
+
+    // Write the updated traineddata file.
+    tm.OverwriteComponents(new_traineddata_filename, argv+3, argc-3);
+    tm.End();
+  } else {
+    printf("Usage for combining tessdata components:\n"
+           "%s language_data_path_prefix (e.g. tessdata/eng.)\n", argv[0]);
+    printf("Usage for extracting tessdata components:\n"
+           "%s -e traineddata_file [output_component_file...]\n", argv[0]);
+    printf("Usage for overwriting tessdata components:\n"
+           "%s -o traineddata_file [input_component_file...]\n", argv[0]);
+    printf("Usage for unpacking all tessdata components:\n"
+           "%s -u traineddata_file output_path_prefix"
+           " (e.g. /tmp/eng.)\n", argv[0]);
    return 1;
  }
-  STRING output_file = argv[1];
-  output_file += kTrainedDataSuffix;
-  tesseract::TessdataManager::CombineDataFiles(argv[1], output_file.string());
 }