Add initial support for traineddata files in standard archive formats

This requires libarchive-dev.

Tesseract can now load traineddata files in any of the archive formats
which are supported by libarchive. Example of a zipped BagIt archive:

    $ unzip -l /usr/local/share/tessdata/zip.traineddata
    Archive:  /usr/local/share/tessdata/zip.traineddata
      Length      Date    Time    Name
    ---------  ---------- -----   ----
           55  2019-03-05 15:27   bagit.txt
            0  2019-03-05 15:25   data/
         1557  2019-03-05 15:28   manifest-sha256.txt
      1082890  2019-03-05 15:25   data/eng.word-dawg
      1487588  2019-03-05 15:25   data/eng.lstm
         7477  2019-03-05 15:25   data/eng.unicharset
        63346  2019-03-05 15:25   data/eng.shapetable
       976552  2019-03-05 15:25   data/eng.inttemp
        13408  2019-03-05 15:25   data/eng.normproto
         4322  2019-03-05 15:25   data/eng.punc-dawg
         4738  2019-03-05 15:25   data/eng.lstm-number-dawg
         1410  2019-03-05 15:25   data/eng.freq-dawg
          844  2019-03-05 15:25   data/eng.pffmtable
         6360  2019-03-05 15:25   data/eng.lstm-unicharset
         1012  2019-03-05 15:25   data/eng.lstm-recoder
         1047  2019-03-05 15:25   data/eng.unicharambigs
         4322  2019-03-05 15:25   data/eng.lstm-punc-dawg
     16109842  2019-03-05 15:25   data/eng.bigram-dawg
           80  2019-03-05 15:25   data/eng.version
         6426  2019-03-05 15:25   data/eng.number-dawg
      3694794  2019-03-05 15:25   data/eng.lstm-word-dawg
    ---------                     -------
     23468070                     21 files

`combine_tessdata -d` and `combine_tessdata -u` also work.

The traineddata files in the new format can be generated with
standard tools like zip or tar.

More work is needed for other training tools and big endian support.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
2025-01-19 06:53:36 +08:00 · 2019-03-05 15:52:15 +01:00 · 2019-03-05 15:52:15 +01:00 · 1c7e00611b
commit 1c7e00611b
parent 7fbde96a04
7 changed files with 95 additions and 10 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -21,6 +21,7 @@ addons:
    sources:
    #- ubuntu-toolchain-r-test
    packages:
+      - libarchive-dev
    #- g++-6

 #matrix:
--- a/configure.ac
+++ b/configure.ac
@ -422,6 +422,12 @@ else
  AC_MSG_ERROR([Leptonica 1.74 or higher is required. Try to install libleptonica-dev package.])
 fi

+PKG_CHECK_MODULES([libarchive], [libarchive], [have_libarchive=true], [have_libarchive=false])
+AM_CONDITIONAL([HAVE_LIBARCHIVE], [$have_libarchive])
+if $have_libarchive; then
+  AC_DEFINE([HAVE_LIBARCHIVE], [], [Enable libarchive])
+fi
+
 AM_CONDITIONAL([ENABLE_TRAINING], true)

 # Check availability of ICU packages.
--- a/src/api/Makefile.am
+++ b/src/api/Makefile.am
@ -88,6 +88,7 @@ tesseract_LDFLAGS = $(OPENCL_LDFLAGS)

 tesseract_LDADD += $(LEPTONICA_LIBS)
 tesseract_LDADD += $(OPENMP_CXXFLAGS)
+tesseract_LDADD += $(libarchive_LIBS)

 if T_WIN
 tesseract_LDADD += -ltiff
--- a/src/ccutil/Makefile.am
+++ b/src/ccutil/Makefile.am
@ -40,6 +40,8 @@ libtesseract_ccutil_la_SOURCES = \
    unichar.cpp unicharcompress.cpp unicharmap.cpp unicharset.cpp unicodes.cpp \
    params.cpp universalambigs.cpp

+AM_CPPFLAGS += $(libarchive_CFLAGS)
+
 if T_WIN
 AM_CPPFLAGS += -DWINDLLNAME=\"lib@GENERIC_LIBRARY_NAME@\"
 endif
--- a/src/ccutil/tessdatamanager.cpp
+++ b/src/ccutil/tessdatamanager.cpp
@ -2,7 +2,6 @@
 // File:        tessdatamanager.cpp
 // Description: Functions to handle loading/combining tesseract data files.
 // Author:      Daria Antonova
-// Created:     Wed Jun 03 11:26:43 PST 2009
 //
 // (C) Copyright 2009, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
@ -24,6 +23,12 @@
 #include "tessdatamanager.h"

 #include <cstdio>
+#include <string>
+
+#if defined(HAVE_LIBARCHIVE)
+#include <archive.h>
+#include <archive_entry.h>
+#endif

 #include "errcode.h"
 #include "helpers.h"
@ -52,9 +57,49 @@ void TessdataManager::LoadFileLater(const char *data_file_name) {
  data_file_name_ = data_file_name;
 }

+#if defined(HAVE_LIBARCHIVE)
+// Loads every recognised tessdata component from the archive `filename`
+// (any container/compression format enabled in libarchive) into entries_.
+// Archive members whose names TessdataTypeFromFileName() does not map to a
+// component, empty members and members that cannot be read completely are
+// skipped. Returns true, and marks the manager as loaded, only if at least
+// one component was read successfully by this call.
+bool TessdataManager::LoadArchiveFile(const char *filename) {
+  archive *a = archive_read_new();
+  if (a == nullptr) return false;
+  archive_read_support_filter_all(a);
+  archive_read_support_format_all(a);
+  // 8192 is the read block size handed to libarchive.
+  if (archive_read_open_filename(a, filename, 8192) != ARCHIVE_OK) {
+#if defined(DEBUG)
+    // archive_errno() can hold libarchive-specific codes that strerror()
+    // does not understand, so report the library's own message instead.
+    const char *message = archive_error_string(a);
+    tprintf("archive_read_open_filename(...,%s,...) failed, %s\n",
+            filename, message != nullptr ? message : "unknown error");
+#endif
+    archive_read_free(a);
+    return false;
+  }
+  // Success is tracked locally so that a stale is_loaded_ left over from an
+  // earlier load cannot make this call report success without reading data.
+  bool loaded_any = false;
+  archive_entry *ae;
+  while (archive_read_next_header(a, &ae) == ARCHIVE_OK) {
+    const char *component = archive_entry_pathname(ae);
+    if (component == nullptr) continue;
+    TessdataType type;
+    if (!TessdataTypeFromFileName(component, &type)) continue;
+    const int64_t size = archive_entry_size(ae);
+    if (size <= 0) continue;
+    // Skip members whose size would be truncated when narrowed to the
+    // vector's size type (assumed to be int -- TODO confirm).
+    const int length = static_cast<int>(size);
+    if (length != size) continue;
+    entries_[type].resize_no_init(length);
+    if (archive_read_data(a, &entries_[type][0], size) == size) {
+      loaded_any = true;
+    } else {
+      // A short or failed read leaves uninitialised bytes behind; drop the
+      // entry so the component is not mistaken for valid data.
+      entries_[type].clear();
+    }
+  }
+  archive_read_free(a);
+  if (loaded_any) is_loaded_ = true;
+  return loaded_any;
+}
+#endif
+
 bool TessdataManager::Init(const char *data_file_name) {
  GenericVector<char> data;
  if (reader_ == nullptr) {
+#if defined(HAVE_LIBARCHIVE)
+    if (LoadArchiveFile(data_file_name)) return true;
+#endif
    if (!LoadDataFromFile(data_file_name, &data)) return false;
  } else {
    if (!(*reader_)(data_file_name, &data)) return false;
@ -65,6 +110,7 @@ bool TessdataManager::Init(const char *data_file_name) {
 // Loads from the given memory buffer as if a file.
 bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
                                    int size) {
+  // TODO: This method supports only the proprietary file format.
  Clear();
  data_file_name_ = name;
  TFile fp;
@ -78,10 +124,10 @@ bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
  GenericVector<int64_t> offset_table;
  offset_table.resize_no_init(num_entries);
  if (!fp.DeSerialize(&offset_table[0], num_entries)) return false;
-  for (int i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
+  for (unsigned i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
    if (offset_table[i] >= 0) {
      int64_t entry_size = size - offset_table[i];
-      int j = i + 1;
+      unsigned j = i + 1;
      while (j < num_entries && offset_table[j] == -1) ++j;
      if (j < num_entries) entry_size = offset_table[j] - offset_table[i];
      entries_[i].resize_no_init(entry_size);
@ -106,6 +152,7 @@ void TessdataManager::OverwriteEntry(TessdataType type, const char *data,
 // Saves to the given filename.
 bool TessdataManager::SaveFile(const STRING &filename,
                               FileWriter writer) const {
+  // TODO: This method supports only the proprietary file format.
  ASSERT_HOST(is_loaded_);
  GenericVector<char> data;
  Serialize(&data);
@ -117,11 +164,12 @@ bool TessdataManager::SaveFile(const STRING &filename,

 // Serializes to the given vector.
 void TessdataManager::Serialize(GenericVector<char> *data) const {
+  // TODO: This method supports only the proprietary file format.
  ASSERT_HOST(is_loaded_);
  // Compute the offset_table and total size.
  int64_t offset_table[TESSDATA_NUM_ENTRIES];
  int64_t offset = sizeof(int32_t) + sizeof(offset_table);
-  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
+  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
    if (entries_[i].empty()) {
      offset_table[i] = -1;
    } else {
@ -135,7 +183,7 @@ void TessdataManager::Serialize(GenericVector<char> *data) const {
  fp.OpenWrite(data);
  fp.Serialize(&num_entries);
  fp.Serialize(&offset_table[0], countof(offset_table));
-  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
+  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
    if (!entries_[i].empty()) {
      fp.Serialize(&entries_[i][0], entries_[i].size());
    }
@ -144,7 +192,7 @@ void TessdataManager::Serialize(GenericVector<char> *data) const {

 // Resets to the initial state, keeping the reader.
 void TessdataManager::Clear() {
-  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
+  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
    entries_[i].clear();
  }
  is_loaded_ = false;
@ -154,7 +202,7 @@ void TessdataManager::Clear() {
 void TessdataManager::Directory() const {
  tprintf("Version string:%s\n", VersionString().c_str());
  int offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t);
-  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
+  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
    if (!entries_[i].empty()) {
      tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i],
              entries_[i].size(), offset);
@ -197,7 +245,7 @@ bool TessdataManager::CombineDataFiles(
    const char *language_data_path_prefix,
    const char *output_filename) {
  // Load individual tessdata components from files.
-  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
+  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
    TessdataType type;
    ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type));
    STRING filename = language_data_path_prefix;
@ -229,6 +277,7 @@ bool TessdataManager::OverwriteComponents(
    char **component_filenames,
    int num_new_components) {
  // Open the files with the new components.
+  // TODO: This method supports only the proprietary file format.
  for (int i = 0; i < num_new_components; ++i) {
    TessdataType type;
    if (TessdataTypeFromFileName(component_filenames[i], &type)) {
@ -253,14 +302,16 @@ bool TessdataManager::ExtractToFile(const char *filename) {

 bool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix,
                                                 TessdataType *type) {
-  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
+  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
    if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
      *type = static_cast<TessdataType>(i);
      return true;
    }
  }
+#if defined(DEBUG)
  tprintf("TessdataManager can't determine which tessdata"
         " component is represented by %s\n", suffix);
+#endif
  return false;
 }

--- a/src/ccutil/tessdatamanager.h
+++ b/src/ccutil/tessdatamanager.h
@ -214,6 +214,11 @@ class TessdataManager {
   */
  bool ExtractToFile(const char *filename);

+ private:
+
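+  // Loads the tessdata components from an archive file using libarchive.
+  // The definition is only compiled when HAVE_LIBARCHIVE is defined.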
+  bool LoadArchiveFile(const char *filename);
+
  /**
   * Fills type with TessdataType of the tessdata component represented by the
   * given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET.
@ -230,7 +235,6 @@ class TessdataManager {
  static bool TessdataTypeFromFileName(const char *filename,
                                       TessdataType *type);

- private:
  // Name of file it came from.
  STRING data_file_name_;
  // Function to load the file when we need it.
--- a/src/training/Makefile.am
+++ b/src/training/Makefile.am
@ -280,3 +280,23 @@ set_unicharset_properties_LDADD += $(LEPTONICA_LIBS)
 text2image_LDADD += $(LEPTONICA_LIBS)
 unicharset_extractor_LDADD += $(LEPTONICA_LIBS)
 wordlist2dawg_LDADD += $(LEPTONICA_LIBS)
+
+extralib = $(libarchive_LIBS)
+
+if !DISABLED_LEGACY_ENGINE
+ambiguous_words_LDADD += $(extralib)
+classifier_tester_LDADD += $(extralib)
+cntraining_LDADD += $(extralib)
+mftraining_LDADD += $(extralib)
+shapeclustering_LDADD += $(extralib)
+endif
+combine_lang_model_LDADD += $(extralib)
+combine_tessdata_LDADD += $(extralib)
+dawg2wordlist_LDADD += $(extralib)
+lstmeval_LDADD += $(extralib)
+lstmtraining_LDADD += $(extralib)
+merge_unicharsets_LDADD += $(extralib)
+set_unicharset_properties_LDADD += $(extralib)
+text2image_LDADD += $(extralib)
+unicharset_extractor_LDADD += $(extralib)
+wordlist2dawg_LDADD += $(extralib)