Merge pull request #2290 from stweil/libarchive

Add initial support for traineddata files in standard archive formats
2024-11-27 20:59:36 +08:00 · 2019-03-05 17:42:13 +01:00 · 2019-03-05 17:42:13 +01:00 · 868a623f8d
commit 868a623f8d
parent 7fbde96a04 1c7e00611b
7 changed files with 95 additions and 10 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -21,6 +21,7 @@ addons:
    sources:
    #- ubuntu-toolchain-r-test
    packages:
+      - libarchive-dev
    #- g++-6

 #matrix:
--- a/configure.ac
+++ b/configure.ac
@ -422,6 +422,12 @@ else
  AC_MSG_ERROR([Leptonica 1.74 or higher is required. Try to install libleptonica-dev package.])
 fi

+PKG_CHECK_MODULES([libarchive], [libarchive], [have_libarchive=true], [have_libarchive=false])
+AM_CONDITIONAL([HAVE_LIBARCHIVE], [$have_libarchive])
+if $have_libarchive; then
+  AC_DEFINE([HAVE_LIBARCHIVE], [], [Enable libarchive])
+fi
+
 AM_CONDITIONAL([ENABLE_TRAINING], true)

 # Check availability of ICU packages.
--- a/src/api/Makefile.am
+++ b/src/api/Makefile.am
@ -88,6 +88,7 @@ tesseract_LDFLAGS = $(OPENCL_LDFLAGS)

 tesseract_LDADD += $(LEPTONICA_LIBS)
 tesseract_LDADD += $(OPENMP_CXXFLAGS)
+tesseract_LDADD += $(libarchive_LIBS)

 if T_WIN
 tesseract_LDADD += -ltiff
--- a/src/ccutil/Makefile.am
+++ b/src/ccutil/Makefile.am
@ -40,6 +40,8 @@ libtesseract_ccutil_la_SOURCES = \
    unichar.cpp unicharcompress.cpp unicharmap.cpp unicharset.cpp unicodes.cpp \
    params.cpp universalambigs.cpp

+AM_CPPFLAGS += $(libarchive_CFLAGS)
+
 if T_WIN
 AM_CPPFLAGS += -DWINDLLNAME=\"lib@GENERIC_LIBRARY_NAME@\"
 endif
--- a/src/ccutil/tessdatamanager.cpp
+++ b/src/ccutil/tessdatamanager.cpp
@ -2,7 +2,6 @@
 // File:        tessdatamanager.cpp
 // Description: Functions to handle loading/combining tesseract data files.
 // Author:      Daria Antonova
-// Created:     Wed Jun 03 11:26:43 PST 2009
 //
 // (C) Copyright 2009, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
@ -24,6 +23,12 @@
 #include "tessdatamanager.h"

 #include <cstdio>
+#include <string>
+
+#if defined(HAVE_LIBARCHIVE)
+#include <archive.h>
+#include <archive_entry.h>
+#endif

 #include "errcode.h"
 #include "helpers.h"
@ -52,9 +57,49 @@ void TessdataManager::LoadFileLater(const char *data_file_name) {
  data_file_name_ = data_file_name;
 }

+#if defined(HAVE_LIBARCHIVE)
+// Loads tessdata components from an archive file using libarchive.
+//
+// All filters and formats known to libarchive are enabled. Every archive
+// member whose path name is accepted by TessdataTypeFromFileName() and whose
+// size is positive is read into the matching entries_ slot; all other members
+// are skipped.
+//
+// filename: path of the archive file to open.
+// Returns false if the reader cannot be allocated or the file cannot be
+// opened as an archive. Otherwise returns is_loaded_ as it stands after the
+// scan; is_loaded_ is set to true as soon as one component was read in full.
+// NOTE(review): entries_ and is_loaded_ are not reset here, so state from an
+// earlier load is kept -- confirm whether a Clear() is expected beforehand.
+bool TessdataManager::LoadArchiveFile(const char *filename) {
+  archive *a = archive_read_new();
+  if (a == nullptr) {
+    return false;
+  }
+  bool result = false;
+  archive_read_support_filter_all(a);
+  archive_read_support_format_all(a);
+  if (archive_read_open_filename(a, filename, 8192) == ARCHIVE_OK) {
+    archive_entry *ae;
+    while (archive_read_next_header(a, &ae) == ARCHIVE_OK) {
+      const char *component = archive_entry_pathname(ae);
+      TessdataType type;
+      if (component == nullptr ||
+          !TessdataTypeFromFileName(component, &type)) {
+        continue;  // Not a known tessdata component.
+      }
+      const int64_t size = archive_entry_size(ae);
+      if (size <= 0) {
+        continue;  // Empty member or size not available.
+      }
+      entries_[type].resize_no_init(size);
+      if (archive_read_data(a, &entries_[type][0], size) == size) {
+        is_loaded_ = true;
+      } else {
+        // A short or failed read leaves an incomplete component. Drop it,
+        // because any non-empty entry is treated as present elsewhere.
+        entries_[type].clear();
+      }
+    }
+    result = is_loaded_;
+  } else {
+#if defined(DEBUG)
+    // archive_error_string() is libarchive's own description of the failure;
+    // archive_errno() values are not guaranteed to be valid for strerror().
+    const char *message = archive_error_string(a);
+    tprintf("archive_read_open_filename(...,%s,...) failed, %s\n",
+            filename, message != nullptr ? message : "unknown error");
+#endif
+  }
+  archive_read_free(a);
+  return result;
+}
+#endif
+
 bool TessdataManager::Init(const char *data_file_name) {
  GenericVector<char> data;
  if (reader_ == nullptr) {
+#if defined(HAVE_LIBARCHIVE)
+    if (LoadArchiveFile(data_file_name)) return true;
+#endif
    if (!LoadDataFromFile(data_file_name, &data)) return false;
  } else {
    if (!(*reader_)(data_file_name, &data)) return false;
@ -65,6 +110,7 @@ bool TessdataManager::Init(const char *data_file_name) {
 // Loads from the given memory buffer as if a file.
 bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
                                    int size) {
+  // TODO: This method supports only the proprietary file format.
  Clear();
  data_file_name_ = name;
  TFile fp;
@ -78,10 +124,10 @@ bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
  GenericVector<int64_t> offset_table;
  offset_table.resize_no_init(num_entries);
  if (!fp.DeSerialize(&offset_table[0], num_entries)) return false;
-  for (int i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
+  for (unsigned i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
    if (offset_table[i] >= 0) {
      int64_t entry_size = size - offset_table[i];
-      int j = i + 1;
+      unsigned j = i + 1;
      while (j < num_entries && offset_table[j] == -1) ++j;
      if (j < num_entries) entry_size = offset_table[j] - offset_table[i];
      entries_[i].resize_no_init(entry_size);
@ -106,6 +152,7 @@ void TessdataManager::OverwriteEntry(TessdataType type, const char *data,
 // Saves to the given filename.
 bool TessdataManager::SaveFile(const STRING &filename,
                               FileWriter writer) const {
+  // TODO: This method supports only the proprietary file format.
  ASSERT_HOST(is_loaded_);
  GenericVector<char> data;
  Serialize(&data);
@ -117,11 +164,12 @@ bool TessdataManager::SaveFile(const STRING &filename,

 // Serializes to the given vector.
 void TessdataManager::Serialize(GenericVector<char> *data) const {
+  // TODO: This method supports only the proprietary file format.
  ASSERT_HOST(is_loaded_);
  // Compute the offset_table and total size.
  int64_t offset_table[TESSDATA_NUM_ENTRIES];
  int64_t offset = sizeof(int32_t) + sizeof(offset_table);
-  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
+  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
    if (entries_[i].empty()) {
      offset_table[i] = -1;
    } else {
@ -135,7 +183,7 @@ void TessdataManager::Serialize(GenericVector<char> *data) const {
  fp.OpenWrite(data);
  fp.Serialize(&num_entries);
  fp.Serialize(&offset_table[0], countof(offset_table));
-  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
+  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
    if (!entries_[i].empty()) {
      fp.Serialize(&entries_[i][0], entries_[i].size());
    }
@ -144,7 +192,7 @@ void TessdataManager::Serialize(GenericVector<char> *data) const {

 // Resets to the initial state, keeping the reader.
 void TessdataManager::Clear() {
-  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
+  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
    entries_[i].clear();
  }
  is_loaded_ = false;
@ -154,7 +202,7 @@ void TessdataManager::Clear() {
 void TessdataManager::Directory() const {
  tprintf("Version string:%s\n", VersionString().c_str());
  int offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t);
-  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
+  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
    if (!entries_[i].empty()) {
      tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i],
              entries_[i].size(), offset);
@ -197,7 +245,7 @@ bool TessdataManager::CombineDataFiles(
    const char *language_data_path_prefix,
    const char *output_filename) {
  // Load individual tessdata components from files.
-  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
+  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
    TessdataType type;
    ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type));
    STRING filename = language_data_path_prefix;
@ -229,6 +277,7 @@ bool TessdataManager::OverwriteComponents(
    char **component_filenames,
    int num_new_components) {
  // Open the files with the new components.
+  // TODO: This method supports only the proprietary file format.
  for (int i = 0; i < num_new_components; ++i) {
    TessdataType type;
    if (TessdataTypeFromFileName(component_filenames[i], &type)) {
@ -253,14 +302,16 @@ bool TessdataManager::ExtractToFile(const char *filename) {

 bool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix,
                                                 TessdataType *type) {
-  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
+  for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
    if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
      *type = static_cast<TessdataType>(i);
      return true;
    }
  }
+#if defined(DEBUG)
  tprintf("TessdataManager can't determine which tessdata"
         " component is represented by %s\n", suffix);
+#endif
  return false;
 }

--- a/src/ccutil/tessdatamanager.h
+++ b/src/ccutil/tessdatamanager.h
@ -214,6 +214,11 @@ class TessdataManager {
   */
  bool ExtractToFile(const char *filename);

+ private:
+
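+  // Loads the tessdata components from an archive file by means of
+  // libarchive. The definition is only compiled if HAVE_LIBARCHIVE is defined.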
+  bool LoadArchiveFile(const char *filename);
+
  /**
   * Fills type with TessdataType of the tessdata component represented by the
   * given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET.
@ -230,7 +235,6 @@ class TessdataManager {
  static bool TessdataTypeFromFileName(const char *filename,
                                       TessdataType *type);

- private:
  // Name of file it came from.
  STRING data_file_name_;
  // Function to load the file when we need it.
--- a/src/training/Makefile.am
+++ b/src/training/Makefile.am
@ -280,3 +280,23 @@ set_unicharset_properties_LDADD += $(LEPTONICA_LIBS)
 text2image_LDADD += $(LEPTONICA_LIBS)
 unicharset_extractor_LDADD += $(LEPTONICA_LIBS)
 wordlist2dawg_LDADD += $(LEPTONICA_LIBS)
+
+extralib = $(libarchive_LIBS)
+
+if !DISABLED_LEGACY_ENGINE
+ambiguous_words_LDADD += $(extralib)
+classifier_tester_LDADD += $(extralib)
+cntraining_LDADD += $(extralib)
+mftraining_LDADD += $(extralib)
+shapeclustering_LDADD += $(extralib)
+endif
+combine_lang_model_LDADD += $(extralib)
+combine_tessdata_LDADD += $(extralib)
+dawg2wordlist_LDADD += $(extralib)
+lstmeval_LDADD += $(extralib)
+lstmtraining_LDADD += $(extralib)
+merge_unicharsets_LDADD += $(extralib)
+set_unicharset_properties_LDADD += $(extralib)
+text2image_LDADD += $(extralib)
+unicharset_extractor_LDADD += $(extralib)
+wordlist2dawg_LDADD += $(extralib)