diff --git a/.travis.yml b/.travis.yml index 02f7c567..5df8866b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,6 +21,7 @@ addons: sources: #- ubuntu-toolchain-r-test packages: + - libarchive-dev #- g++-6 #matrix: diff --git a/configure.ac b/configure.ac index 1ba3ff38..ef5d1b47 100644 --- a/configure.ac +++ b/configure.ac @@ -422,6 +422,12 @@ else AC_MSG_ERROR([Leptonica 1.74 or higher is required. Try to install libleptonica-dev package.]) fi +PKG_CHECK_MODULES([libarchive], [libarchive], [have_libarchive=true], [have_libarchive=false]) +AM_CONDITIONAL([HAVE_LIBARCHIVE], [$have_libarchive]) +if $have_libarchive; then + AC_DEFINE([HAVE_LIBARCHIVE], [], [Enable libarchive]) +fi + AM_CONDITIONAL([ENABLE_TRAINING], true) # Check availability of ICU packages. diff --git a/src/api/Makefile.am b/src/api/Makefile.am index 2ab91184..88eee32b 100644 --- a/src/api/Makefile.am +++ b/src/api/Makefile.am @@ -88,6 +88,7 @@ tesseract_LDFLAGS = $(OPENCL_LDFLAGS) tesseract_LDADD += $(LEPTONICA_LIBS) tesseract_LDADD += $(OPENMP_CXXFLAGS) +tesseract_LDADD += $(libarchive_LIBS) if T_WIN tesseract_LDADD += -ltiff diff --git a/src/ccutil/Makefile.am b/src/ccutil/Makefile.am index 05d80b62..692fd330 100644 --- a/src/ccutil/Makefile.am +++ b/src/ccutil/Makefile.am @@ -40,6 +40,8 @@ libtesseract_ccutil_la_SOURCES = \ unichar.cpp unicharcompress.cpp unicharmap.cpp unicharset.cpp unicodes.cpp \ params.cpp universalambigs.cpp +AM_CPPFLAGS += $(libarchive_CFLAGS) + if T_WIN AM_CPPFLAGS += -DWINDLLNAME=\"lib@GENERIC_LIBRARY_NAME@\" endif diff --git a/src/ccutil/tessdatamanager.cpp b/src/ccutil/tessdatamanager.cpp index 9a7a75c9..daf57eb6 100644 --- a/src/ccutil/tessdatamanager.cpp +++ b/src/ccutil/tessdatamanager.cpp @@ -2,7 +2,6 @@ // File: tessdatamanager.cpp // Description: Functions to handle loading/combining tesseract data files. // Author: Daria Antonova -// Created: Wed Jun 03 11:26:43 PST 2009 // // (C) Copyright 2009, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); @@ -24,6 +23,12 @@ #include "tessdatamanager.h" #include +#include + +#if defined(HAVE_LIBARCHIVE) +#include +#include +#endif #include "errcode.h" #include "helpers.h" @@ -52,9 +57,49 @@ void TessdataManager::LoadFileLater(const char *data_file_name) { data_file_name_ = data_file_name; } +#if defined(HAVE_LIBARCHIVE) +bool TessdataManager::LoadArchiveFile(const char *filename) { + bool result = false; + archive *a = archive_read_new(); + if (a != nullptr) { + archive_read_support_filter_all(a); + archive_read_support_format_all(a); + if (archive_read_open_filename(a, filename, 8192) == ARCHIVE_OK) { + archive_entry *ae; + while (archive_read_next_header(a, &ae) == ARCHIVE_OK) { + const char *component = archive_entry_pathname(ae); + if (component != nullptr) { + TessdataType type; + if (TessdataTypeFromFileName(component, &type)) { + int64_t size = archive_entry_size(ae); + if (size > 0) { + entries_[type].resize_no_init(size); + if (archive_read_data(a, &entries_[type][0], size) == size) { + is_loaded_ = true; + } + } + } + } + } + result = is_loaded_; +#if defined(DEBUG) + } else { + tprintf("archive_read_open_filename(...,%s,...) failed, %s\n", + filename, strerror(archive_errno(a))); +#endif + } + archive_read_free(a); + } + return result; +} +#endif + bool TessdataManager::Init(const char *data_file_name) { GenericVector data; if (reader_ == nullptr) { +#if defined(HAVE_LIBARCHIVE) + if (LoadArchiveFile(data_file_name)) return true; +#endif if (!LoadDataFromFile(data_file_name, &data)) return false; } else { if (!(*reader_)(data_file_name, &data)) return false; @@ -65,6 +110,7 @@ bool TessdataManager::Init(const char *data_file_name) { // Loads from the given memory buffer as if a file. bool TessdataManager::LoadMemBuffer(const char *name, const char *data, int size) { + // TODO: This method supports only the proprietary file format. Clear(); data_file_name_ = name; TFile fp; @@ -78,10 +124,10 @@ bool TessdataManager::LoadMemBuffer(const char *name, const char *data, GenericVector offset_table; offset_table.resize_no_init(num_entries); if (!fp.DeSerialize(&offset_table[0], num_entries)) return false; - for (int i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) { + for (unsigned i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) { if (offset_table[i] >= 0) { int64_t entry_size = size - offset_table[i]; - int j = i + 1; + unsigned j = i + 1; while (j < num_entries && offset_table[j] == -1) ++j; if (j < num_entries) entry_size = offset_table[j] - offset_table[i]; entries_[i].resize_no_init(entry_size); @@ -106,6 +152,7 @@ void TessdataManager::OverwriteEntry(TessdataType type, const char *data, // Saves to the given filename. bool TessdataManager::SaveFile(const STRING &filename, FileWriter writer) const { + // TODO: This method supports only the proprietary file format. ASSERT_HOST(is_loaded_); GenericVector data; Serialize(&data); @@ -117,11 +164,12 @@ bool TessdataManager::SaveFile(const STRING &filename, // Serializes to the given vector. void TessdataManager::Serialize(GenericVector *data) const { + // TODO: This method supports only the proprietary file format. ASSERT_HOST(is_loaded_); // Compute the offset_table and total size. int64_t offset_table[TESSDATA_NUM_ENTRIES]; int64_t offset = sizeof(int32_t) + sizeof(offset_table); - for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { if (entries_[i].empty()) { offset_table[i] = -1; } else { @@ -135,7 +183,7 @@ void TessdataManager::Serialize(GenericVector *data) const { fp.OpenWrite(data); fp.Serialize(&num_entries); fp.Serialize(&offset_table[0], countof(offset_table)); - for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { if (!entries_[i].empty()) { fp.Serialize(&entries_[i][0], entries_[i].size()); } @@ -144,7 +192,7 @@ void TessdataManager::Serialize(GenericVector *data) const { // Resets to the initial state, keeping the reader. void TessdataManager::Clear() { - for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { entries_[i].clear(); } is_loaded_ = false; @@ -154,7 +202,7 @@ void TessdataManager::Clear() { void TessdataManager::Directory() const { tprintf("Version string:%s\n", VersionString().c_str()); int offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t); - for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { if (!entries_[i].empty()) { tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i], entries_[i].size(), offset); @@ -197,7 +245,7 @@ bool TessdataManager::CombineDataFiles( const char *language_data_path_prefix, const char *output_filename) { // Load individual tessdata components from files. - for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { TessdataType type; ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type)); STRING filename = language_data_path_prefix; @@ -229,6 +277,7 @@ bool TessdataManager::OverwriteComponents( char **component_filenames, int num_new_components) { // Open the files with the new components. + // TODO: This method supports only the proprietary file format. for (int i = 0; i < num_new_components; ++i) { TessdataType type; if (TessdataTypeFromFileName(component_filenames[i], &type)) { @@ -253,14 +302,16 @@ bool TessdataManager::ExtractToFile(const char *filename) { bool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type) { - for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) { *type = static_cast(i); return true; } } +#if defined(DEBUG) tprintf("TessdataManager can't determine which tessdata" " component is represented by %s\n", suffix); +#endif return false; } diff --git a/src/ccutil/tessdatamanager.h b/src/ccutil/tessdatamanager.h index f003adb4..4372f291 100644 --- a/src/ccutil/tessdatamanager.h +++ b/src/ccutil/tessdatamanager.h @@ -214,6 +214,11 @@ class TessdataManager { */ bool ExtractToFile(const char *filename); + private: + + // Use libarchive. + bool LoadArchiveFile(const char *filename); + /** * Fills type with TessdataType of the tessdata component represented by the * given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET. @@ -230,7 +235,6 @@ class TessdataManager { static bool TessdataTypeFromFileName(const char *filename, TessdataType *type); - private: // Name of file it came from. STRING data_file_name_; // Function to load the file when we need it. diff --git a/src/training/Makefile.am b/src/training/Makefile.am index d86b8d41..41bf3735 100644 --- a/src/training/Makefile.am +++ b/src/training/Makefile.am @@ -280,3 +280,23 @@ set_unicharset_properties_LDADD += $(LEPTONICA_LIBS) text2image_LDADD += $(LEPTONICA_LIBS) unicharset_extractor_LDADD += $(LEPTONICA_LIBS) wordlist2dawg_LDADD += $(LEPTONICA_LIBS) + +extralib = $(libarchive_LIBS) + +if !DISABLED_LEGACY_ENGINE +ambiguous_words_LDADD += $(extralib) +classifier_tester_LDADD += $(extralib) +cntraining_LDADD += $(extralib) +mftraining_LDADD += $(extralib) +shapeclustering_LDADD += $(extralib) +endif +combine_lang_model_LDADD += $(extralib) +combine_tessdata_LDADD += $(extralib) +dawg2wordlist_LDADD += $(extralib) +lstmeval_LDADD += $(extralib) +lstmtraining_LDADD += $(extralib) +merge_unicharsets_LDADD += $(extralib) +set_unicharset_properties_LDADD += $(extralib) +text2image_LDADD += $(extralib) +unicharset_extractor_LDADD += $(extralib) +wordlist2dawg_LDADD += $(extralib)