Merge pull request #2290 from stweil/libarchive

Add initial support for traineddata files in standard archive formats
This commit is contained in:
zdenop 2019-03-05 17:42:13 +01:00 committed by GitHub
commit 868a623f8d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 95 additions and 10 deletions

View File

@ -21,6 +21,7 @@ addons:
sources:
#- ubuntu-toolchain-r-test
packages:
- libarchive-dev
#- g++-6
#matrix:

View File

@ -422,6 +422,12 @@ else
AC_MSG_ERROR([Leptonica 1.74 or higher is required. Try to install libleptonica-dev package.])
fi
PKG_CHECK_MODULES([libarchive], [libarchive], [have_libarchive=true], [have_libarchive=false])
AM_CONDITIONAL([HAVE_LIBARCHIVE], [$have_libarchive])
if $have_libarchive; then
AC_DEFINE([HAVE_LIBARCHIVE], [], [Enable libarchive])
fi
AM_CONDITIONAL([ENABLE_TRAINING], true)
# Check availability of ICU packages.

View File

@ -88,6 +88,7 @@ tesseract_LDFLAGS = $(OPENCL_LDFLAGS)
tesseract_LDADD += $(LEPTONICA_LIBS)
tesseract_LDADD += $(OPENMP_CXXFLAGS)
tesseract_LDADD += $(libarchive_LIBS)
if T_WIN
tesseract_LDADD += -ltiff

View File

@ -40,6 +40,8 @@ libtesseract_ccutil_la_SOURCES = \
unichar.cpp unicharcompress.cpp unicharmap.cpp unicharset.cpp unicodes.cpp \
params.cpp universalambigs.cpp
AM_CPPFLAGS += $(libarchive_CFLAGS)
if T_WIN
AM_CPPFLAGS += -DWINDLLNAME=\"lib@GENERIC_LIBRARY_NAME@\"
endif

View File

@ -2,7 +2,6 @@
// File: tessdatamanager.cpp
// Description: Functions to handle loading/combining tesseract data files.
// Author: Daria Antonova
// Created: Wed Jun 03 11:26:43 PST 2009
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
@ -24,6 +23,12 @@
#include "tessdatamanager.h"
#include <cstdio>
#include <string>
#if defined(HAVE_LIBARCHIVE)
#include <archive.h>
#include <archive_entry.h>
#endif
#include "errcode.h"
#include "helpers.h"
@ -52,9 +57,49 @@ void TessdataManager::LoadFileLater(const char *data_file_name) {
data_file_name_ = data_file_name;
}
#if defined(HAVE_LIBARCHIVE)
bool TessdataManager::LoadArchiveFile(const char *filename) {
bool result = false;
archive *a = archive_read_new();
if (a != nullptr) {
archive_read_support_filter_all(a);
archive_read_support_format_all(a);
if (archive_read_open_filename(a, filename, 8192) == ARCHIVE_OK) {
archive_entry *ae;
while (archive_read_next_header(a, &ae) == ARCHIVE_OK) {
const char *component = archive_entry_pathname(ae);
if (component != nullptr) {
TessdataType type;
if (TessdataTypeFromFileName(component, &type)) {
int64_t size = archive_entry_size(ae);
if (size > 0) {
entries_[type].resize_no_init(size);
if (archive_read_data(a, &entries_[type][0], size) == size) {
is_loaded_ = true;
}
}
}
}
}
result = is_loaded_;
#if defined(DEBUG)
} else {
tprintf("archive_read_open_filename(...,%s,...) failed, %s\n",
filename, strerror(archive_errno(a)));
#endif
}
archive_read_free(a);
}
return result;
}
#endif
bool TessdataManager::Init(const char *data_file_name) {
GenericVector<char> data;
if (reader_ == nullptr) {
#if defined(HAVE_LIBARCHIVE)
if (LoadArchiveFile(data_file_name)) return true;
#endif
if (!LoadDataFromFile(data_file_name, &data)) return false;
} else {
if (!(*reader_)(data_file_name, &data)) return false;
@ -65,6 +110,7 @@ bool TessdataManager::Init(const char *data_file_name) {
// Loads from the given memory buffer as if a file.
bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
int size) {
// TODO: This method supports only the proprietary file format.
Clear();
data_file_name_ = name;
TFile fp;
@ -78,10 +124,10 @@ bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
GenericVector<int64_t> offset_table;
offset_table.resize_no_init(num_entries);
if (!fp.DeSerialize(&offset_table[0], num_entries)) return false;
for (int i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
for (unsigned i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
if (offset_table[i] >= 0) {
int64_t entry_size = size - offset_table[i];
int j = i + 1;
unsigned j = i + 1;
while (j < num_entries && offset_table[j] == -1) ++j;
if (j < num_entries) entry_size = offset_table[j] - offset_table[i];
entries_[i].resize_no_init(entry_size);
@ -106,6 +152,7 @@ void TessdataManager::OverwriteEntry(TessdataType type, const char *data,
// Saves to the given filename.
bool TessdataManager::SaveFile(const STRING &filename,
FileWriter writer) const {
// TODO: This method supports only the proprietary file format.
ASSERT_HOST(is_loaded_);
GenericVector<char> data;
Serialize(&data);
@ -117,11 +164,12 @@ bool TessdataManager::SaveFile(const STRING &filename,
// Serializes to the given vector.
void TessdataManager::Serialize(GenericVector<char> *data) const {
// TODO: This method supports only the proprietary file format.
ASSERT_HOST(is_loaded_);
// Compute the offset_table and total size.
int64_t offset_table[TESSDATA_NUM_ENTRIES];
int64_t offset = sizeof(int32_t) + sizeof(offset_table);
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
if (entries_[i].empty()) {
offset_table[i] = -1;
} else {
@ -135,7 +183,7 @@ void TessdataManager::Serialize(GenericVector<char> *data) const {
fp.OpenWrite(data);
fp.Serialize(&num_entries);
fp.Serialize(&offset_table[0], countof(offset_table));
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
if (!entries_[i].empty()) {
fp.Serialize(&entries_[i][0], entries_[i].size());
}
@ -144,7 +192,7 @@ void TessdataManager::Serialize(GenericVector<char> *data) const {
// Resets to the initial state, keeping the reader.
void TessdataManager::Clear() {
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
entries_[i].clear();
}
is_loaded_ = false;
@ -154,7 +202,7 @@ void TessdataManager::Clear() {
void TessdataManager::Directory() const {
tprintf("Version string:%s\n", VersionString().c_str());
int offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t);
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
if (!entries_[i].empty()) {
tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i],
entries_[i].size(), offset);
@ -197,7 +245,7 @@ bool TessdataManager::CombineDataFiles(
const char *language_data_path_prefix,
const char *output_filename) {
// Load individual tessdata components from files.
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
TessdataType type;
ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type));
STRING filename = language_data_path_prefix;
@ -229,6 +277,7 @@ bool TessdataManager::OverwriteComponents(
char **component_filenames,
int num_new_components) {
// Open the files with the new components.
// TODO: This method supports only the proprietary file format.
for (int i = 0; i < num_new_components; ++i) {
TessdataType type;
if (TessdataTypeFromFileName(component_filenames[i], &type)) {
@ -253,14 +302,16 @@ bool TessdataManager::ExtractToFile(const char *filename) {
bool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix,
TessdataType *type) {
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
*type = static_cast<TessdataType>(i);
return true;
}
}
#if defined(DEBUG)
tprintf("TessdataManager can't determine which tessdata"
" component is represented by %s\n", suffix);
#endif
return false;
}

View File

@ -214,6 +214,11 @@ class TessdataManager {
*/
bool ExtractToFile(const char *filename);
private:
// Use libarchive.
bool LoadArchiveFile(const char *filename);
/**
* Fills type with TessdataType of the tessdata component represented by the
* given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET.
@ -230,7 +235,6 @@ class TessdataManager {
static bool TessdataTypeFromFileName(const char *filename,
TessdataType *type);
private:
// Name of file it came from.
STRING data_file_name_;
// Function to load the file when we need it.

View File

@ -280,3 +280,23 @@ set_unicharset_properties_LDADD += $(LEPTONICA_LIBS)
text2image_LDADD += $(LEPTONICA_LIBS)
unicharset_extractor_LDADD += $(LEPTONICA_LIBS)
wordlist2dawg_LDADD += $(LEPTONICA_LIBS)
extralib = $(libarchive_LIBS)
if !DISABLED_LEGACY_ENGINE
ambiguous_words_LDADD += $(extralib)
classifier_tester_LDADD += $(extralib)
cntraining_LDADD += $(extralib)
mftraining_LDADD += $(extralib)
shapeclustering_LDADD += $(extralib)
endif
combine_lang_model_LDADD += $(extralib)
combine_tessdata_LDADD += $(extralib)
dawg2wordlist_LDADD += $(extralib)
lstmeval_LDADD += $(extralib)
lstmtraining_LDADD += $(extralib)
merge_unicharsets_LDADD += $(extralib)
set_unicharset_properties_LDADD += $(extralib)
text2image_LDADD += $(extralib)
unicharset_extractor_LDADD += $(extralib)
wordlist2dawg_LDADD += $(extralib)