Add initial support for traineddata files in standard archive formats

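This requires libarchive-dev. The dependency is optional at build time:
without it, HAVE_LIBARCHIVE is not defined and only the original
traineddata format can be loaded.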

Tesseract can now load traineddata files in any of the archive formats
which are supported by libarchive. Example of a zipped BagIt archive:

    $ unzip -l /usr/local/share/tessdata/zip.traineddata
    Archive:  /usr/local/share/tessdata/zip.traineddata
      Length      Date    Time    Name
    ---------  ---------- -----   ----
           55  2019-03-05 15:27   bagit.txt
            0  2019-03-05 15:25   data/
         1557  2019-03-05 15:28   manifest-sha256.txt
      1082890  2019-03-05 15:25   data/eng.word-dawg
      1487588  2019-03-05 15:25   data/eng.lstm
         7477  2019-03-05 15:25   data/eng.unicharset
        63346  2019-03-05 15:25   data/eng.shapetable
       976552  2019-03-05 15:25   data/eng.inttemp
        13408  2019-03-05 15:25   data/eng.normproto
         4322  2019-03-05 15:25   data/eng.punc-dawg
         4738  2019-03-05 15:25   data/eng.lstm-number-dawg
         1410  2019-03-05 15:25   data/eng.freq-dawg
          844  2019-03-05 15:25   data/eng.pffmtable
         6360  2019-03-05 15:25   data/eng.lstm-unicharset
         1012  2019-03-05 15:25   data/eng.lstm-recoder
         1047  2019-03-05 15:25   data/eng.unicharambigs
         4322  2019-03-05 15:25   data/eng.lstm-punc-dawg
     16109842  2019-03-05 15:25   data/eng.bigram-dawg
           80  2019-03-05 15:25   data/eng.version
         6426  2019-03-05 15:25   data/eng.number-dawg
      3694794  2019-03-05 15:25   data/eng.lstm-word-dawg
    ---------                     -------
     23468070                     21 files

`combine_tessdata -d` and `combine_tessdata -u` also work.

The traineddata files in the new format can be generated with
standard tools like zip or tar.

More work is needed for other training tools and big endian support.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
Stefan Weil 2019-03-05 15:52:15 +01:00
parent 7fbde96a04
commit 1c7e00611b
7 changed files with 95 additions and 10 deletions

View File

@ -21,6 +21,7 @@ addons:
sources:
#- ubuntu-toolchain-r-test
packages:
- libarchive-dev
#- g++-6
#matrix:

View File

@ -422,6 +422,12 @@ else
AC_MSG_ERROR([Leptonica 1.74 or higher is required. Try to install libleptonica-dev package.])
fi
PKG_CHECK_MODULES([libarchive], [libarchive], [have_libarchive=true], [have_libarchive=false])
AM_CONDITIONAL([HAVE_LIBARCHIVE], [$have_libarchive])
if $have_libarchive; then
AC_DEFINE([HAVE_LIBARCHIVE], [], [Enable libarchive])
fi
AM_CONDITIONAL([ENABLE_TRAINING], true)
# Check availability of ICU packages.

View File

@ -88,6 +88,7 @@ tesseract_LDFLAGS = $(OPENCL_LDFLAGS)
tesseract_LDADD += $(LEPTONICA_LIBS)
tesseract_LDADD += $(OPENMP_CXXFLAGS)
tesseract_LDADD += $(libarchive_LIBS)
if T_WIN
tesseract_LDADD += -ltiff

View File

@ -40,6 +40,8 @@ libtesseract_ccutil_la_SOURCES = \
unichar.cpp unicharcompress.cpp unicharmap.cpp unicharset.cpp unicodes.cpp \
params.cpp universalambigs.cpp
AM_CPPFLAGS += $(libarchive_CFLAGS)
if T_WIN
AM_CPPFLAGS += -DWINDLLNAME=\"lib@GENERIC_LIBRARY_NAME@\"
endif

View File

@ -2,7 +2,6 @@
// File: tessdatamanager.cpp
// Description: Functions to handle loading/combining tesseract data files.
// Author: Daria Antonova
// Created: Wed Jun 03 11:26:43 PST 2009
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
@ -24,6 +23,12 @@
#include "tessdatamanager.h"
#include <cstdio>
#include <string>
#if defined(HAVE_LIBARCHIVE)
#include <archive.h>
#include <archive_entry.h>
#endif
#include "errcode.h"
#include "helpers.h"
@ -52,9 +57,49 @@ void TessdataManager::LoadFileLater(const char *data_file_name) {
data_file_name_ = data_file_name;
}
#if defined(HAVE_LIBARCHIVE)
bool TessdataManager::LoadArchiveFile(const char *filename) {
bool result = false;
archive *a = archive_read_new();
if (a != nullptr) {
archive_read_support_filter_all(a);
archive_read_support_format_all(a);
if (archive_read_open_filename(a, filename, 8192) == ARCHIVE_OK) {
archive_entry *ae;
while (archive_read_next_header(a, &ae) == ARCHIVE_OK) {
const char *component = archive_entry_pathname(ae);
if (component != nullptr) {
TessdataType type;
if (TessdataTypeFromFileName(component, &type)) {
int64_t size = archive_entry_size(ae);
if (size > 0) {
entries_[type].resize_no_init(size);
if (archive_read_data(a, &entries_[type][0], size) == size) {
is_loaded_ = true;
}
}
}
}
}
result = is_loaded_;
#if defined(DEBUG)
} else {
tprintf("archive_read_open_filename(...,%s,...) failed, %s\n",
filename, strerror(archive_errno(a)));
#endif
}
archive_read_free(a);
}
return result;
}
#endif
bool TessdataManager::Init(const char *data_file_name) {
GenericVector<char> data;
if (reader_ == nullptr) {
#if defined(HAVE_LIBARCHIVE)
if (LoadArchiveFile(data_file_name)) return true;
#endif
if (!LoadDataFromFile(data_file_name, &data)) return false;
} else {
if (!(*reader_)(data_file_name, &data)) return false;
@ -65,6 +110,7 @@ bool TessdataManager::Init(const char *data_file_name) {
// Loads from the given memory buffer as if a file.
bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
int size) {
// TODO: This method supports only the proprietary file format.
Clear();
data_file_name_ = name;
TFile fp;
@ -78,10 +124,10 @@ bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
GenericVector<int64_t> offset_table;
offset_table.resize_no_init(num_entries);
if (!fp.DeSerialize(&offset_table[0], num_entries)) return false;
for (int i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
for (unsigned i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
if (offset_table[i] >= 0) {
int64_t entry_size = size - offset_table[i];
int j = i + 1;
unsigned j = i + 1;
while (j < num_entries && offset_table[j] == -1) ++j;
if (j < num_entries) entry_size = offset_table[j] - offset_table[i];
entries_[i].resize_no_init(entry_size);
@ -106,6 +152,7 @@ void TessdataManager::OverwriteEntry(TessdataType type, const char *data,
// Saves to the given filename.
bool TessdataManager::SaveFile(const STRING &filename,
FileWriter writer) const {
// TODO: This method supports only the proprietary file format.
ASSERT_HOST(is_loaded_);
GenericVector<char> data;
Serialize(&data);
@ -117,11 +164,12 @@ bool TessdataManager::SaveFile(const STRING &filename,
// Serializes to the given vector.
void TessdataManager::Serialize(GenericVector<char> *data) const {
// TODO: This method supports only the proprietary file format.
ASSERT_HOST(is_loaded_);
// Compute the offset_table and total size.
int64_t offset_table[TESSDATA_NUM_ENTRIES];
int64_t offset = sizeof(int32_t) + sizeof(offset_table);
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
if (entries_[i].empty()) {
offset_table[i] = -1;
} else {
@ -135,7 +183,7 @@ void TessdataManager::Serialize(GenericVector<char> *data) const {
fp.OpenWrite(data);
fp.Serialize(&num_entries);
fp.Serialize(&offset_table[0], countof(offset_table));
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
if (!entries_[i].empty()) {
fp.Serialize(&entries_[i][0], entries_[i].size());
}
@ -144,7 +192,7 @@ void TessdataManager::Serialize(GenericVector<char> *data) const {
// Resets to the initial state, keeping the reader.
void TessdataManager::Clear() {
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
entries_[i].clear();
}
is_loaded_ = false;
@ -154,7 +202,7 @@ void TessdataManager::Clear() {
void TessdataManager::Directory() const {
tprintf("Version string:%s\n", VersionString().c_str());
int offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t);
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
if (!entries_[i].empty()) {
tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i],
entries_[i].size(), offset);
@ -197,7 +245,7 @@ bool TessdataManager::CombineDataFiles(
const char *language_data_path_prefix,
const char *output_filename) {
// Load individual tessdata components from files.
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
TessdataType type;
ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type));
STRING filename = language_data_path_prefix;
@ -229,6 +277,7 @@ bool TessdataManager::OverwriteComponents(
char **component_filenames,
int num_new_components) {
// Open the files with the new components.
// TODO: This method supports only the proprietary file format.
for (int i = 0; i < num_new_components; ++i) {
TessdataType type;
if (TessdataTypeFromFileName(component_filenames[i], &type)) {
@ -253,14 +302,16 @@ bool TessdataManager::ExtractToFile(const char *filename) {
bool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix,
TessdataType *type) {
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
*type = static_cast<TessdataType>(i);
return true;
}
}
#if defined(DEBUG)
tprintf("TessdataManager can't determine which tessdata"
" component is represented by %s\n", suffix);
#endif
return false;
}

View File

@ -214,6 +214,11 @@ class TessdataManager {
*/
bool ExtractToFile(const char *filename);
private:
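// Loads the tessdata components from an archive file using libarchive.
// Only implemented when HAVE_LIBARCHIVE is defined.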
bool LoadArchiveFile(const char *filename);
/**
* Fills type with TessdataType of the tessdata component represented by the
* given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET.
@ -230,7 +235,6 @@ class TessdataManager {
static bool TessdataTypeFromFileName(const char *filename,
TessdataType *type);
private:
// Name of file it came from.
STRING data_file_name_;
// Function to load the file when we need it.

View File

@ -280,3 +280,23 @@ set_unicharset_properties_LDADD += $(LEPTONICA_LIBS)
text2image_LDADD += $(LEPTONICA_LIBS)
unicharset_extractor_LDADD += $(LEPTONICA_LIBS)
wordlist2dawg_LDADD += $(LEPTONICA_LIBS)
extralib = $(libarchive_LIBS)
if !DISABLED_LEGACY_ENGINE
ambiguous_words_LDADD += $(extralib)
classifier_tester_LDADD += $(extralib)
cntraining_LDADD += $(extralib)
mftraining_LDADD += $(extralib)
shapeclustering_LDADD += $(extralib)
endif
combine_lang_model_LDADD += $(extralib)
combine_tessdata_LDADD += $(extralib)
dawg2wordlist_LDADD += $(extralib)
lstmeval_LDADD += $(extralib)
lstmtraining_LDADD += $(extralib)
merge_unicharsets_LDADD += $(extralib)
set_unicharset_properties_LDADD += $(extralib)
text2image_LDADD += $(extralib)
unicharset_extractor_LDADD += $(extralib)
wordlist2dawg_LDADD += $(extralib)