mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-27 20:59:36 +08:00
Merge pull request #2290 from stweil/libarchive
Add initial support for traineddata files in standard archive formats
This commit is contained in:
commit
868a623f8d
@ -21,6 +21,7 @@ addons:
|
||||
sources:
|
||||
#- ubuntu-toolchain-r-test
|
||||
packages:
|
||||
- libarchive-dev
|
||||
#- g++-6
|
||||
|
||||
#matrix:
|
||||
|
@ -422,6 +422,12 @@ else
|
||||
AC_MSG_ERROR([Leptonica 1.74 or higher is required. Try to install libleptonica-dev package.])
|
||||
fi
|
||||
|
||||
PKG_CHECK_MODULES([libarchive], [libarchive], [have_libarchive=true], [have_libarchive=false])
|
||||
AM_CONDITIONAL([HAVE_LIBARCHIVE], [$have_libarchive])
|
||||
if $have_libarchive; then
|
||||
AC_DEFINE([HAVE_LIBARCHIVE], [], [Enable libarchive])
|
||||
fi
|
||||
|
||||
AM_CONDITIONAL([ENABLE_TRAINING], true)
|
||||
|
||||
# Check availability of ICU packages.
|
||||
|
@ -88,6 +88,7 @@ tesseract_LDFLAGS = $(OPENCL_LDFLAGS)
|
||||
|
||||
tesseract_LDADD += $(LEPTONICA_LIBS)
|
||||
tesseract_LDADD += $(OPENMP_CXXFLAGS)
|
||||
tesseract_LDADD += $(libarchive_LIBS)
|
||||
|
||||
if T_WIN
|
||||
tesseract_LDADD += -ltiff
|
||||
|
@ -40,6 +40,8 @@ libtesseract_ccutil_la_SOURCES = \
|
||||
unichar.cpp unicharcompress.cpp unicharmap.cpp unicharset.cpp unicodes.cpp \
|
||||
params.cpp universalambigs.cpp
|
||||
|
||||
AM_CPPFLAGS += $(libarchive_CFLAGS)
|
||||
|
||||
if T_WIN
|
||||
AM_CPPFLAGS += -DWINDLLNAME=\"lib@GENERIC_LIBRARY_NAME@\"
|
||||
endif
|
||||
|
@ -2,7 +2,6 @@
|
||||
// File: tessdatamanager.cpp
|
||||
// Description: Functions to handle loading/combining tesseract data files.
|
||||
// Author: Daria Antonova
|
||||
// Created: Wed Jun 03 11:26:43 PST 2009
|
||||
//
|
||||
// (C) Copyright 2009, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -24,6 +23,12 @@
|
||||
#include "tessdatamanager.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
|
||||
#if defined(HAVE_LIBARCHIVE)
|
||||
#include <archive.h>
|
||||
#include <archive_entry.h>
|
||||
#endif
|
||||
|
||||
#include "errcode.h"
|
||||
#include "helpers.h"
|
||||
@ -52,9 +57,49 @@ void TessdataManager::LoadFileLater(const char *data_file_name) {
|
||||
data_file_name_ = data_file_name;
|
||||
}
|
||||
|
||||
#if defined(HAVE_LIBARCHIVE)
|
||||
bool TessdataManager::LoadArchiveFile(const char *filename) {
|
||||
bool result = false;
|
||||
archive *a = archive_read_new();
|
||||
if (a != nullptr) {
|
||||
archive_read_support_filter_all(a);
|
||||
archive_read_support_format_all(a);
|
||||
if (archive_read_open_filename(a, filename, 8192) == ARCHIVE_OK) {
|
||||
archive_entry *ae;
|
||||
while (archive_read_next_header(a, &ae) == ARCHIVE_OK) {
|
||||
const char *component = archive_entry_pathname(ae);
|
||||
if (component != nullptr) {
|
||||
TessdataType type;
|
||||
if (TessdataTypeFromFileName(component, &type)) {
|
||||
int64_t size = archive_entry_size(ae);
|
||||
if (size > 0) {
|
||||
entries_[type].resize_no_init(size);
|
||||
if (archive_read_data(a, &entries_[type][0], size) == size) {
|
||||
is_loaded_ = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
result = is_loaded_;
|
||||
#if defined(DEBUG)
|
||||
} else {
|
||||
tprintf("archive_read_open_filename(...,%s,...) failed, %s\n",
|
||||
filename, strerror(archive_errno(a)));
|
||||
#endif
|
||||
}
|
||||
archive_read_free(a);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
bool TessdataManager::Init(const char *data_file_name) {
|
||||
GenericVector<char> data;
|
||||
if (reader_ == nullptr) {
|
||||
#if defined(HAVE_LIBARCHIVE)
|
||||
if (LoadArchiveFile(data_file_name)) return true;
|
||||
#endif
|
||||
if (!LoadDataFromFile(data_file_name, &data)) return false;
|
||||
} else {
|
||||
if (!(*reader_)(data_file_name, &data)) return false;
|
||||
@ -65,6 +110,7 @@ bool TessdataManager::Init(const char *data_file_name) {
|
||||
// Loads from the given memory buffer as if a file.
|
||||
bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
|
||||
int size) {
|
||||
// TODO: This method supports only the proprietary file format.
|
||||
Clear();
|
||||
data_file_name_ = name;
|
||||
TFile fp;
|
||||
@ -78,10 +124,10 @@ bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
|
||||
GenericVector<int64_t> offset_table;
|
||||
offset_table.resize_no_init(num_entries);
|
||||
if (!fp.DeSerialize(&offset_table[0], num_entries)) return false;
|
||||
for (int i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
for (unsigned i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
if (offset_table[i] >= 0) {
|
||||
int64_t entry_size = size - offset_table[i];
|
||||
int j = i + 1;
|
||||
unsigned j = i + 1;
|
||||
while (j < num_entries && offset_table[j] == -1) ++j;
|
||||
if (j < num_entries) entry_size = offset_table[j] - offset_table[i];
|
||||
entries_[i].resize_no_init(entry_size);
|
||||
@ -106,6 +152,7 @@ void TessdataManager::OverwriteEntry(TessdataType type, const char *data,
|
||||
// Saves to the given filename.
|
||||
bool TessdataManager::SaveFile(const STRING &filename,
|
||||
FileWriter writer) const {
|
||||
// TODO: This method supports only the proprietary file format.
|
||||
ASSERT_HOST(is_loaded_);
|
||||
GenericVector<char> data;
|
||||
Serialize(&data);
|
||||
@ -117,11 +164,12 @@ bool TessdataManager::SaveFile(const STRING &filename,
|
||||
|
||||
// Serializes to the given vector.
|
||||
void TessdataManager::Serialize(GenericVector<char> *data) const {
|
||||
// TODO: This method supports only the proprietary file format.
|
||||
ASSERT_HOST(is_loaded_);
|
||||
// Compute the offset_table and total size.
|
||||
int64_t offset_table[TESSDATA_NUM_ENTRIES];
|
||||
int64_t offset = sizeof(int32_t) + sizeof(offset_table);
|
||||
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
if (entries_[i].empty()) {
|
||||
offset_table[i] = -1;
|
||||
} else {
|
||||
@ -135,7 +183,7 @@ void TessdataManager::Serialize(GenericVector<char> *data) const {
|
||||
fp.OpenWrite(data);
|
||||
fp.Serialize(&num_entries);
|
||||
fp.Serialize(&offset_table[0], countof(offset_table));
|
||||
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
if (!entries_[i].empty()) {
|
||||
fp.Serialize(&entries_[i][0], entries_[i].size());
|
||||
}
|
||||
@ -144,7 +192,7 @@ void TessdataManager::Serialize(GenericVector<char> *data) const {
|
||||
|
||||
// Resets to the initial state, keeping the reader.
|
||||
void TessdataManager::Clear() {
|
||||
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
entries_[i].clear();
|
||||
}
|
||||
is_loaded_ = false;
|
||||
@ -154,7 +202,7 @@ void TessdataManager::Clear() {
|
||||
void TessdataManager::Directory() const {
|
||||
tprintf("Version string:%s\n", VersionString().c_str());
|
||||
int offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t);
|
||||
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
if (!entries_[i].empty()) {
|
||||
tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i],
|
||||
entries_[i].size(), offset);
|
||||
@ -197,7 +245,7 @@ bool TessdataManager::CombineDataFiles(
|
||||
const char *language_data_path_prefix,
|
||||
const char *output_filename) {
|
||||
// Load individual tessdata components from files.
|
||||
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
TessdataType type;
|
||||
ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type));
|
||||
STRING filename = language_data_path_prefix;
|
||||
@ -229,6 +277,7 @@ bool TessdataManager::OverwriteComponents(
|
||||
char **component_filenames,
|
||||
int num_new_components) {
|
||||
// Open the files with the new components.
|
||||
// TODO: This method supports only the proprietary file format.
|
||||
for (int i = 0; i < num_new_components; ++i) {
|
||||
TessdataType type;
|
||||
if (TessdataTypeFromFileName(component_filenames[i], &type)) {
|
||||
@ -253,14 +302,16 @@ bool TessdataManager::ExtractToFile(const char *filename) {
|
||||
|
||||
bool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix,
|
||||
TessdataType *type) {
|
||||
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||
if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
|
||||
*type = static_cast<TessdataType>(i);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
#if defined(DEBUG)
|
||||
tprintf("TessdataManager can't determine which tessdata"
|
||||
" component is represented by %s\n", suffix);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -214,6 +214,11 @@ class TessdataManager {
|
||||
*/
|
||||
bool ExtractToFile(const char *filename);
|
||||
|
||||
private:
|
||||
|
||||
// Use libarchive.
|
||||
bool LoadArchiveFile(const char *filename);
|
||||
|
||||
/**
|
||||
* Fills type with TessdataType of the tessdata component represented by the
|
||||
* given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET.
|
||||
@ -230,7 +235,6 @@ class TessdataManager {
|
||||
static bool TessdataTypeFromFileName(const char *filename,
|
||||
TessdataType *type);
|
||||
|
||||
private:
|
||||
// Name of file it came from.
|
||||
STRING data_file_name_;
|
||||
// Function to load the file when we need it.
|
||||
|
@ -280,3 +280,23 @@ set_unicharset_properties_LDADD += $(LEPTONICA_LIBS)
|
||||
text2image_LDADD += $(LEPTONICA_LIBS)
|
||||
unicharset_extractor_LDADD += $(LEPTONICA_LIBS)
|
||||
wordlist2dawg_LDADD += $(LEPTONICA_LIBS)
|
||||
|
||||
extralib = $(libarchive_LIBS)
|
||||
|
||||
if !DISABLED_LEGACY_ENGINE
|
||||
ambiguous_words_LDADD += $(extralib)
|
||||
classifier_tester_LDADD += $(extralib)
|
||||
cntraining_LDADD += $(extralib)
|
||||
mftraining_LDADD += $(extralib)
|
||||
shapeclustering_LDADD += $(extralib)
|
||||
endif
|
||||
combine_lang_model_LDADD += $(extralib)
|
||||
combine_tessdata_LDADD += $(extralib)
|
||||
dawg2wordlist_LDADD += $(extralib)
|
||||
lstmeval_LDADD += $(extralib)
|
||||
lstmtraining_LDADD += $(extralib)
|
||||
merge_unicharsets_LDADD += $(extralib)
|
||||
set_unicharset_properties_LDADD += $(extralib)
|
||||
text2image_LDADD += $(extralib)
|
||||
unicharset_extractor_LDADD += $(extralib)
|
||||
wordlist2dawg_LDADD += $(extralib)
|
||||
|
Loading…
Reference in New Issue
Block a user