unicharset_extractor:

- run ReadMemBoxes only for box files (files with a ".box" extension)
- do not write the unicharset if a box file cannot be read
This commit is contained in:
zdenop 2023-01-06 15:52:42 +01:00
parent da3737d371
commit 8a26329623

View File

@ -21,6 +21,7 @@
// a unicharset.
#include <cstdlib>
#include <filesystem>
#include "boxread.h"
#include "commandlineflags.h"
#include "commontraining.h" // CheckSharedLibraryVersion
@ -64,15 +65,21 @@ static int Main(int argc, char **argv) {
UNICHARSET unicharset;
// Load input files
for (int arg = 1; arg < argc; ++arg) {
std::string file_data = tesseract::ReadFile(argv[arg]);
std::filesystem::path filePath = argv[arg];
std::string file_data = tesseract::ReadFile(filePath.u8string());
if (file_data.empty()) {
continue;
}
std::vector<std::string> texts;
if (ReadMemBoxes(-1, /*skip_blanks*/ true, &file_data[0],
/*continue_on_failure*/ false, /*boxes*/ nullptr, &texts,
/*box_texts*/ nullptr, /*pages*/ nullptr)) {
tprintf("Extracting unicharset from box file %s\n", argv[arg]);
if (filePath.extension() == ".box") {
tprintf("Extracting unicharset from box file %s\n", filePath.u8string());
bool res = ReadMemBoxes(-1, /*skip_blanks*/ true, &file_data[0],
/*continue_on_failure*/ false, /*boxes*/ nullptr, &texts,
/*box_texts*/ nullptr, /*pages*/ nullptr);
if (!res) {
tprintf("Can not read box data from '%s'\n", filePath.u8string());
return EXIT_FAILURE;
}
} else {
tprintf("Extracting unicharset from plain text file %s\n", argv[arg]);
texts.clear();