mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-19 15:03:45 +08:00
unicharset_extractor:
- run ReadMemBoxes only for box files
- do not write unicharset in case of broken box file
This commit is contained in:
parent
da3737d371
commit
8a26329623
@ -21,6 +21,7 @@
|
||||
// a unicharset.
|
||||
|
||||
#include <cstdlib>
|
||||
#include <filesystem>
|
||||
#include "boxread.h"
|
||||
#include "commandlineflags.h"
|
||||
#include "commontraining.h" // CheckSharedLibraryVersion
|
||||
@ -64,15 +65,21 @@ static int Main(int argc, char **argv) {
|
||||
UNICHARSET unicharset;
|
||||
// Load input files
|
||||
for (int arg = 1; arg < argc; ++arg) {
|
||||
std::string file_data = tesseract::ReadFile(argv[arg]);
|
||||
std::filesystem::path filePath = argv[arg];
|
||||
std::string file_data = tesseract::ReadFile(filePath.u8string());
|
||||
if (file_data.empty()) {
|
||||
continue;
|
||||
}
|
||||
std::vector<std::string> texts;
|
||||
if (ReadMemBoxes(-1, /*skip_blanks*/ true, &file_data[0],
|
||||
/*continue_on_failure*/ false, /*boxes*/ nullptr, &texts,
|
||||
/*box_texts*/ nullptr, /*pages*/ nullptr)) {
|
||||
tprintf("Extracting unicharset from box file %s\n", argv[arg]);
|
||||
if (filePath.extension() == ".box") {
|
||||
tprintf("Extracting unicharset from box file %s\n", filePath.u8string());
|
||||
bool res = ReadMemBoxes(-1, /*skip_blanks*/ true, &file_data[0],
|
||||
/*continue_on_failure*/ false, /*boxes*/ nullptr, &texts,
|
||||
/*box_texts*/ nullptr, /*pages*/ nullptr);
|
||||
if (!res) {
|
||||
tprintf("Can not read box data from '%s'\n", filePath.u8string());
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
} else {
|
||||
tprintf("Extracting unicharset from plain text file %s\n", argv[arg]);
|
||||
texts.clear();
|
||||
|
Loading…
Reference in New Issue
Block a user