2007-05-16 09:24:06 +08:00
|
|
|
///////////////////////////////////////////////////////////////////////
|
|
|
|
// File: unicharset_extractor.cpp
|
|
|
|
// Description: Unicode character/ligature set extractor.
|
|
|
|
// Author: Thomas Kielbus
|
|
|
|
// Created: Wed Jun 28 17:05:01 PDT 2006
|
|
|
|
//
|
|
|
|
// (C) Copyright 2006, Google Inc.
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
//
|
|
|
|
///////////////////////////////////////////////////////////////////////
|
|
|
|
|
2017-09-08 18:49:57 +08:00
|
|
|
// Given a list of box files or text files on the command line, this program
|
|
|
|
// normalizes the text according to command-line options and generates
|
|
|
|
// a unicharset.
|
2007-05-16 09:24:06 +08:00
|
|
|
|
2017-09-08 18:49:57 +08:00
|
|
|
#include <cstdlib>
|
2007-08-31 02:18:35 +08:00
|
|
|
#include "boxread.h"
|
2017-09-08 18:49:57 +08:00
|
|
|
#include "commandlineflags.h"
|
|
|
|
#include "genericvector.h"
|
|
|
|
#include "lang_model_helpers.h"
|
|
|
|
#include "normstrngs.h"
|
2012-02-02 11:02:16 +08:00
|
|
|
#include "strngs.h"
|
2017-09-08 18:49:57 +08:00
|
|
|
#include "tprintf.h"
|
2012-02-02 11:02:16 +08:00
|
|
|
#include "unicharset.h"
|
2017-09-08 18:49:57 +08:00
|
|
|
#include "unicharset_training_utils.h"
|
|
|
|
|
|
|
|
// --output_unicharset: path the generated unicharset is saved to.
STRING_PARAM_FLAG(output_unicharset, "unicharset", "Output file path");

// --norm_mode: integer selecting how text is segmented before insertion into
// the unicharset; AddStringsToUnicharset casts it to GraphemeNormMode.
INT_PARAM_FLAG(
    norm_mode, 1,
    "Normalization mode: 1=Combine graphemes, "
    "2=Split graphemes, 3=Pure unicode");
|
|
|
|
|
|
|
|
namespace tesseract {
|
|
|
|
|
|
|
|
// Helper normalizes and segments the given strings according to norm_mode, and
|
|
|
|
// adds the segmented parts to unicharset.
|
|
|
|
static void AddStringsToUnicharset(const GenericVector<STRING>& strings,
|
|
|
|
int norm_mode, UNICHARSET* unicharset) {
|
|
|
|
for (int i = 0; i < strings.size(); ++i) {
|
2018-03-03 21:36:28 +08:00
|
|
|
std::vector<std::string> normalized;
|
2017-09-08 18:49:57 +08:00
|
|
|
if (NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
|
|
|
|
static_cast<GraphemeNormMode>(norm_mode),
|
|
|
|
/*report_errors*/ true,
|
|
|
|
strings[i].string(), &normalized)) {
|
2018-03-03 21:36:28 +08:00
|
|
|
for (const std::string& normed : normalized) {
|
2017-10-13 17:46:42 +08:00
|
|
|
|
|
|
|
// normed is a UTF-8 encoded string
|
|
|
|
if (normed.empty() || IsUTF8Whitespace(normed.c_str())) continue;
|
2017-09-08 18:49:57 +08:00
|
|
|
unicharset->unichar_insert(normed.c_str());
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
tprintf("Normalization failed for string '%s'\n", strings[i].c_str());
|
2009-07-11 10:44:07 +08:00
|
|
|
}
|
2007-08-31 02:18:35 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-04-15 17:31:22 +08:00
|
|
|
static int Main(int argc, char** argv) {
|
2007-05-16 09:24:06 +08:00
|
|
|
UNICHARSET unicharset;
|
2017-09-08 18:49:57 +08:00
|
|
|
// Load input files
|
|
|
|
for (int arg = 1; arg < argc; ++arg) {
|
|
|
|
STRING file_data = tesseract::ReadFile(argv[arg], /*reader*/ nullptr);
|
|
|
|
if (file_data.length() == 0) continue;
|
|
|
|
GenericVector<STRING> texts;
|
|
|
|
if (ReadMemBoxes(-1, /*skip_blanks*/ true, &file_data[0],
|
|
|
|
/*continue_on_failure*/ false, /*boxes*/ nullptr,
|
|
|
|
&texts, /*box_texts*/ nullptr, /*pages*/ nullptr)) {
|
|
|
|
tprintf("Extracting unicharset from box file %s\n", argv[arg]);
|
|
|
|
} else {
|
|
|
|
tprintf("Extracting unicharset from plain text file %s\n", argv[arg]);
|
|
|
|
texts.truncate(0);
|
|
|
|
file_data.split('\n', &texts);
|
2007-05-16 09:24:06 +08:00
|
|
|
}
|
2017-09-08 18:49:57 +08:00
|
|
|
AddStringsToUnicharset(texts, FLAGS_norm_mode, &unicharset);
|
2007-05-16 09:24:06 +08:00
|
|
|
}
|
2017-09-08 18:49:57 +08:00
|
|
|
SetupBasicProperties(/*report_errors*/ true, /*decompose*/ false,
|
|
|
|
&unicharset);
|
|
|
|
// Write unicharset file.
|
|
|
|
if (unicharset.save_to_file(FLAGS_output_unicharset.c_str())) {
|
|
|
|
tprintf("Wrote unicharset file %s\n", FLAGS_output_unicharset.c_str());
|
|
|
|
} else {
|
|
|
|
tprintf("Cannot save unicharset file %s\n",
|
|
|
|
FLAGS_output_unicharset.c_str());
|
|
|
|
return EXIT_FAILURE;
|
2007-05-16 09:24:06 +08:00
|
|
|
}
|
2017-09-08 18:49:57 +08:00
|
|
|
return EXIT_SUCCESS;
|
|
|
|
}
|
2007-05-16 09:24:06 +08:00
|
|
|
|
2017-09-08 18:49:57 +08:00
|
|
|
} // namespace tesseract
|
|
|
|
|
|
|
|
int main(int argc, char** argv) {
|
2017-11-10 22:45:35 +08:00
|
|
|
if (argc > 1) {
|
|
|
|
tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
|
|
|
|
}
|
2017-09-08 18:49:57 +08:00
|
|
|
if (argc < 2) {
|
|
|
|
tprintf(
|
|
|
|
"Usage: %s [--output_unicharset filename] [--norm_mode mode]"
|
|
|
|
" box_or_text_file [...]\n",
|
|
|
|
argv[0]);
|
|
|
|
tprintf("Where mode means:\n");
|
|
|
|
tprintf(" 1=combine graphemes (use for Latin and other simple scripts)\n");
|
|
|
|
tprintf(" 2=split graphemes (use for Indic/Khmer/Myanmar)\n");
|
|
|
|
tprintf(" 3=pure unicode (use for Arabic/Hebrew/Thai/Tibetan)\n");
|
|
|
|
tprintf("Reads box or plain text files to extract the unicharset.\n");
|
|
|
|
return EXIT_FAILURE;
|
2007-05-16 09:24:06 +08:00
|
|
|
}
|
2017-09-08 18:49:57 +08:00
|
|
|
return tesseract::Main(argc, argv);
|
2007-05-16 09:24:06 +08:00
|
|
|
}
|