tesseract/training/unicharset_extractor.cpp

///////////////////////////////////////////////////////////////////////
// File:        unicharset_extractor.cpp
// Description: Unicode character/ligature set extractor.
// Author:      Thomas Kielbus
// Created:     Wed Jun 28 17:05:01 PDT 2006
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

// Given a list of box files or text files on the command line, this program
// normalizes the text according to command-line options and generates
// a unicharset.

#include <cstdlib>
#include "boxread.h"
#include "commandlineflags.h"
#include "genericvector.h"
#include "lang_model_helpers.h"
#include "normstrngs.h"
#include "strngs.h"
#include "tprintf.h"
#include "unicharset.h"
#include "unicharset_training_utils.h"

STRING_PARAM_FLAG(output_unicharset, "unicharset", "Output file path");
INT_PARAM_FLAG(norm_mode, 1,
               "Normalization mode: 1=Combine graphemes, "
               "2=Split graphemes, 3=Pure unicode");

namespace tesseract {

// Helper normalizes and segments the given strings according to norm_mode, and
// adds the segmented parts to unicharset.
static void AddStringsToUnicharset(const GenericVector<STRING>& strings,
                                   int norm_mode, UNICHARSET* unicharset) {
  for (int i = 0; i < strings.size(); ++i) {
    std::vector<string> normalized;
    if (NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,
                                     static_cast<GraphemeNormMode>(norm_mode),
                                     /*report_errors*/ true,
                                     strings[i].string(), &normalized)) {
      for (const string& normed : normalized) {

       // normed is a UTF-8 encoded string
        if (normed.empty() || IsUTF8Whitespace(normed.c_str())) continue;
        unicharset->unichar_insert(normed.c_str());
      }
    } else {
      tprintf("Normalization failed for string '%s'\n", strings[i].c_str());
    }
  }
}

// Builds a unicharset from the input files named in argv[1] .. argv[argc - 1]
// and saves it to the path held in FLAGS_output_unicharset.
//
// Each file is first handed to ReadMemBoxes; if that succeeds the file is
// treated as a box file, otherwise its contents are split on '\n' and treated
// as plain text. Files whose data comes back empty are skipped.
//
// Returns EXIT_SUCCESS once the unicharset has been written, or EXIT_FAILURE
// if saving it fails.
int Main(int argc, char** argv) {
  UNICHARSET unicharset;
  // Accumulate the characters of every input file into one unicharset.
  for (int arg = 1; arg < argc; ++arg) {
    const char* path = argv[arg];
    STRING file_data = tesseract::ReadFile(path, /*reader*/ nullptr);
    if (file_data.length() == 0) continue;
    GenericVector<STRING> texts;
    const bool is_box_file = ReadMemBoxes(
        -1, /*skip_blanks*/ true, &file_data[0],
        /*continue_on_failure*/ false, /*boxes*/ nullptr, &texts,
        /*box_texts*/ nullptr, /*pages*/ nullptr);
    if (!is_box_file) {
      tprintf("Extracting unicharset from plain text file %s\n", path);
      // Discard anything the failed box parse left behind before splitting
      // the file into lines.
      texts.truncate(0);
      file_data.split('\n', &texts);
    } else {
      tprintf("Extracting unicharset from box file %s\n", path);
    }
    AddStringsToUnicharset(texts, FLAGS_norm_mode, &unicharset);
  }
  SetupBasicProperties(/*report_errors*/ true, /*decompose*/ false,
                       &unicharset);
  // Write the unicharset file, bailing out early if that fails.
  if (!unicharset.save_to_file(FLAGS_output_unicharset.c_str())) {
    tprintf("Cannot save unicharset file %s\n",
            FLAGS_output_unicharset.c_str());
    return EXIT_FAILURE;
  }
  tprintf("Wrote unicharset file %s\n", FLAGS_output_unicharset.c_str());
  return EXIT_SUCCESS;
}

}  // namespace tesseract

// Program entry point. Parses command-line flags and, provided at least one
// argument remains afterwards, delegates to tesseract::Main. Otherwise prints
// the usage text and returns EXIT_FAILURE.
int main(int argc, char** argv) {
  // Flags are only parsed when something was passed on the command line, so a
  // bare invocation falls straight through to the local usage text below.
  if (argc > 1) {
    tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
  }
  if (argc >= 2) {
    return tesseract::Main(argc, argv);
  }
  // No input file is left to process: explain how to call the tool.
  tprintf(
      "Usage: %s [--output_unicharset filename] [--norm_mode mode]"
      " box_or_text_file [...]\n",
      argv[0]);
  tprintf("Where mode means:\n");
  tprintf(" 1=combine graphemes (use for Latin and other simple scripts)\n");
  tprintf(" 2=split graphemes (use for Indic/Khmer/Myanmar)\n");
  tprintf(" 3=pure unicode (use for Arabic/Hebrew/Thai/Tibetan)\n");
  tprintf("Reads box or plain text files to extract the unicharset.\n");
  return EXIT_FAILURE;
}
Preparations for unicodization git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@38 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-05-16 09:24:06 +08:00			`///////////////////////////////////////////////////////////////////////`
			`// File: unicharset_extractor.cpp`
			`// Description: Unicode character/ligature set extractor.`
			`// Author: Thomas Kielbus`
			`// Created: Wed Jun 28 17:05:01 PDT 2006`
			`//`
			`// (C) Copyright 2006, Google Inc.`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`
			`//`
			`///////////////////////////////////////////////////////////////////////`

Rewrote unicharset_extractor to use the new string normalizer and read plain text as well as box files. 2017-09-08 18:49:57 +08:00			`// Given a list of box files or text files on the command line, this program`
			`// normalizes the text according to command-line options and generates`
			`// a unicharset.`
Preparations for unicodization git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@38 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-05-16 09:24:06 +08:00
Rewrote unicharset_extractor to use the new string normalizer and read plain text as well as box files. 2017-09-08 18:49:57 +08:00			`#include <cstdlib>`
Fixed various internationalization issues, mostly for training git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@106 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-08-31 02:18:35 +08:00			`#include "boxread.h"`
Rewrote unicharset_extractor to use the new string normalizer and read plain text as well as box files. 2017-09-08 18:49:57 +08:00			`#include "commandlineflags.h"`
			`#include "genericvector.h"`
			`#include "lang_model_helpers.h"`
			`#include "normstrngs.h"`
Fixed training leaks and randomness git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@653 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-02-02 11:02:16 +08:00			`#include "strngs.h"`
Rewrote unicharset_extractor to use the new string normalizer and read plain text as well as box files. 2017-09-08 18:49:57 +08:00			`#include "tprintf.h"`
Fixed training leaks and randomness git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@653 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-02-02 11:02:16 +08:00			`#include "unicharset.h"`
Rewrote unicharset_extractor to use the new string normalizer and read plain text as well as box files. 2017-09-08 18:49:57 +08:00			`#include "unicharset_training_utils.h"`

			`STRING_PARAM_FLAG(output_unicharset, "unicharset", "Output file path");`
			`INT_PARAM_FLAG(norm_mode, 1,`
			`"Normalization mode: 1=Combine graphemes, "`
			`"2=Split graphemes, 3=Pure unicode");`

			`namespace tesseract {`

			`// Helper normalizes and segments the given strings according to norm_mode, and`
			`// adds the segmented parts to unicharset.`
			`static void AddStringsToUnicharset(const GenericVector<STRING>& strings,`
			`int norm_mode, UNICHARSET* unicharset) {`
			`for (int i = 0; i < strings.size(); ++i) {`
			`std::vector<string> normalized;`
			`if (NormalizeCleanAndSegmentUTF8(UnicodeNormMode::kNFC, OCRNorm::kNone,`
			`static_cast<GraphemeNormMode>(norm_mode),`
			`/report_errors/ true,`
			`strings[i].string(), &normalized)) {`
			`for (const string& normed : normalized) {`
Update unicharset_extractor.cpp (#1153) * change IsWhitespace to IsUTF8Whitespace To solve "Phase UP: Generating unicharset and unichar properties files" ERROR #1147 please reference: [#1147](https://github.com/tesseract-ocr/tesseract/issues/1147) * Update unicharset_extractor.cpp fix the "Phase UP: Generating unicharset and unichar properties files" ERROR * Update unicharset_extractor.cpp fix "Phase UP: Generating unicharset and unichar properties files" ERROR #1147 * Update unicharset_extractor.cpp fix the encoding invalid problem and fix the comment 2017-10-13 17:46:42 +08:00
			`// normed is a UTF-8 encoded string`
			`if (normed.empty() \|\| IsUTF8Whitespace(normed.c_str())) continue;`
Rewrote unicharset_extractor to use the new string normalizer and read plain text as well as box files. 2017-09-08 18:49:57 +08:00			`unicharset->unichar_insert(normed.c_str());`
			`}`
			`} else {`
			`tprintf("Normalization failed for string '%s'\n", strings[i].c_str());`
Changes to training for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@302 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:44:07 +08:00			`}`
Fixed various internationalization issues, mostly for training git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@106 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-08-31 02:18:35 +08:00			`}`
			`}`

Rewrote unicharset_extractor to use the new string normalizer and read plain text as well as box files. 2017-09-08 18:49:57 +08:00			`int Main(int argc, char** argv) {`
Preparations for unicodization git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@38 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-05-16 09:24:06 +08:00			`UNICHARSET unicharset;`
Rewrote unicharset_extractor to use the new string normalizer and read plain text as well as box files. 2017-09-08 18:49:57 +08:00			`// Load input files`
			`for (int arg = 1; arg < argc; ++arg) {`
			`STRING file_data = tesseract::ReadFile(argv[arg], /reader/ nullptr);`
			`if (file_data.length() == 0) continue;`
			`GenericVector<STRING> texts;`
			`if (ReadMemBoxes(-1, /skip_blanks/ true, &file_data[0],`
			`/continue_on_failure/ false, /boxes/ nullptr,`
			`&texts, /box_texts/ nullptr, /pages/ nullptr)) {`
			`tprintf("Extracting unicharset from box file %s\n", argv[arg]);`
			`} else {`
			`tprintf("Extracting unicharset from plain text file %s\n", argv[arg]);`
			`texts.truncate(0);`
			`file_data.split('\n', &texts);`
Preparations for unicodization git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@38 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-05-16 09:24:06 +08:00			`}`
Rewrote unicharset_extractor to use the new string normalizer and read plain text as well as box files. 2017-09-08 18:49:57 +08:00			`AddStringsToUnicharset(texts, FLAGS_norm_mode, &unicharset);`
Preparations for unicodization git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@38 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-05-16 09:24:06 +08:00			`}`
Rewrote unicharset_extractor to use the new string normalizer and read plain text as well as box files. 2017-09-08 18:49:57 +08:00			`SetupBasicProperties(/report_errors/ true, /decompose/ false,`
			`&unicharset);`
			`// Write unicharset file.`
			`if (unicharset.save_to_file(FLAGS_output_unicharset.c_str())) {`
			`tprintf("Wrote unicharset file %s\n", FLAGS_output_unicharset.c_str());`
			`} else {`
			`tprintf("Cannot save unicharset file %s\n",`
			`FLAGS_output_unicharset.c_str());`
			`return EXIT_FAILURE;`
Preparations for unicodization git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@38 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-05-16 09:24:06 +08:00			`}`
Rewrote unicharset_extractor to use the new string normalizer and read plain text as well as box files. 2017-09-08 18:49:57 +08:00			`return EXIT_SUCCESS;`
			`}`
Preparations for unicodization git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@38 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-05-16 09:24:06 +08:00
Rewrote unicharset_extractor to use the new string normalizer and read plain text as well as box files. 2017-09-08 18:49:57 +08:00			`} // namespace tesseract`

			`int main(int argc, char** argv) {`
Fix help message for unicharset_extractor (#1206) If unicharset_extractor was called without any argument, a help message was printed by tesseract::ParseCommandLineFlags. Replace that by the local help message which is better. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2017-11-10 22:45:35 +08:00			`if (argc > 1) {`
			`tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);`
			`}`
Rewrote unicharset_extractor to use the new string normalizer and read plain text as well as box files. 2017-09-08 18:49:57 +08:00			`if (argc < 2) {`
			`tprintf(`
			`"Usage: %s [--output_unicharset filename] [--norm_mode mode]"`
			`" box_or_text_file [...]\n",`
			`argv[0]);`
			`tprintf("Where mode means:\n");`
			`tprintf(" 1=combine graphemes (use for Latin and other simple scripts)\n");`
			`tprintf(" 2=split graphemes (use for Indic/Khmer/Myanmar)\n");`
			`tprintf(" 3=pure unicode (use for Arabic/Hebrew/Thai/Tibetan)\n");`
			`tprintf("Reads box or plain text files to extract the unicharset.\n");`
			`return EXIT_FAILURE;`
Preparations for unicodization git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@38 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-05-16 09:24:06 +08:00			`}`
Rewrote unicharset_extractor to use the new string normalizer and read plain text as well as box files. 2017-09-08 18:49:57 +08:00			`return tesseract::Main(argc, argv);`
Preparations for unicodization git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@38 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-05-16 09:24:06 +08:00			`}`