tesseract/training/lang_model_helpers.cpp


// Copyright 2017 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
// Purpose: Collection of convenience functions to simplify creation of the
// unicharset, recoder, and dawgs for an LSTM model.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lang_model_helpers.h"
#if defined(_WIN32)
#include <direct.h>
#endif
#include <sys/stat.h>
#include <sys/types.h>
#include <cstdlib>
#include "dawg.h"
#include "fileio.h"
#include "tessdatamanager.h"
#include "trie.h"
#include "unicharcompress.h"
namespace tesseract {

// Helper makes a filename (<output_dir>/<lang>/<lang><suffix>) and writes data
// to the file, using writer if not null, otherwise a default writer.
// The default writer will overwrite any existing file, but a supplied writer
// can do its own thing. If lang is empty, returns true but does nothing.
// NOTE that suffix should contain any required . for the filename.
bool WriteFile(const string& output_dir, const string& lang,
               const string& suffix, const GenericVector<char>& data,
               FileWriter writer) {
  if (lang.empty()) return true;
  string dirname = output_dir + "/" + lang;
  // Attempt to make the directory, but ignore errors, as it may not be a
  // standard filesystem, and the writer will complain if not successful.
#if defined(_WIN32)
  _mkdir(dirname.c_str());
#else
  mkdir(dirname.c_str(), S_IRWXU | S_IRWXG);
#endif
  string filename = dirname + "/" + lang + suffix;
  if (writer == nullptr)
    return SaveDataToFile(data, filename.c_str());
  else
    return (*writer)(data, filename.c_str());
}
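
// Usage sketch (illustrative only, not part of the original sources): with a
// null writer, WriteFile creates <output_dir>/<lang>/ if needed and writes
// <lang><suffix> inside it via the default writer. The paths and payload
// below are hypothetical.
//
//   GenericVector<char> data;
//   data.push_back('4');  // Arbitrary payload for the example.
//   // Writes /tmp/model_out/foo/foo.version, overwriting any existing file.
//   bool ok = WriteFile("/tmp/model_out", "foo", ".version", data, nullptr);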

// Helper reads a file with optional reader and returns a STRING.
// On failure emits a warning message and returns an empty STRING.
STRING ReadFile(const string& filename, FileReader reader) {
  if (filename.empty()) return STRING();
  GenericVector<char> data;
  bool read_result;
  if (reader == nullptr)
    read_result = LoadDataFromFile(filename.c_str(), &data);
  else
    read_result = (*reader)(filename.c_str(), &data);
  if (read_result) return STRING(&data[0], data.size());
  tprintf("Failed to read data from: %s\n", filename.c_str());
  return STRING();
}
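
// Usage sketch (illustrative only): with a null reader, ReadFile falls back
// to LoadDataFromFile and returns the whole file as a STRING, or an empty
// STRING plus a warning if the file cannot be read. The filename below is
// hypothetical.
//
//   STRING wordlist = ReadFile("/tmp/langdata/foo/foo.wordlist", nullptr);
//   if (wordlist.length() == 0) {
//     // Missing or unreadable file; ReadFile has already printed a warning.
//   }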

// Helper writes the unicharset to file and to the traineddata.
bool WriteUnicharset(const UNICHARSET& unicharset, const string& output_dir,
                     const string& lang, FileWriter writer,
                     TessdataManager* traineddata) {
  GenericVector<char> unicharset_data;
  TFile fp;
  fp.OpenWrite(&unicharset_data);
  if (!unicharset.save_to_file(&fp)) return false;
  traineddata->OverwriteEntry(TESSDATA_LSTM_UNICHARSET, &unicharset_data[0],
                              unicharset_data.size());
  return WriteFile(output_dir, lang, ".unicharset", unicharset_data, writer);
}
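
// Usage sketch (illustrative only): my_unicharset is assumed to be an
// already-populated UNICHARSET. The call stores it both in the
// TESSDATA_LSTM_UNICHARSET entry of the traineddata being assembled and as
// <output_dir>/<lang>/<lang>.unicharset on disk. Paths are hypothetical.
//
//   TessdataManager traineddata;
//   if (!WriteUnicharset(my_unicharset, "/tmp/model_out", "foo", nullptr,
//                        &traineddata)) {
//     tprintf("Failed to write unicharset\n");
//   }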

// Helper creates the recoder and writes it to the traineddata, and a human-
// readable form to file.
bool WriteRecoder(const UNICHARSET& unicharset, bool pass_through,
                  const string& output_dir, const string& lang,
                  FileWriter writer, STRING* radical_table_data,
                  TessdataManager* traineddata) {
  UnicharCompress recoder;
  // Where the unicharset is already carefully set up to contain a good
  // compact encoding, use a pass-through recoder that does nothing.
  // For scripts that have a large number of unicodes (Han, Hangul) we want
  // to use the recoder to compress the symbol space by re-encoding each
  // unicode as multiple codes from a smaller 'alphabet' that are related to
  // the shapes in the character. Hangul Jamo is a perfect example of this.
  // See the Hangul Syllables section, sub-section "Equivalence" in:
  // http://www.unicode.org/versions/Unicode10.0.0/ch18.pdf
  if (pass_through) {
    recoder.SetupPassThrough(unicharset);
  } else {
    int null_char =
        unicharset.has_special_codes() ? UNICHAR_BROKEN : unicharset.size();
    tprintf("Null char=%d\n", null_char);
    if (!recoder.ComputeEncoding(unicharset, null_char, radical_table_data)) {
      tprintf("Creation of encoded unicharset failed!!\n");
      return false;
    }
  }
  TFile fp;
  GenericVector<char> recoder_data;
  fp.OpenWrite(&recoder_data);
  if (!recoder.Serialize(&fp)) return false;
  traineddata->OverwriteEntry(TESSDATA_LSTM_RECODER, &recoder_data[0],
                              recoder_data.size());
  STRING encoding = recoder.GetEncodingAsString(unicharset);
  recoder_data.init_to_size(encoding.length(), 0);
  memcpy(&recoder_data[0], &encoding[0], encoding.length());
  STRING suffix;
  suffix.add_str_int(".charset_size=", recoder.code_range());
  suffix += ".txt";
  return WriteFile(output_dir, lang, suffix.string(), recoder_data, writer);
}
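
// Usage sketch (illustrative only): when the unicharset already forms a
// compact code space, the caller passes pass_through=true and each unichar-id
// maps to itself. For Han/Hangul, pass_through=false re-encodes each unichar
// as a short sequence of codes derived from the radical-stroke table, so
// recoder.code_range() stays far smaller than unicharset.size(). The
// variables my_unicharset and the paths below are assumed/hypothetical.
//
//   STRING radical_data =
//       ReadFile("/tmp/langdata/radical-stroke.txt", nullptr);
//   TessdataManager traineddata;
//   if (!WriteRecoder(my_unicharset, /*pass_through=*/false, "/tmp/model_out",
//                     "foo", nullptr, &radical_data, &traineddata)) {
//     tprintf("Failed to build the compressed recoder\n");
//   }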

// Helper builds a dawg from the given words, using the unicharset as coding,
// and reverse_policy for LTR/RTL, and overwrites file_type in the traineddata.
static bool WriteDawg(const GenericVector<STRING>& words,
                      const UNICHARSET& unicharset,
                      Trie::RTLReversePolicy reverse_policy,
                      TessdataType file_type, TessdataManager* traineddata) {
  // The first 3 arguments are not used in this case.
  Trie trie(DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM, unicharset.size(), 0);
  trie.add_word_list(words, unicharset, reverse_policy);
  tprintf("Reducing Trie to SquishedDawg\n");
  std::unique_ptr<SquishedDawg> dawg(trie.trie_to_dawg());
  if (dawg == nullptr || dawg->NumEdges() == 0) return false;
  TFile fp;
  GenericVector<char> dawg_data;
  fp.OpenWrite(&dawg_data);
  if (!dawg->write_squished_dawg(&fp)) return false;
  traineddata->OverwriteEntry(file_type, &dawg_data[0], dawg_data.size());
  return true;
}
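
// Usage sketch (illustrative only, internal helper): the words are encoded
// with the unicharset, inserted into a Trie, squished to a SquishedDawg, and
// stored under file_type in the traineddata. my_unicharset and traineddata
// are assumed to exist; the word content is hypothetical.
//
//   GenericVector<STRING> words;
//   words.push_back(STRING("hello"));
//   words.push_back(STRING("world"));
//   WriteDawg(words, my_unicharset, Trie::RRP_REVERSE_IF_HAS_RTL,
//             TESSDATA_LSTM_SYSTEM_DAWG, &traineddata);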

// Builds and writes the dawgs, given a set of words, punctuation patterns,
// and number patterns, to the traineddata. Encoding uses the given
// unicharset, and the punc dawg is reversed if lang_is_rtl.
static bool WriteDawgs(const GenericVector<STRING>& words,
                       const GenericVector<STRING>& puncs,
                       const GenericVector<STRING>& numbers, bool lang_is_rtl,
                       const UNICHARSET& unicharset,
                       TessdataManager* traineddata) {
  if (puncs.empty()) {
    tprintf("Must have non-empty puncs list to use language models!!\n");
    return false;
  }
  // For each of the dawg types, make the dawg, and write to traineddata.
  // Dawgs are reversed as follows:
  // Words: According to the word content.
  // Puncs: According to lang_is_rtl.
  // Numbers: Never.
  // System dawg (main wordlist).
  if (!words.empty() &&
      !WriteDawg(words, unicharset, Trie::RRP_REVERSE_IF_HAS_RTL,
                 TESSDATA_LSTM_SYSTEM_DAWG, traineddata)) {
    return false;
  }
  // punc/punc-dawg.
  Trie::RTLReversePolicy reverse_policy =
      lang_is_rtl ? Trie::RRP_FORCE_REVERSE : Trie::RRP_DO_NO_REVERSE;
  if (!WriteDawg(puncs, unicharset, reverse_policy, TESSDATA_LSTM_PUNC_DAWG,
                 traineddata)) {
    return false;
  }
  // numbers/number-dawg.
  if (!numbers.empty() &&
      !WriteDawg(numbers, unicharset, Trie::RRP_DO_NO_REVERSE,
                 TESSDATA_LSTM_NUMBER_DAWG, traineddata)) {
    return false;
  }
  return true;
}
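
// Usage sketch (illustrative only, internal helper): puncs must be non-empty;
// words and numbers may be empty, in which case the corresponding dawg is
// simply skipped. The pattern strings below are hypothetical placeholders,
// not the real langdata pattern syntax.
//
//   GenericVector<STRING> words, puncs, numbers;
//   words.push_back(STRING("example"));
//   puncs.push_back(STRING(" "));    // Hypothetical punctuation pattern.
//   numbers.push_back(STRING("1"));  // Hypothetical number pattern.
//   WriteDawgs(words, puncs, numbers, /*lang_is_rtl=*/false, my_unicharset,
//              &traineddata);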

// The main function for combine_lang_model.cpp.
// Returns EXIT_SUCCESS on success or EXIT_FAILURE on error.
int CombineLangModel(const UNICHARSET& unicharset, const string& script_dir,
                     const string& version_str, const string& output_dir,
                     const string& lang, bool pass_through_recoder,
                     const GenericVector<STRING>& words,
                     const GenericVector<STRING>& puncs,
                     const GenericVector<STRING>& numbers, bool lang_is_rtl,
                     FileReader reader, FileWriter writer) {
  // Build the traineddata file.
  TessdataManager traineddata;
  if (!version_str.empty()) {
    traineddata.SetVersionString(traineddata.VersionString() + ":" +
                                 version_str);
  }
if (!WriteUnicharset(unicharset, output_dir, lang, writer, &traineddata)) {
tprintf("Error writing unicharset!!\n");
return EXIT_FAILURE;
} else {
tprintf("Config file is optional, continuing...\n");
}
// If there is a config file, read it and add to traineddata.
string config_filename = script_dir + "/" + lang + "/" + lang + ".config";
STRING config_file = ReadFile(config_filename, reader);
if (config_file.length() > 0) {
traineddata.OverwriteEntry(TESSDATA_LANG_CONFIG, &config_file[0],
config_file.length());
}
  // The radical-stroke table is required; it feeds the recoder when a
  // compressed encoding is computed.
  string radical_filename = script_dir + "/radical-stroke.txt";
  STRING radical_data = ReadFile(radical_filename, reader);
  if (radical_data.length() == 0) {
    tprintf("Error reading radical code table %s\n", radical_filename.c_str());
    return EXIT_FAILURE;
  }
  if (!WriteRecoder(unicharset, pass_through_recoder, output_dir, lang, writer,
                    &radical_data, &traineddata)) {
    tprintf("Error writing recoder!!\n");
    return EXIT_FAILURE;
  }
  if (!words.empty() || !puncs.empty() || !numbers.empty()) {
    if (!WriteDawgs(words, puncs, numbers, lang_is_rtl, unicharset,
                    &traineddata)) {
      tprintf("Error during conversion of wordlists to DAWGs!!\n");
      return EXIT_FAILURE;
    }
  }
  // Traineddata file.
  GenericVector<char> traineddata_data;
  traineddata.Serialize(&traineddata_data);
  if (!WriteFile(output_dir, lang, ".traineddata", traineddata_data, writer)) {
    tprintf("Error writing output traineddata file!!\n");
    return EXIT_FAILURE;
  }
  return EXIT_SUCCESS;
}
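
// Usage sketch (illustrative only): the combine_lang_model tool is expected
// to parse its command line, load the starter unicharset and any wordlists,
// and hand everything to CombineLangModel. The concrete values below are
// hypothetical.
//
//   UNICHARSET unicharset;
//   unicharset.load_from_file("/tmp/langdata/foo/foo.unicharset");
//   GenericVector<STRING> words, puncs, numbers;
//   puncs.push_back(STRING(" "));  // Hypothetical punctuation pattern.
//   int result = CombineLangModel(unicharset, "/tmp/langdata", "alpha",
//                                 "/tmp/model_out", "foo",
//                                 /*pass_through_recoder=*/true, words, puncs,
//                                 numbers, /*lang_is_rtl=*/false,
//                                 /*reader=*/nullptr, /*writer=*/nullptr);
//   // result is EXIT_SUCCESS if foo.traineddata was written.
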
} // namespace tesseract