tesseract/src/training/unicharset_training_utils.cpp

///////////////////////////////////////////////////////////////////////
// File:        unicharset_training_utils.cpp
// Description: Training utilities for UNICHARSET.
// Author:      Ray Smith
// Created:     Fri Oct 17 17:09:01 PDT 2014
//
// (C) Copyright 2014, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#include "unicharset_training_utils.h"

#include <stdlib.h>
#include <string.h>
#include <string>
#include <vector>

#include "fileio.h"
#include "icuerrorcode.h"
#include "normstrngs.h"
#include "statistc.h"
#include "unichar.h"
#include "unicharset.h"
#include "unicode/uchar.h"    // from libicu
#include "unicode/uscript.h"  // from libicu

namespace tesseract {

// Helper sets the character attribute properties and sets up the script table.
// Does not set tops and bottoms.
void SetupBasicProperties(bool report_errors, bool decompose,
                          UNICHARSET* unicharset) {
  for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) {
    // Convert any custom ligatures.
    const char* unichar_str = unicharset->id_to_unichar(unichar_id);
    for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) {
      if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) {
        unichar_str = UNICHARSET::kCustomLigatures[i][0];
        break;
      }
    }

    // Convert the unichar to UTF32 representation
    std::vector<char32> uni_vector = UNICHAR::UTF8ToUTF32(unichar_str);

    // Assume that if the property is true for any character in the string,
    // then it holds for the whole "character".
    bool unichar_isalpha = false;
    bool unichar_islower = false;
    bool unichar_isupper = false;
    bool unichar_isdigit = false;
    bool unichar_ispunct = false;

    for (char32 u_ch : uni_vector) {
      if (u_isalpha(u_ch)) unichar_isalpha = true;
      if (u_islower(u_ch)) unichar_islower = true;
      if (u_isupper(u_ch)) unichar_isupper = true;
      if (u_isdigit(u_ch)) unichar_isdigit = true;
      if (u_ispunct(u_ch)) unichar_ispunct = true;
    }

    unicharset->set_isalpha(unichar_id, unichar_isalpha);
    unicharset->set_islower(unichar_id, unichar_islower);
    unicharset->set_isupper(unichar_id, unichar_isupper);
    unicharset->set_isdigit(unichar_id, unichar_isdigit);
    unicharset->set_ispunctuation(unichar_id, unichar_ispunct);

    tesseract::IcuErrorCode err;
    unicharset->set_script(unichar_id, uscript_getName(
        uscript_getScript(uni_vector[0], err)));

    const int num_code_points = uni_vector.size();
    // Obtain the lower/upper case if needed and record it in the properties.
    unicharset->set_other_case(unichar_id, unichar_id);
    if (unichar_islower || unichar_isupper) {
      std::vector<char32> other_case(num_code_points, 0);
      for (int i = 0; i < num_code_points; ++i) {
        // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used.
        // However since they deal with UChars (so need a conversion function
        // from char32 or UTF8string) and require a meaningful locale string,
        // for now u_tolower()/u_toupper() are used.
        other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) :
          u_tolower(uni_vector[i]);
      }
      std::string other_case_uch = UNICHAR::UTF32ToUTF8(other_case);
      UNICHAR_ID other_case_id =
          unicharset->unichar_to_id(other_case_uch.c_str());
      if (other_case_id != INVALID_UNICHAR_ID) {
        unicharset->set_other_case(unichar_id, other_case_id);
      } else if (unichar_id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) {
        tprintf("Other case %s of %s is not in unicharset\n",
                other_case_uch.c_str(), unichar_str);
      }
    }

    // Set RTL property and obtain mirror unichar ID from ICU.
    std::vector<char32> mirrors(num_code_points, 0);
    for (int i = 0; i < num_code_points; ++i) {
      mirrors[i] = u_charMirror(uni_vector[i]);
      if (i == 0) {  // set directionality to that of the 1st code point
        unicharset->set_direction(unichar_id,
                                  static_cast<UNICHARSET::Direction>(
                                      u_charDirection(uni_vector[i])));
      }
    }
    std::string mirror_uch = UNICHAR::UTF32ToUTF8(mirrors);
    UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str());
    if (mirror_uch_id != INVALID_UNICHAR_ID) {
      unicharset->set_mirror(unichar_id, mirror_uch_id);
    } else if (report_errors) {
      tprintf("Mirror %s of %s is not in unicharset\n",
              mirror_uch.c_str(), unichar_str);
    }

    // Record normalized version of this unichar.
    std::string normed_str;
    if (unichar_id != 0 &&
        tesseract::NormalizeUTF8String(
            decompose ? tesseract::UnicodeNormMode::kNFKD
                      : tesseract::UnicodeNormMode::kNFKC,
            tesseract::OCRNorm::kNormalize, tesseract::GraphemeNorm::kNone,
            unichar_str, &normed_str) &&
        !normed_str.empty()) {
      unicharset->set_normed(unichar_id, normed_str.c_str());
    } else {
      unicharset->set_normed(unichar_id, unichar_str);
    }
    ASSERT_HOST(unicharset->get_other_case(unichar_id) < unicharset->size());
  }
  unicharset->post_load_setup();
}

// Helper sets the properties from universal script unicharsets, if found.
void SetScriptProperties(const std::string& script_dir, UNICHARSET* unicharset) {
  for (int s = 0; s < unicharset->get_script_table_size(); ++s) {
    // Load the unicharset for the script if available.
    std::string filename = script_dir + "/" +
                      unicharset->get_script_from_script_id(s) + ".unicharset";
    UNICHARSET script_set;
    if (script_set.load_from_file(filename.c_str())) {
      unicharset->SetPropertiesFromOther(script_set);
    } else if (s != unicharset->common_sid() && s != unicharset->null_sid()) {
      tprintf("Failed to load script unicharset from:%s\n", filename.c_str());
    }
  }
  for (int c = SPECIAL_UNICHAR_CODES_COUNT; c < unicharset->size(); ++c) {
    if (unicharset->PropertiesIncomplete(c)) {
      tprintf("Warning: properties incomplete for index %d = %s\n", c,
              unicharset->id_to_unichar(c));
    }
  }
}

// Helper gets the combined x-heights string.
// For each entry of the script table, <script_dir>/<script name>.xheights is
// read; the contents of every file that is read successfully are appended,
// in script-table order, to the returned string. Scripts whose file cannot
// be read contribute nothing.
std::string GetXheightString(const std::string& script_dir,
                             const UNICHARSET& unicharset) {
  std::string combined;
  for (int script_id = 0; script_id < unicharset.get_script_table_size();
       ++script_id) {
    // Build the path of the per-script xheights file.
    std::string xheights_path = script_dir;
    xheights_path += "/";
    xheights_path += unicharset.get_script_from_script_id(script_id);
    xheights_path += ".xheights";

    std::string contents;
    if (!File::ReadFileToString(xheights_path, &contents)) continue;
    combined += contents;
  }
  return combined;
}

// Helper to set the properties for an input unicharset file, writes to the
// output file. If an appropriate script unicharset can be found in the
// script_dir directory, then the tops and bottoms are expanded using the
// script unicharset.
// If non-empty, xheight data for the fonts are written to the xheights_file.
//
// script_dir: directory searched for <script>.unicharset and
//     <script>.xheights files.
// input_unicharset_file: unicharset to load.
// output_unicharset_file: where the updated unicharset is saved.
// output_xheights_file: if non-empty, receives the combined xheights string.
//
// If the input unicharset cannot be loaded, an error is printed and the
// function returns without writing any output, so that a failed load cannot
// be mistaken for a successfully processed unicharset.
void SetPropertiesForInputFile(const std::string& script_dir,
                               const std::string& input_unicharset_file,
                               const std::string& output_unicharset_file,
                               const std::string& output_xheights_file) {
  UNICHARSET unicharset;

  // Load the input unicharset
  if (!unicharset.load_from_file(input_unicharset_file.c_str())) {
    tprintf("Failed to load unicharset from file %s\n",
            input_unicharset_file.c_str());
    return;
  }
  tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(),
          input_unicharset_file.c_str());

  // Set unichar properties
  tprintf("Setting unichar properties\n");
  // Errors are reported and the normalized form is not decomposed.
  SetupBasicProperties(true, false, &unicharset);
  tprintf("Setting script properties\n");
  SetScriptProperties(script_dir, &unicharset);
  if (!output_xheights_file.empty()) {
    std::string xheights_str = GetXheightString(script_dir, unicharset);
    File::WriteStringToFileOrDie(xheights_str, output_xheights_file);
  }

  // Write the output unicharset
  tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str());
  if (!unicharset.save_to_file(output_unicharset_file.c_str())) {
    tprintf("Failed to write unicharset to file %s\n",
            output_unicharset_file.c_str());
  }
}

}  // namespace tesseract
Major updates to training system as a result of extensive testing on 100 languages 2015-05-13 09:04:31 +08:00			`///////////////////////////////////////////////////////////////////////`
			`// File: unicharset_training_utils.cpp`
			`// Description: Training utilities for UNICHARSET.`
			`// Author: Ray Smith`
			`// Created: Fri Oct 17 17:09:01 PDT 2014`
			`//`
			`// (C) Copyright 2014, Google Inc.`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`
			`//`
			`///////////////////////////////////////////////////////////////////////`

			`#include "unicharset_training_utils.h"`

			`#include <stdlib.h>`
			`#include <string.h>`
			`#include <string>`
Fixes from pull of cleanups: clang tidied, reviewed, fixed new bugs, undeleted needed code. Probably breaks the build, due to some inclusion of changes in utf8/32 conversion 2017-07-15 00:30:14 +08:00			`#include <vector>`
Major updates to training system as a result of extensive testing on 100 languages 2015-05-13 09:04:31 +08:00
			`#include "fileio.h"`
			`#include "icuerrorcode.h"`
			`#include "normstrngs.h"`
			`#include "statistc.h"`
Fixes from pull of cleanups: clang tidied, reviewed, fixed new bugs, undeleted needed code. Probably breaks the build, due to some inclusion of changes in utf8/32 conversion 2017-07-15 00:30:14 +08:00			`#include "unichar.h"`
Major updates to training system as a result of extensive testing on 100 languages 2015-05-13 09:04:31 +08:00			`#include "unicharset.h"`
			`#include "unicode/uchar.h" // from libicu`
			`#include "unicode/uscript.h" // from libicu`

			`namespace tesseract {`

			`// Helper sets the character attribute properties and sets up the script table.`
			`// Does not set tops and bottoms.`
Added new LSTM-based neural network line recognizer 2016-11-08 07:38:07 +08:00			`void SetupBasicProperties(bool report_errors, bool decompose,`
			`UNICHARSET* unicharset) {`
Major updates to training system as a result of extensive testing on 100 languages 2015-05-13 09:04:31 +08:00			`for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) {`
			`// Convert any custom ligatures.`
			`const char* unichar_str = unicharset->id_to_unichar(unichar_id);`
training: Replace NULL by nullptr Signed-off-by: Stefan Weil <sw@weilnetz.de> 2016-12-13 15:08:01 +08:00			`for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) {`
Major updates to training system as a result of extensive testing on 100 languages 2015-05-13 09:04:31 +08:00			`if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) {`
			`unichar_str = UNICHARSET::kCustomLigatures[i][0];`
			`break;`
			`}`
			`}`

			`// Convert the unichar to UTF32 representation`
Fixes from pull of cleanups: clang tidied, reviewed, fixed new bugs, undeleted needed code. Probably breaks the build, due to some inclusion of changes in utf8/32 conversion 2017-07-15 00:30:14 +08:00			`std::vector<char32> uni_vector = UNICHAR::UTF8ToUTF32(unichar_str);`
Major updates to training system as a result of extensive testing on 100 languages 2015-05-13 09:04:31 +08:00
			`// Assume that if the property is true for any character in the string,`
			`// then it holds for the whole "character".`
			`bool unichar_isalpha = false;`
			`bool unichar_islower = false;`
			`bool unichar_isupper = false;`
			`bool unichar_isdigit = false;`
			`bool unichar_ispunct = false;`

Fixes from pull of cleanups: clang tidied, reviewed, fixed new bugs, undeleted needed code. Probably breaks the build, due to some inclusion of changes in utf8/32 conversion 2017-07-15 00:30:14 +08:00			`for (char32 u_ch : uni_vector) {`
			`if (u_isalpha(u_ch)) unichar_isalpha = true;`
			`if (u_islower(u_ch)) unichar_islower = true;`
			`if (u_isupper(u_ch)) unichar_isupper = true;`
			`if (u_isdigit(u_ch)) unichar_isdigit = true;`
			`if (u_ispunct(u_ch)) unichar_ispunct = true;`
Major updates to training system as a result of extensive testing on 100 languages 2015-05-13 09:04:31 +08:00			`}`

			`unicharset->set_isalpha(unichar_id, unichar_isalpha);`
			`unicharset->set_islower(unichar_id, unichar_islower);`
			`unicharset->set_isupper(unichar_id, unichar_isupper);`
			`unicharset->set_isdigit(unichar_id, unichar_isdigit);`
			`unicharset->set_ispunctuation(unichar_id, unichar_ispunct);`

			`tesseract::IcuErrorCode err;`
			`unicharset->set_script(unichar_id, uscript_getName(`
			`uscript_getScript(uni_vector[0], err)));`

			`const int num_code_points = uni_vector.size();`
			`// Obtain the lower/upper case if needed and record it in the properties.`
			`unicharset->set_other_case(unichar_id, unichar_id);`
			`if (unichar_islower \|\| unichar_isupper) {`
Fixes from pull of cleanups: clang tidied, reviewed, fixed new bugs, undeleted needed code. Probably breaks the build, due to some inclusion of changes in utf8/32 conversion 2017-07-15 00:30:14 +08:00			`std::vector<char32> other_case(num_code_points, 0);`
Major updates to training system as a result of extensive testing on 100 languages 2015-05-13 09:04:31 +08:00			`for (int i = 0; i < num_code_points; ++i) {`
			`// TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used.`
			`// However since they deal with UChars (so need a conversion function`
			`// from char32 or UTF8string) and require a meaningful locale string,`
			`// for now u_tolower()/u_toupper() are used.`
			`other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) :`
			`u_tolower(uni_vector[i]);`
			`}`
Remove old code for string class (no longer needed) (#1354) * Remove old code for string class (no longer needed) Signed-off-by: Stefan Weil <sw@weilnetz.de> * Add std namespace to string class Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-03-03 21:36:28 +08:00			`std::string other_case_uch = UNICHAR::UTF32ToUTF8(other_case);`
Major updates to training system as a result of extensive testing on 100 languages 2015-05-13 09:04:31 +08:00			`UNICHAR_ID other_case_id =`
			`unicharset->unichar_to_id(other_case_uch.c_str());`
			`if (other_case_id != INVALID_UNICHAR_ID) {`
			`unicharset->set_other_case(unichar_id, other_case_id);`
			`} else if (unichar_id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) {`
			`tprintf("Other case %s of %s is not in unicharset\n",`
			`other_case_uch.c_str(), unichar_str);`
			`}`
			`}`

			`// Set RTL property and obtain mirror unichar ID from ICU.`
Fixes from pull of cleanups: clang tidied, reviewed, fixed new bugs, undeleted needed code. Probably breaks the build, due to some inclusion of changes in utf8/32 conversion 2017-07-15 00:30:14 +08:00			`std::vector<char32> mirrors(num_code_points, 0);`
Major updates to training system as a result of extensive testing on 100 languages 2015-05-13 09:04:31 +08:00			`for (int i = 0; i < num_code_points; ++i) {`
			`mirrors[i] = u_charMirror(uni_vector[i]);`
			`if (i == 0) { // set directionality to that of the 1st code point`
			`unicharset->set_direction(unichar_id,`
			`static_cast<UNICHARSET::Direction>(`
			`u_charDirection(uni_vector[i])));`
			`}`
			`}`
Remove old code for string class (no longer needed) (#1354) * Remove old code for string class (no longer needed) Signed-off-by: Stefan Weil <sw@weilnetz.de> * Add std namespace to string class Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-03-03 21:36:28 +08:00			`std::string mirror_uch = UNICHAR::UTF32ToUTF8(mirrors);`
Major updates to training system as a result of extensive testing on 100 languages 2015-05-13 09:04:31 +08:00			`UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str());`
			`if (mirror_uch_id != INVALID_UNICHAR_ID) {`
			`unicharset->set_mirror(unichar_id, mirror_uch_id);`
			`} else if (report_errors) {`
			`tprintf("Mirror %s of %s is not in unicharset\n",`
			`mirror_uch.c_str(), unichar_str);`
			`}`

			`// Record normalized version of this unichar.`
Remove old code for string class (no longer needed) (#1354) * Remove old code for string class (no longer needed) Signed-off-by: Stefan Weil <sw@weilnetz.de> * Add std namespace to string class Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-03-03 21:36:28 +08:00			`std::string normed_str;`
Added script-specific validation and normalization for virama-using scripts and updated normalization for others 2017-07-15 01:05:05 +08:00			`if (unichar_id != 0 &&`
			`tesseract::NormalizeUTF8String(`
			`decompose ? tesseract::UnicodeNormMode::kNFKD`
			`: tesseract::UnicodeNormMode::kNFKC,`
			`tesseract::OCRNorm::kNormalize, tesseract::GraphemeNorm::kNone,`
			`unichar_str, &normed_str) &&`
			`!normed_str.empty()) {`
Major updates to training system as a result of extensive testing on 100 languages 2015-05-13 09:04:31 +08:00			`unicharset->set_normed(unichar_id, normed_str.c_str());`
			`} else {`
			`unicharset->set_normed(unichar_id, unichar_str);`
			`}`
			`ASSERT_HOST(unicharset->get_other_case(unichar_id) < unicharset->size());`
			`}`
			`unicharset->post_load_setup();`
			`}`

Part 2 of separating out the unicharset from the LSTM model, fixing command line for training 2017-08-03 04:29:23 +08:00			`// Helper sets the properties from universal script unicharsets, if found.`
Remove old code for string class (no longer needed) (#1354) * Remove old code for string class (no longer needed) Signed-off-by: Stefan Weil <sw@weilnetz.de> * Add std namespace to string class Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-03-03 21:36:28 +08:00			`void SetScriptProperties(const std::string& script_dir, UNICHARSET* unicharset) {`
Part 2 of separating out the unicharset from the LSTM model, fixing command line for training 2017-08-03 04:29:23 +08:00			`for (int s = 0; s < unicharset->get_script_table_size(); ++s) {`
			`// Load the unicharset for the script if available.`
Remove old code for string class (no longer needed) (#1354) * Remove old code for string class (no longer needed) Signed-off-by: Stefan Weil <sw@weilnetz.de> * Add std namespace to string class Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-03-03 21:36:28 +08:00			`std::string filename = script_dir + "/" +`
Part 2 of separating out the unicharset from the LSTM model, fixing command line for training 2017-08-03 04:29:23 +08:00			`unicharset->get_script_from_script_id(s) + ".unicharset";`
			`UNICHARSET script_set;`
			`if (script_set.load_from_file(filename.c_str())) {`
			`unicharset->SetPropertiesFromOther(script_set);`
			`} else if (s != unicharset->common_sid() && s != unicharset->null_sid()) {`
			`tprintf("Failed to load script unicharset from:%s\n", filename.c_str());`
			`}`
			`}`
			`for (int c = SPECIAL_UNICHAR_CODES_COUNT; c < unicharset->size(); ++c) {`
			`if (unicharset->PropertiesIncomplete(c)) {`
			`tprintf("Warning: properties incomplete for index %d = %s\n", c,`
			`unicharset->id_to_unichar(c));`
			`}`
			`}`
			`}`

			`// Helper gets the combined x-heights string.`
Remove old code for string class (no longer needed) (#1354) * Remove old code for string class (no longer needed) Signed-off-by: Stefan Weil <sw@weilnetz.de> * Add std namespace to string class Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-03-03 21:36:28 +08:00			`std::string GetXheightString(const std::string& script_dir,`
Part 2 of separating out the unicharset from the LSTM model, fixing command line for training 2017-08-03 04:29:23 +08:00			`const UNICHARSET& unicharset) {`
Remove old code for string class (no longer needed) (#1354) * Remove old code for string class (no longer needed) Signed-off-by: Stefan Weil <sw@weilnetz.de> * Add std namespace to string class Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-03-03 21:36:28 +08:00			`std::string xheights_str;`
Part 2 of separating out the unicharset from the LSTM model, fixing command line for training 2017-08-03 04:29:23 +08:00			`for (int s = 0; s < unicharset.get_script_table_size(); ++s) {`
			`// Load the xheights for the script if available.`
Remove old code for string class (no longer needed) (#1354) * Remove old code for string class (no longer needed) Signed-off-by: Stefan Weil <sw@weilnetz.de> * Add std namespace to string class Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-03-03 21:36:28 +08:00			`std::string filename = script_dir + "/" +`
Part 2 of separating out the unicharset from the LSTM model, fixing command line for training 2017-08-03 04:29:23 +08:00			`unicharset.get_script_from_script_id(s) + ".xheights";`
Remove old code for string class (no longer needed) (#1354) * Remove old code for string class (no longer needed) Signed-off-by: Stefan Weil <sw@weilnetz.de> * Add std namespace to string class Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-03-03 21:36:28 +08:00			`std::string script_heights;`
Part 2 of separating out the unicharset from the LSTM model, fixing command line for training 2017-08-03 04:29:23 +08:00			`if (File::ReadFileToString(filename, &script_heights))`
			`xheights_str += script_heights;`
			`}`
			`return xheights_str;`
			`}`

Major updates to training system as a result of extensive testing on 100 languages 2015-05-13 09:04:31 +08:00			`// Helper to set the properties for an input unicharset file, writes to the`
			`// output file. If an appropriate script unicharset can be found in the`
			`// script_dir directory, then the tops and bottoms are expanded using the`
			`// script unicharset.`
			`// If non-empty, xheight data for the fonts are written to the xheights_file.`
Remove old code for string class (no longer needed) (#1354) * Remove old code for string class (no longer needed) Signed-off-by: Stefan Weil <sw@weilnetz.de> * Add std namespace to string class Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-03-03 21:36:28 +08:00			`void SetPropertiesForInputFile(const std::string& script_dir,`
			`const std::string& input_unicharset_file,`
			`const std::string& output_unicharset_file,`
			`const std::string& output_xheights_file) {`
Major updates to training system as a result of extensive testing on 100 languages 2015-05-13 09:04:31 +08:00			`UNICHARSET unicharset;`

			`// Load the input unicharset`
			`unicharset.load_from_file(input_unicharset_file.c_str());`
			`tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(),`
			`input_unicharset_file.c_str());`

			`// Set unichar properties`
			`tprintf("Setting unichar properties\n");`
Added new LSTM-based neural network line recognizer 2016-11-08 07:38:07 +08:00			`SetupBasicProperties(true, false, &unicharset);`
Part 2 of separating out the unicharset from the LSTM model, fixing command line for training 2017-08-03 04:29:23 +08:00			`tprintf("Setting script properties\n");`
			`SetScriptProperties(script_dir, &unicharset);`
			`if (!output_xheights_file.empty()) {`
Remove old code for string class (no longer needed) (#1354) * Remove old code for string class (no longer needed) Signed-off-by: Stefan Weil <sw@weilnetz.de> * Add std namespace to string class Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-03-03 21:36:28 +08:00			`std::string xheights_str = GetXheightString(script_dir, unicharset);`
Major updates to training system as a result of extensive testing on 100 languages 2015-05-13 09:04:31 +08:00			`File::WriteStringToFileOrDie(xheights_str, output_xheights_file);`
			`}`

			`// Write the output unicharset`
			`tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str());`
			`unicharset.save_to_file(output_unicharset_file.c_str());`
			`}`

			`} // namespace tesseract`