tesseract/training/combine_tessdata.cpp

///////////////////////////////////////////////////////////////////////
// File:        combine_tessdata
// Description: Creates a unified traineddata file from several
//              data files produced by the training process.
// Author:      Daria Antonova
// Created:     Wed Jun 03 11:26:43 PST 2009
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#include "tessdatamanager.h"

// Main program to combine/extract/overwrite tessdata components
// in [lang].traineddata files.
//
// To combine all the individual tessdata components (unicharset, DAWGs,
// classifier templates, ambiguities, language configs) located at, say,
// /home/$USER/temp/eng.* run:
//
//   combine_tessdata /home/$USER/temp/eng.
//
// The result will be a combined tessdata file /home/$USER/temp/eng.traineddata
//
// Specify option -e if you would like to extract individual components
// from a combined traineddata file. For example, to extract language config
// file and the unicharset from tessdata/eng.traineddata run:
//
//   combine_tessdata -e tessdata/eng.traineddata
//   /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
//
// The desired config file and unicharset will be written to
// /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
//
// Specify option -o to overwrite individual components of the given
// [lang].traineddata file. For example, to overwrite language config
// and unichar ambiguities files in tessdata/eng.traineddata use:
//
//   combine_tessdata -o tessdata/eng.traineddata
//   /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs
//
// As a result, tessdata/eng.traineddata will contain the new language config
// and unichar ambigs, plus all the original DAWGs, classifier templates, etc.
//
// Note: the file names of the files to extract to and to overwrite from should
// have the appropriate file suffixes (extensions) indicating their tessdata
// component type (.unicharset for the unicharset, .unicharambigs for unichar
// ambigs, etc). See the k*FileSuffix variables in ccutil/tessdatamanager.h.
//
// Specify option -u to unpack all the components to the specified path:
//
//   combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.
//
// This will create /home/$USER/temp/eng.* files with individual tessdata
// components from tessdata/eng.traineddata.
//
// Entry point. Dispatches on the command line:
//   <prefix>                      combine the component files into
//                                 <prefix>traineddata
//   -e <traineddata> <files...>   extract the named components
//   -u <traineddata> <prefix>     unpack every component to <prefix>*
//   -o <traineddata> <files...>   overwrite the named components
// Anything else prints the usage text.
//
// Returns 0 on success, and 1 if the arguments are not recognized or if
// combining, opening the traineddata file, or overwriting fails. A component
// that is simply absent during -e/-u is reported (for -e) but is not treated
// as a failure.
int main(int argc, char **argv) {
  if (argc == 2) {
    printf("Combining tessdata files\n");
    STRING output_file = argv[1];
    output_file += kTrainedDataSuffix;
    if (!tesseract::TessdataManager::CombineDataFiles(
        argv[1], output_file.string())) {
      printf("Error combining tessdata files into %s\n",
             output_file.string());
      // Check the length first: indexing [len - 1] on an empty prefix would
      // read before the start of the argument string.
      const size_t prefix_len = strlen(argv[1]);
      if (prefix_len == 0 || argv[1][prefix_len - 1] != '.')
        printf("Hint: the prefix is missing a period (.)\n");
      return 1;
    }
  } else if (argc >= 4 && (strcmp(argv[1], "-e") == 0 ||
                           strcmp(argv[1], "-u") == 0)) {
    // Initialize TessdataManager with the data in the given traineddata file.
    tesseract::TessdataManager tm;
    if (!tm.Init(argv[2], 0)) {
      printf("Failed to read %s\n", argv[2]);
      return 1;
    }
    printf("Extracting tessdata components from %s\n", argv[2]);
    if (strcmp(argv[1], "-e") == 0) {
      // Extract only the components named on the command line.
      for (int i = 3; i < argc; ++i) {
        if (tm.ExtractToFile(argv[i])) {
          printf("Wrote %s\n", argv[i]);
        } else {
          printf("Not extracting %s, since this component"
                 " is not present\n", argv[i]);
        }
      }
    } else {  // extract all the components
      // Try every known suffix appended to the output prefix; suffixes for
      // which ExtractToFile() reports failure are skipped silently.
      for (int i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) {
        STRING filename = argv[3];
        filename += tesseract::kTessdataFileSuffixes[i];
        if (tm.ExtractToFile(filename.string())) {
          printf("Wrote %s\n", filename.string());
        }
      }
    }
    tm.End();
  } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) {
    // Rename the current traineddata file to a temporary name.
    const char *new_traineddata_filename = argv[2];
    STRING traineddata_filename = new_traineddata_filename;
    traineddata_filename += ".__tmp__";
    if (rename(new_traineddata_filename, traineddata_filename.string()) != 0) {
      tprintf("Failed to create a temporary file %s\n",
              traineddata_filename.string());
      return 1;
    }

    // Initialize TessdataManager with the data in the renamed file.
    tesseract::TessdataManager tm;
    if (!tm.Init(traineddata_filename.string(), 0)) {
      // The original data has already been moved, so say where it now lives.
      tprintf("Failed to read %s (the original traineddata was renamed"
              " to this file)\n", traineddata_filename.string());
      return 1;
    }

    // Write the updated traineddata file.
    const bool overwritten =
        tm.OverwriteComponents(new_traineddata_filename, argv + 3, argc - 3);
    tm.End();
    if (!overwritten) {
      tprintf("Failed to write %s; the original data is kept in %s\n",
              new_traineddata_filename, traineddata_filename.string());
      return 1;
    }
  } else {
    printf("Usage for combining tessdata components:\n"
           "%s language_data_path_prefix (e.g. tessdata/eng.)\n", argv[0]);
    printf("Usage for extracting tessdata components:\n"
           "%s -e traineddata_file [output_component_file...]\n", argv[0]);
    printf("Usage for overwriting tessdata components:\n"
           "%s -o traineddata_file [input_component_file...]\n", argv[0]);
    printf("Usage for unpacking all tessdata components:\n"
           "%s -u traineddata_file output_path_prefix"
           " (e.g. /tmp/eng.)\n", argv[0]);
    return 1;
  }
  return 0;
}
Changes to training for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@302 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:44:07 +08:00			`///////////////////////////////////////////////////////////////////////`
			`// File: combine_tessdata`
			`// Description: Creates a unified traineddata file from several`
			`// data files produced by the training process.`
			`// Author: Daria Antonova`
			`// Created: Wed Jun 03 11:26:43 PST 2009`
			`//`
			`// (C) Copyright 2009, Google Inc.`
			`// Licensed under the Apache License, Version 2.0 (the "License");`
			`// you may not use this file except in compliance with the License.`
			`// You may obtain a copy of the License at`
			`// http://www.apache.org/licenses/LICENSE-2.0`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the License is distributed on an "AS IS" BASIS,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the License for the specific language governing permissions and`
			`// limitations under the License.`
			`//`
			`///////////////////////////////////////////////////////////////////////`

			`#include "tessdatamanager.h"`

Updated tessdatamanager/combine_tessdata to give more functionality git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@353 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-05-21 07:07:24 +08:00			`// Main program to combine/extract/overwrite tessdata components`
			`// in [lang].traineddata files.`
			`//`
			`// To combine all the individual tessdata components (unicharset, DAWGs,`
			`// classifier templates, ambiguities, language configs) located at, say,`
			`// /home/$USER/temp/eng.* run:`
			`//`
			`// combine_tessdata /home/$USER/temp/eng.`
			`//`
			`// The result will be a combined tessdata file /home/$USER/temp/eng.traineddata`
			`//`
			`// Specify option -e if you would like to extract individual components`
			`// from a combined traineddata file. For example, to extract language config`
			`// file and the unicharset from tessdata/eng.traineddata run:`
			`//`
			`// combine_tessdata -e tessdata/eng.traineddata`
			`// /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset`
			`//`
			`// The desired config file and unicharset will be written to`
			`// /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset`
			`//`
			`// Specify option -o to overwrite individual components of the given`
			`// [lang].traineddata file. For example, to overwrite language config`
			`// and unichar ambiguities files in tessdata/eng.traineddata use:`
			`//`
			`// combine_tessdata -o tessdata/eng.traineddata`
			`// /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs`
			`//`
			`// As a result, tessdata/eng.traineddata will contain the new language config`
			`// and unichar ambigs, plus all the original DAWGs, classifier teamples, etc.`
			`//`
			`// Note: the file names of the files to extract to and to overwrite from should`
			`// have the appropriate file suffixes (extensions) indicating their tessdata`
			`// component type (.unicharset for the unicharset, .unicharambigs for unichar`
			`// ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h.`
			`//`
			`// Specify option -u to unpack all the components to the specified path:`
			`//`
			`// combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.`
			`//`
			`// This will create /home/$USER/temp/eng.* files with individual tessdata`
			`// components from tessdata/eng.traineddata.`
			`//`
Changes to training for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@302 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:44:07 +08:00			`int main(int argc, char **argv) {`
Updated tessdatamanager/combine_tessdata to give more functionality git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@353 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-05-21 07:07:24 +08:00			`int i;`
			`if (argc == 2) {`
a better prompt for the user to include '.'; reverts Zdenko's message changes: the original example was a /real/ example git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@464 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-09-28 09:03:17 +08:00			`printf("Combining tessdata files\n");`
Updated tessdatamanager/combine_tessdata to give more functionality git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@353 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-05-21 07:07:24 +08:00			`STRING output_file = argv[1];`
			`output_file += kTrainedDataSuffix;`
			`if (!tesseract::TessdataManager::CombineDataFiles(`
			`argv[1], output_file.string())) {`
a better prompt for the user to include '.'; reverts Zdenko's message changes: the original example was a /real/ example git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@464 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-09-28 09:03:17 +08:00			`char* last = &argv[1][strlen(argv[1])-1];`
Updated tessdatamanager/combine_tessdata to give more functionality git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@353 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-05-21 07:07:24 +08:00			`printf("Error combining tessdata files into %s\n",`
			`output_file.string());`
a better prompt for the user to include '.'; reverts Zdenko's message changes: the original example was a /real/ example git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@464 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-09-28 09:03:17 +08:00			`if (*last != '.')`
			`printf("Hint: the prefix is missing a period (.)\n");`
Updated tessdatamanager/combine_tessdata to give more functionality git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@353 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-05-21 07:07:24 +08:00			`}`
			`} else if (argc >= 4 && (strcmp(argv[1], "-e") == 0 \|\|`
			`strcmp(argv[1], "-u") == 0)) {`
			`// Initialize TessdataManager with the data in the given traineddata file.`
			`tesseract::TessdataManager tm;`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`tm.Init(argv[2], 0);`
Updated tessdatamanager/combine_tessdata to give more functionality git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@353 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-05-21 07:07:24 +08:00			`printf("Extracting tessdata components from %s\n", argv[2]);`
			`if (strcmp(argv[1], "-e") == 0) {`
			`for (i = 3; i < argc; ++i) {`
			`if (tm.ExtractToFile(argv[i])) {`
			`printf("Wrote %s\n", argv[i]);`
			`} else {`
			`printf("Not extracting %s, since this component"`
			`" is not present\n", argv[i]);`
			`}`
			`}`
			`} else { // extract all the components`
			`for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) {`
			`STRING filename = argv[3];`
			`filename += tesseract::kTessdataFileSuffixes[i];`
			`if (tm.ExtractToFile(filename.string())) {`
			`printf("Wrote %s\n", filename.string());`
			`}`
			`}`
			`}`
			`tm.End();`
			`} else if (argc >= 4 && strcmp(argv[1], "-o") == 0) {`
			`// Rename the current traineddata file to a temporary name.`
			`const char *new_traineddata_filename = argv[2];`
			`STRING traineddata_filename = new_traineddata_filename;`
			`traineddata_filename += ".__tmp__";`
			`if (rename(new_traineddata_filename, traineddata_filename.string()) != 0) {`
			`tprintf("Failed to create a temporary file %s\n",`
			`traineddata_filename.string());`
			`exit(1);`
			`}`

			`// Initialize TessdataManager with the data in the given traineddata file.`
			`tesseract::TessdataManager tm;`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`tm.Init(traineddata_filename.string(), 0);`
Updated tessdatamanager/combine_tessdata to give more functionality git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@353 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-05-21 07:07:24 +08:00
			`// Write the updated traineddata file.`
			`tm.OverwriteComponents(new_traineddata_filename, argv+3, argc-3);`
			`tm.End();`
			`} else {`
			`printf("Usage for combining tessdata components:\n"`
a better prompt for the user to include '.'; reverts Zdenko's message changes: the original example was a /real/ example git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@464 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-09-28 09:03:17 +08:00			`"%s language_data_path_prefix (e.g. tessdata/eng.)\n", argv[0]);`
			`printf("Usage for extracting tessdata components:\n"`
			`"%s -e traineddata_file [output_component_file...]\n", argv[0]);`
			`printf("Usage for overwriting tessdata components:\n"`
			`"%s -o traineddata_file [input_component_file...]\n", argv[0]);`
			`printf("Usage for unpacking all tessdata components:\n"`
			`"%s -u traineddata_file output_path_prefix"`
			`" (e.g. /tmp/eng.)\n", argv[0]);`
Changes to training for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@302 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:44:07 +08:00			`return 1;`
			`}`
			`}`