#!/bin/bash
# tesseract/src/training/tesstrain.sh
#
# (C) Copyright 2014, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script provides an easy way to execute various phases of training
# Tesseract.  For a detailed description of the phases, see
# https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract
#

#######################################
# Print the command-line help text for tesstrain.sh.
# Arguments: none
# Outputs:   writes the usage message to stdout
# Returns:   exit status of cat (0 on success)
#######################################
display_usage() {
  # The here-doc delimiter is quoted so the text is emitted literally: the
  # embedded double quotes around the exposure example and the ${...}
  # placeholders in the file-naming hint must be shown to the user as
  # written, not consumed or expanded by the shell.
  cat <<'EOF'
USAGE: tesstrain.sh
     --exposures EXPOSURES      # A list of exposure levels to use (e.g. "-1 0 1").
     --fontlist FONTS           # A list of fontnames to train on.
     --fonts_dir FONTS_PATH     # Path to font files.
     --lang LANG_CODE           # ISO 639 code.
     --langdata_dir DATADIR     # Path to tesseract/training/langdata directory.
     --linedata_only            # Only generate training data for lstmtraining.
     --output_dir OUTPUTDIR     # Location of output traineddata file.
     --overwrite                # Safe to overwrite files in output_dir.
     --run_shape_clustering     # Run shape clustering (use for Indic langs).
     --maxpages                 # Specify maximum pages to output (default:0=all)
     --save_box_tiff            # Save box/tiff pairs along with lstmf files.
     --xsize                    # Specify width of output image (default:3600)

  OPTIONAL flag for specifying directory with user specified box/tiff pairs.
  Files should be named similar to ${LANG_CODE}.${fontname}.exp${EXPOSURE}.box/tif
     --my_boxtiff_dir MY_BOXTIFF_DIR # Location of user specified box/tiff files.

  OPTIONAL flags for input data. If unspecified we will look for them in
  the langdata_dir directory.
     --training_text TEXTFILE   # Text to render and use for training.
     --wordlist WORDFILE        # Word list for the language ordered by
                                # decreasing frequency.
  OPTIONAL flag to specify location of existing traineddata files, required
  during feature extraction. If unspecified will use TESSDATA_PREFIX defined in
  the current environment.
     --tessdata_dir TESSDATADIR     # Path to tesseract/tessdata directory.
  NOTE:
  The font names specified in --fontlist need to be recognizable by Pango using
  fontconfig. An easy way to list the canonical names of all fonts available on
  your system is to run text2image with --list_available_fonts and the
  appropriate --fonts_dir path.
EOF
}

# Load the helper script that sits next to this one. The functions used
# later in this file but not defined here (parse_flags, tlog, phase_*, ...)
# are presumably provided by it.
source "$(dirname "$0")/tesstrain_utils.sh"

# Print the help text and exit successfully when called without arguments
# or with an explicit help flag. This single check also covers the
# "no arguments" case, so no separate $# test is needed afterwards.
if [[ $# -eq 0 || "$1" == "--help" || "$1" == "-h" ]]; then
    display_usage
    exit 0
fi

# Keep the original arguments in ARGV; parse_flags takes no parameters here,
# so it presumably reads them from this array.
ARGV=("$@")
parse_flags

mkdir -p "${TRAINING_DIR}"

# Optionally seed the training directory with user-supplied box/tiff pairs.
if [[ -n "${MY_BOXTIFF_DIR}" ]]; then
    tlog "\n=== Copy existing box/tiff pairs from '${MY_BOXTIFF_DIR}'"
    # Best effort: a directory holding only *.box or only *.tif files (or
    # neither) must not abort the run, so a failing cp is ignored on purpose.
    cp "${MY_BOXTIFF_DIR}"/*.box "${TRAINING_DIR}" || true
    cp "${MY_BOXTIFF_DIR}"/*.tif "${TRAINING_DIR}" || true
    ls -l "${TRAINING_DIR}"
fi

tlog "\n=== Starting training for language '${LANG_CODE}'"

# Pull in the per-language settings script that sits next to this one and
# apply the settings for the requested language.
source "$(dirname "$0")/language-specific.sh"
set_lang_specific_parameters "${LANG_CODE}"

initialize_fontconfig

# Steps shared by both training modes.
phase_I_generate_image 8
phase_UP_generate_unicharset
if ((LINEDATA)); then
  # LINEDATA is non-zero: only produce the lstmf line data and the starter
  # traineddata; the actual LSTM training is a separate step for the user.
  phase_E_extract_features " --psm 6  lstm.train " 8 "lstmf"
  make__lstmdata
  tlog "\nCreated starter traineddata for LSTM training of language '${LANG_CODE}'\n"
  tlog "\nRun 'lstmtraining' command next to continue LSTM training for language '${LANG_CODE}'\n"
else
  # Otherwise run the remaining phases in order and build the final
  # traineddata.
  phase_D_generate_dawg
  phase_E_extract_features "box.train" 8 "tr"
  phase_C_cluster_prototypes "${TRAINING_DIR}/${LANG_CODE}.normproto"
  phase_S_cluster_shapes
  phase_M_cluster_microfeatures
  phase_B_generate_ambiguities
  make__traineddata
  tlog "\nCompleted training for language '${LANG_CODE}'\n"
fi
Added tesstrain.sh - a master training script git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1146 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2014-08-12 07:20:56 +08:00			`#!/bin/bash`
			`# (C) Copyright 2014, Google Inc.`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`
			`# This script provides an easy way to execute various phases of training`
			`# Tesseract. For a detailed description of the phases, see`
change links from code.google.com to github.com 2015-07-11 15:43:31 +08:00			`# https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract`
Added tesstrain.sh - a master training script git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1146 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2014-08-12 07:20:56 +08:00			`#`

print help for tesstrain.sh; fixes #1469 2018-10-02 17:35:10 +08:00			`display_usage() {`
			`echo -e "USAGE: tesstrain.sh`
add --xsize as parameter for tesstrain 2019-01-27 15:00:25 +08:00			`--exposures EXPOSURES # A list of exposure levels to use (e.g. "-1 0 1").`
print help for tesstrain.sh; fixes #1469 2018-10-02 17:35:10 +08:00			`--fontlist FONTS # A list of fontnames to train on.`
			`--fonts_dir FONTS_PATH # Path to font files.`
			`--lang LANG_CODE # ISO 639 code.`
			`--langdata_dir DATADIR # Path to tesseract/training/langdata directory.`
add --xsize as parameter for tesstrain 2019-01-27 15:00:25 +08:00			`--linedata_only # Only generate training data for lstmtraining.`
print help for tesstrain.sh; fixes #1469 2018-10-02 17:35:10 +08:00			`--output_dir OUTPUTDIR # Location of output traineddata file.`
			`--overwrite # Safe to overwrite files in output_dir.`
			`--run_shape_clustering # Run shape clustering (use for Indic langs).`
add --xsize as parameter for tesstrain 2019-01-27 15:00:25 +08:00			`--maxpages # Specify maximum pages to output (default:0=all)`
			`--save_box_tiff # Save box/tiff pairs along with lstmf files.`
allow user specified box/tiff pairs with tesstrain.sh 2019-02-01 03:03:05 +08:00			`<<<<<<< HEAD`
add --xsize as parameter for tesstrain 2019-01-27 15:00:25 +08:00			`--x_size # Specify width of output image (default:3600)`
allow user specified box/tiff pairs with tesstrain.sh 2019-02-01 03:03:05 +08:00			`=======`
			`--xsize # Specify width of output image (default:3600)`

			`OPTIONAL flag for specifying directory with user specified box/tiff pairs.`
			`Files should be named similar to ${LANG_CODE}.${fontname}.exp${EXPOSURE}.box/tif`
			`--my_boxtiff_dir MY_BOXTIFF_DIR # Location of user specified box/tiff files.`
			`>>>>>>> c7cd112... allow box/tiff pairs for LSTM training`
add --xsize as parameter for tesstrain 2019-01-27 15:00:25 +08:00
print help for tesstrain.sh; fixes #1469 2018-10-02 17:35:10 +08:00			`OPTIONAL flags for input data. If unspecified we will look for them in`
			`the langdata_dir directory.`
			`--training_text TEXTFILE # Text to render and use for training.`
			`--wordlist WORDFILE # Word list for the language ordered by`
			`# decreasing frequency.`
			`OPTIONAL flag to specify location of existing traineddata files, required`
			`during feature extraction. If unspecified will use TESSDATA_PREFIX defined in`
			`the current environment.`
			`--tessdata_dir TESSDATADIR # Path to tesseract/tessdata directory.`
			`NOTE:`
			`The font names specified in --fontlist need to be recognizable by Pango using`
			`fontconfig. An easy way to list the canonical names of all fonts available on`
			`your system is to run text2image with --list_available_fonts and the`
			`appropriate --fonts_dir path."`
			`}`
Added tesstrain.sh - a master training script git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1146 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2014-08-12 07:20:56 +08:00
Fix some issues reported by shellcheck (SC2004, SC2006) Examples: In training/tesstrain.sh line 64: if (( ${LINEDATA} )); then ^-- SC2004: $/${} is unnecessary on arithmetic variables. In training/tesstrain.sh line 56: source `dirname $0`/language-specific.sh ^-- SC2006: Use $(..) instead of legacy `..`. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2016-12-14 21:11:24 +08:00			`source "$(dirname $0)/tesstrain_utils.sh"`
Exit training script if run command failed; fixes #2005 2018-10-20 19:00:39 +08:00			`if [[ $# -eq 0 \|\| "$1" == "--help" \|\| "$1" == "-h" ]]; then`
print help for tesstrain.sh; fixes #1469 2018-10-02 17:35:10 +08:00			`display_usage`
			`exit 0`
			`fi`
			`if [ $# == 0 ]; then`
			`display_usage`
			`exit 1`
			`fi`
Added tesstrain.sh - a master training script git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1146 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2014-08-12 07:20:56 +08:00
			`ARGV=("$@")`
			`parse_flags`

			`mkdir -p ${TRAINING_DIR}`
allow user specified box/tiff pairs with tesstrain.sh 2019-02-01 03:03:05 +08:00
			`if [[ ${MY_BOXTIFF_DIR} != "" ]]; then`
			`tlog "\n=== Copy existing box/tiff pairs from '${MY_BOXTIFF_DIR}'"`
			`cp ${MY_BOXTIFF_DIR}/*.box ${TRAINING_DIR} \| true`
			`cp ${MY_BOXTIFF_DIR}/*.tif ${TRAINING_DIR} \| true`
			`ls -l ${TRAINING_DIR}`
			`fi`

Use mktemp to create workspace directory mktemp is a better idea for security, as well as enabling users to specify a different directory using the TMPDIR environment variable, which is useful if /tmp is a small tmpfs. Also fix a bug where the first few log messages were failing as the workspace directory wasn't been created early enough. 2015-09-10 22:05:07 +08:00			`tlog "\n=== Starting training for language '${LANG_CODE}'"`
Added tesstrain.sh - a master training script git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1146 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2014-08-12 07:20:56 +08:00
Fix some issues reported by shellcheck (SC2004, SC2006) Examples: In training/tesstrain.sh line 64: if (( ${LINEDATA} )); then ^-- SC2004: $/${} is unnecessary on arithmetic variables. In training/tesstrain.sh line 56: source `dirname $0`/language-specific.sh ^-- SC2006: Use $(..) instead of legacy `..`. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2016-12-14 21:11:24 +08:00			`source "$(dirname $0)/language-specific.sh"`
Major updates to training system as a result of extensive testing on 100 languages 2015-05-13 09:04:31 +08:00			`set_lang_specific_parameters ${LANG_CODE}`

			`initialize_fontconfig`

			`phase_I_generate_image 8`
			`phase_UP_generate_unicharset`
Fix some issues reported by shellcheck (SC2004, SC2006) Examples: In training/tesstrain.sh line 64: if (( ${LINEDATA} )); then ^-- SC2004: $/${} is unnecessary on arithmetic variables. In training/tesstrain.sh line 56: source `dirname $0`/language-specific.sh ^-- SC2006: Use $(..) instead of legacy `..`. Signed-off-by: Stefan Weil <sw@weilnetz.de> 2016-12-14 21:11:24 +08:00			`if ((LINEDATA)); then`
Javanese script training 2018-08-16 20:15:10 +08:00			`phase_E_extract_features " --psm 6 lstm.train " 8 "lstmf"`
Fixes to training process to allow incremental training from a recognition model 2016-12-01 07:51:17 +08:00			`make__lstmdata`
allow user specified box/tiff pairs with tesstrain.sh 2019-02-01 03:03:05 +08:00			`tlog "\nCreated starter traineddata for LSTM training of language '${LANG_CODE}'\n"`
			`tlog "\nRun 'lstmtraining' comman next to continue LSTM training for language '${LANG_CODE}'\n"`
Fixes to training process to allow incremental training from a recognition model 2016-12-01 07:51:17 +08:00			`else`
Part 2 of separating out the unicharset from the LSTM model, fixing command line for training 2017-08-03 04:29:23 +08:00			`phase_D_generate_dawg`
Fixes to training process to allow incremental training from a recognition model 2016-12-01 07:51:17 +08:00			`phase_E_extract_features "box.train" 8 "tr"`
			`phase_C_cluster_prototypes "${TRAINING_DIR}/${LANG_CODE}.normproto"`
removed reference to unbound variable 2019-01-24 02:04:16 +08:00			`phase_S_cluster_shapes`
Fixes to training process to allow incremental training from a recognition model 2016-12-01 07:51:17 +08:00			`phase_M_cluster_microfeatures`
			`phase_B_generate_ambiguities`
			`make__traineddata`
Clarify message to indicate additional LSTM training required for 4.0.0 2018-04-23 00:56:07 +08:00			`tlog "\nCompleted training for language '${LANG_CODE}'\n"`
Major updates to training system as a result of extensive testing on 100 languages 2015-05-13 09:04:31 +08:00			`fi`