2014-08-12 07:20:56 +08:00
|
|
|
#!/bin/bash
|
|
|
|
# (C) Copyright 2014, Google Inc.
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
#
|
|
|
|
# This script provides an easy way to execute various phases of training
|
|
|
|
# Tesseract. For a detailed description of the phases, see
|
2015-07-11 15:43:31 +08:00
|
|
|
# https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract
|
2014-08-12 07:20:56 +08:00
|
|
|
#
|
|
|
|
|
2018-10-02 17:35:10 +08:00
|
|
|
#######################################
# Print the command-line help text for tesstrain.sh.
# Globals:   none (the ${...} tokens in the text are literal placeholders)
# Arguments: none
# Outputs:   the usage text on stdout
#######################################
display_usage() {
  # A quoted heredoc keeps the text literal: the embedded double quotes
  # around "-1 0 1" are printed as written, and the
  # ${LANG_CODE}.${fontname}.exp${EXPOSURE} naming pattern is shown verbatim
  # instead of being expanded.
  #
  # NOTE(review): this text previously contained an unresolved merge conflict
  # ('--x_size' on HEAD vs '--xsize' plus the --my_boxtiff_dir section on the
  # incoming side). It is resolved in favour of the incoming side because the
  # script body uses MY_BOXTIFF_DIR; confirm the '--xsize' spelling against
  # the flag parser before relying on it.
  cat <<'EOF'
USAGE: tesstrain.sh
  --exposures EXPOSURES      # A list of exposure levels to use (e.g. "-1 0 1").
  --fontlist FONTS           # A list of fontnames to train on.
  --fonts_dir FONTS_PATH     # Path to font files.
  --lang LANG_CODE           # ISO 639 code.
  --langdata_dir DATADIR     # Path to tesseract/training/langdata directory.
  --linedata_only            # Only generate training data for lstmtraining.
  --output_dir OUTPUTDIR     # Location of output traineddata file.
  --overwrite                # Safe to overwrite files in output_dir.
  --run_shape_clustering     # Run shape clustering (use for Indic langs).
  --maxpages                 # Specify maximum pages to output (default:0=all)
  --save_box_tiff            # Save box/tiff pairs along with lstmf files.
  --xsize                    # Specify width of output image (default:3600)

OPTIONAL flag for specifying directory with user specified box/tiff pairs.
Files should be named similar to ${LANG_CODE}.${fontname}.exp${EXPOSURE}.box/tif
  --my_boxtiff_dir MY_BOXTIFF_DIR # Location of user specified box/tiff files.

OPTIONAL flags for input data. If unspecified we will look for them in
the langdata_dir directory.
  --training_text TEXTFILE   # Text to render and use for training.
  --wordlist WORDFILE        # Word list for the language ordered by
                             # decreasing frequency.

OPTIONAL flag to specify location of existing traineddata files, required
during feature extraction. If unspecified will use TESSDATA_PREFIX defined in
the current environment.
  --tessdata_dir TESSDATADIR # Path to tesseract/tessdata directory.

NOTE:
The font names specified in --fontlist need to be recognizable by Pango using
fontconfig. An easy way to list the canonical names of all fonts available on
your system is to run text2image with --list_available_fonts and the
appropriate --fonts_dir path.
EOF
}
|
2014-08-12 07:20:56 +08:00
|
|
|
|
2016-12-14 21:11:24 +08:00
|
|
|
# Pull in tesstrain_utils.sh from the directory this script lives in.
# "$0" is quoted so an install path containing spaces still resolves.
# (Presumably this file provides parse_flags, tlog and the phase_* helpers
# used below — confirm in tesstrain_utils.sh.)
source "$(dirname "$0")/tesstrain_utils.sh"

# An explicit request for help is a successful run.
# ${1:-} keeps the test safe when no arguments were passed.
if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then
  display_usage
  exit 0
fi

# Running with no arguments at all is a usage error. Previously the help
# check above also matched "$# -eq 0" and exited 0, which made this branch
# unreachable and reported success for an invalid invocation.
if (( $# == 0 )); then
  display_usage
  exit 1
fi
|
2014-08-12 07:20:56 +08:00
|
|
|
|
|
|
|
# Snapshot the command-line arguments into ARGV and run the flag parser.
# (parse_flags is defined outside this file; presumably it reads ARGV and
# sets TRAINING_DIR, MY_BOXTIFF_DIR, LANG_CODE, etc. — confirm there.)
ARGV=("$@")
parse_flags

# Quoted so that a training directory containing spaces works.
mkdir -p "${TRAINING_DIR}"

# Optionally seed the training directory with user-supplied box/tiff pairs.
if [[ -n "${MY_BOXTIFF_DIR:-}" ]]; then
  tlog "\n=== Copy existing box/tiff pairs from '${MY_BOXTIFF_DIR}'"
  # Best effort: a missing *.box or *.tif set must not abort the run, so a
  # cp failure is deliberately ignored. '|| true' states that intent; the
  # previous '| true' piped cp's stdout into 'true' instead.
  cp -- "${MY_BOXTIFF_DIR}"/*.box "${TRAINING_DIR}" || true
  cp -- "${MY_BOXTIFF_DIR}"/*.tif "${TRAINING_DIR}" || true
  # Show what ended up in the training directory.
  ls -l "${TRAINING_DIR}"
fi
|
|
|
|
|
2015-09-10 22:05:07 +08:00
|
|
|
tlog "\n=== Starting training for language '${LANG_CODE}'"

# Load the per-language settings from language-specific.sh next to this
# script ("$0" quoted so paths with spaces resolve), then apply them for
# the requested language.
source "$(dirname "$0")/language-specific.sh"
set_lang_specific_parameters "${LANG_CODE}"

initialize_fontconfig

# Phases common to both training modes.
# NOTE(review): the literal 8 passed to several phases is presumably a
# parallelism level — confirm against the phase_* definitions.
phase_I_generate_image 8
phase_UP_generate_unicharset

if ((LINEDATA)); then
  # LINEDATA is non-zero: only produce line data (lstmf) and a starter
  # traineddata for a subsequent lstmtraining run.
  phase_E_extract_features " --psm 6 lstm.train " 8 "lstmf"
  make__lstmdata
  tlog "\nCreated starter traineddata for LSTM training of language '${LANG_CODE}'\n"
  tlog "\nRun 'lstmtraining' command next to continue LSTM training for language '${LANG_CODE}'\n"
else
  # Full pipeline: dawgs, feature extraction, the clustering phases,
  # ambiguities, and the final traineddata.
  phase_D_generate_dawg
  phase_E_extract_features "box.train" 8 "tr"
  phase_C_cluster_prototypes "${TRAINING_DIR}/${LANG_CODE}.normproto"
  phase_S_cluster_shapes
  phase_M_cluster_microfeatures
  phase_B_generate_ambiguities
  make__traineddata
  tlog "\nCompleted training for language '${LANG_CODE}'\n"
fi
|