Add --exposures option to tesstrain.sh

This flag can be used to specify multiple different exposure levels
for a training. There was some code already in tesstrain_utils.sh
to deal with multiple exposure levels, so it looks like this
functionality was always intended.

The default usage does not change, with exposure level 0 being the
only one used if --exposures is not used.
This commit is contained in:
Nick White 2015-09-10 14:57:17 +01:00 committed by Zdenko Podobný
parent 45590ba1c1
commit 143ef735a4
2 changed files with 41 additions and 29 deletions

View File

@ -24,6 +24,7 @@
# --output_dir OUTPUTDIR # Location of output traineddata file.
# --overwrite # Safe to overwrite files in output_dir.
# --run_shape_clustering # Run shape clustering (use for Indic langs).
# --exposures EXPOSURES # A list of exposure levels to use (e.g. "-1 0 1").
#
# OPTIONAL flags for input data. If unspecified we will look for them in
# the langdata_dir directory.

View File

@ -26,6 +26,7 @@ OVERWRITE=0
RUN_SHAPE_CLUSTERING=0
EXTRACT_FONT_PROPERTIES=1
WORKSPACE_DIR="/tmp/tesstrain"
EXPOSURES=0
# Logging helper functions.
tlog() {
@ -98,6 +99,16 @@ parse_flags() {
FONTS=( ${ARGV[$j]} )
IFS=$ofs
i=$j ;;
--exposures)
exp=""
while test $j -lt ${#ARGV[@]}; do
test -z ${ARGV[$j]} && break
test `echo ${ARGV[$j]} | cut -c -2` = "--" && break
exp="$exp ${ARGV[$j]}"
j=$((j+1))
done
parse_value "EXPOSURES" "$exp"
i=$((j-1)) ;;
--fonts_dir)
parse_value "FONTS_DIR" ${ARGV[$j]}
i=$j ;;
@ -226,8 +237,8 @@ phase_I_generate_image() {
err_exit "Could not find training text file ${TRAINING_TEXT}"
fi
CHAR_SPACING="0.0"
EXPOSURE="0"
for EXPOSURE in $EXPOSURES; do
if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then
# Parse .bigram_freqs file and compose a .train_ngrams file with text
# for tesseract to recognize during training. Take only the ngrams whose
@ -256,6 +267,7 @@ phase_I_generate_image() {
local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
check_file_readable ${outbase}.box ${outbase}.tif
done
done
}
# Phase UP : Generate (U)nicharset and (P)roperties file.
@ -359,10 +371,9 @@ phase_E_extract_features() {
par_factor=1
fi
tlog "\n=== Phase E: Extracting features ==="
TRAIN_EXPOSURES='0'
local img_files=""
for exposure in ${TRAIN_EXPOSURES}; do
for exposure in ${EXPOSURES}; do
img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif)
done