Add --exposures option to tesstrain.sh

This flag can be used to specify multiple exposure levels for a
training run. tesstrain_utils.sh already contained some code to
deal with multiple exposure levels, so it looks like this
functionality was always intended.

The default usage does not change: if --exposures is not given,
exposure level 0 is the only one used.
This commit is contained in:
Nick White 2015-09-10 14:57:17 +01:00 committed by Zdenko Podobný
parent 45590ba1c1
commit 143ef735a4
2 changed files with 41 additions and 29 deletions

View File

@ -24,6 +24,7 @@
# --output_dir OUTPUTDIR # Location of output traineddata file. # --output_dir OUTPUTDIR # Location of output traineddata file.
# --overwrite # Safe to overwrite files in output_dir. # --overwrite # Safe to overwrite files in output_dir.
# --run_shape_clustering # Run shape clustering (use for Indic langs). # --run_shape_clustering # Run shape clustering (use for Indic langs).
# --exposures EXPOSURES # A list of exposure levels to use (e.g. "-1 0 1").
# #
# OPTIONAL flags for input data. If unspecified we will look for them in # OPTIONAL flags for input data. If unspecified we will look for them in
# the langdata_dir directory. # the langdata_dir directory.

View File

@ -26,6 +26,7 @@ OVERWRITE=0
RUN_SHAPE_CLUSTERING=0 RUN_SHAPE_CLUSTERING=0
EXTRACT_FONT_PROPERTIES=1 EXTRACT_FONT_PROPERTIES=1
WORKSPACE_DIR="/tmp/tesstrain" WORKSPACE_DIR="/tmp/tesstrain"
EXPOSURES=0
# Logging helper functions. # Logging helper functions.
tlog() { tlog() {
@ -98,6 +99,16 @@ parse_flags() {
FONTS=( ${ARGV[$j]} ) FONTS=( ${ARGV[$j]} )
IFS=$ofs IFS=$ofs
i=$j ;; i=$j ;;
--exposures)
exp=""
while test $j -lt ${#ARGV[@]}; do
test -z ${ARGV[$j]} && break
test `echo ${ARGV[$j]} | cut -c -2` = "--" && break
exp="$exp ${ARGV[$j]}"
j=$((j+1))
done
parse_value "EXPOSURES" "$exp"
i=$((j-1)) ;;
--fonts_dir) --fonts_dir)
parse_value "FONTS_DIR" ${ARGV[$j]} parse_value "FONTS_DIR" ${ARGV[$j]}
i=$j ;; i=$j ;;
@ -226,35 +237,36 @@ phase_I_generate_image() {
err_exit "Could not find training text file ${TRAINING_TEXT}" err_exit "Could not find training text file ${TRAINING_TEXT}"
fi fi
CHAR_SPACING="0.0" CHAR_SPACING="0.0"
EXPOSURE="0"
if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then for EXPOSURE in $EXPOSURES; do
# Parse .bigram_freqs file and compose a .train_ngrams file with text if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then
# for tesseract to recognize during training. Take only the ngrams whose # Parse .bigram_freqs file and compose a .train_ngrams file with text
# combined weight accounts for 95% of all the bigrams in the language. # for tesseract to recognize during training. Take only the ngrams whose
NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \ # combined weight accounts for 95% of all the bigrams in the language.
| awk '{s=s+$2}; END {print (s/100)*p}' p=99) NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \
cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \ | awk '{s=s+$2}; END {print (s/100)*p}' p=99)
| awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \ cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \
x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE} | awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \
check_file_readable ${TRAIN_NGRAMS_FILE} x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE}
fi check_file_readable ${TRAIN_NGRAMS_FILE}
local counter=0
for font in "${FONTS[@]}"; do
generate_font_image "${font}" &
let counter=counter+1
let rem=counter%par_factor
if [[ "${rem}" -eq 0 ]]; then
wait
fi fi
done
wait local counter=0
# Check that each process was successful. for font in "${FONTS[@]}"; do
for font in "${FONTS[@]}"; do generate_font_image "${font}" &
local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g') let counter=counter+1
local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE} let rem=counter%par_factor
check_file_readable ${outbase}.box ${outbase}.tif if [[ "${rem}" -eq 0 ]]; then
wait
fi
done
wait
# Check that each process was successful.
for font in "${FONTS[@]}"; do
local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
check_file_readable ${outbase}.box ${outbase}.tif
done
done done
} }
@ -359,10 +371,9 @@ phase_E_extract_features() {
par_factor=1 par_factor=1
fi fi
tlog "\n=== Phase E: Extracting features ===" tlog "\n=== Phase E: Extracting features ==="
TRAIN_EXPOSURES='0'
local img_files="" local img_files=""
for exposure in ${TRAIN_EXPOSURES}; do for exposure in ${EXPOSURES}; do
img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif) img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif)
done done