mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-11 12:43:17 +08:00
Add --exposures option to tesstrain.sh
This flag can be used to specify multiple exposure levels for a training run. tesstrain_utils.sh already contained some code for handling multiple exposure levels, so this functionality appears to have been intended from the start. Default usage is unchanged: when --exposures is not given, exposure level 0 is the only one used.
This commit is contained in:
parent
45590ba1c1
commit
143ef735a4
@ -24,6 +24,7 @@
|
|||||||
# --output_dir OUTPUTDIR # Location of output traineddata file.
|
# --output_dir OUTPUTDIR # Location of output traineddata file.
|
||||||
# --overwrite # Safe to overwrite files in output_dir.
|
# --overwrite # Safe to overwrite files in output_dir.
|
||||||
# --run_shape_clustering # Run shape clustering (use for Indic langs).
|
# --run_shape_clustering # Run shape clustering (use for Indic langs).
|
||||||
|
# --exposures EXPOSURES # A list of exposure levels to use (e.g. "-1 0 1").
|
||||||
#
|
#
|
||||||
# OPTIONAL flags for input data. If unspecified we will look for them in
|
# OPTIONAL flags for input data. If unspecified we will look for them in
|
||||||
# the langdata_dir directory.
|
# the langdata_dir directory.
|
||||||
|
@ -26,6 +26,7 @@ OVERWRITE=0
|
|||||||
RUN_SHAPE_CLUSTERING=0
|
RUN_SHAPE_CLUSTERING=0
|
||||||
EXTRACT_FONT_PROPERTIES=1
|
EXTRACT_FONT_PROPERTIES=1
|
||||||
WORKSPACE_DIR="/tmp/tesstrain"
|
WORKSPACE_DIR="/tmp/tesstrain"
|
||||||
|
EXPOSURES=0
|
||||||
|
|
||||||
# Logging helper functions.
|
# Logging helper functions.
|
||||||
tlog() {
|
tlog() {
|
||||||
@ -98,6 +99,16 @@ parse_flags() {
|
|||||||
FONTS=( ${ARGV[$j]} )
|
FONTS=( ${ARGV[$j]} )
|
||||||
IFS=$ofs
|
IFS=$ofs
|
||||||
i=$j ;;
|
i=$j ;;
|
||||||
|
--exposures)
|
||||||
|
exp=""
|
||||||
|
while test $j -lt ${#ARGV[@]}; do
|
||||||
|
test -z ${ARGV[$j]} && break
|
||||||
|
test `echo ${ARGV[$j]} | cut -c -2` = "--" && break
|
||||||
|
exp="$exp ${ARGV[$j]}"
|
||||||
|
j=$((j+1))
|
||||||
|
done
|
||||||
|
parse_value "EXPOSURES" "$exp"
|
||||||
|
i=$((j-1)) ;;
|
||||||
--fonts_dir)
|
--fonts_dir)
|
||||||
parse_value "FONTS_DIR" ${ARGV[$j]}
|
parse_value "FONTS_DIR" ${ARGV[$j]}
|
||||||
i=$j ;;
|
i=$j ;;
|
||||||
@ -226,35 +237,36 @@ phase_I_generate_image() {
|
|||||||
err_exit "Could not find training text file ${TRAINING_TEXT}"
|
err_exit "Could not find training text file ${TRAINING_TEXT}"
|
||||||
fi
|
fi
|
||||||
CHAR_SPACING="0.0"
|
CHAR_SPACING="0.0"
|
||||||
EXPOSURE="0"
|
|
||||||
|
|
||||||
if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then
|
for EXPOSURE in $EXPOSURES; do
|
||||||
# Parse .bigram_freqs file and compose a .train_ngrams file with text
|
if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then
|
||||||
# for tesseract to recognize during training. Take only the ngrams whose
|
# Parse .bigram_freqs file and compose a .train_ngrams file with text
|
||||||
# combined weight accounts for 95% of all the bigrams in the language.
|
# for tesseract to recognize during training. Take only the ngrams whose
|
||||||
NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \
|
# combined weight accounts for 95% of all the bigrams in the language.
|
||||||
| awk '{s=s+$2}; END {print (s/100)*p}' p=99)
|
NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \
|
||||||
cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \
|
| awk '{s=s+$2}; END {print (s/100)*p}' p=99)
|
||||||
| awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \
|
cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \
|
||||||
x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE}
|
| awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \
|
||||||
check_file_readable ${TRAIN_NGRAMS_FILE}
|
x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE}
|
||||||
fi
|
check_file_readable ${TRAIN_NGRAMS_FILE}
|
||||||
|
|
||||||
local counter=0
|
|
||||||
for font in "${FONTS[@]}"; do
|
|
||||||
generate_font_image "${font}" &
|
|
||||||
let counter=counter+1
|
|
||||||
let rem=counter%par_factor
|
|
||||||
if [[ "${rem}" -eq 0 ]]; then
|
|
||||||
wait
|
|
||||||
fi
|
fi
|
||||||
done
|
|
||||||
wait
|
local counter=0
|
||||||
# Check that each process was successful.
|
for font in "${FONTS[@]}"; do
|
||||||
for font in "${FONTS[@]}"; do
|
generate_font_image "${font}" &
|
||||||
local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
|
let counter=counter+1
|
||||||
local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
|
let rem=counter%par_factor
|
||||||
check_file_readable ${outbase}.box ${outbase}.tif
|
if [[ "${rem}" -eq 0 ]]; then
|
||||||
|
wait
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
wait
|
||||||
|
# Check that each process was successful.
|
||||||
|
for font in "${FONTS[@]}"; do
|
||||||
|
local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
|
||||||
|
local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
|
||||||
|
check_file_readable ${outbase}.box ${outbase}.tif
|
||||||
|
done
|
||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -359,10 +371,9 @@ phase_E_extract_features() {
|
|||||||
par_factor=1
|
par_factor=1
|
||||||
fi
|
fi
|
||||||
tlog "\n=== Phase E: Extracting features ==="
|
tlog "\n=== Phase E: Extracting features ==="
|
||||||
TRAIN_EXPOSURES='0'
|
|
||||||
|
|
||||||
local img_files=""
|
local img_files=""
|
||||||
for exposure in ${TRAIN_EXPOSURES}; do
|
for exposure in ${EXPOSURES}; do
|
||||||
img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif)
|
img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif)
|
||||||
done
|
done
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user