mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-21 17:13:09 +08:00
Merge pull request #3327 from Shreeshrii/patch-1
Create unicharset from training text to avoid normalization errors
This commit is contained in:
commit
2f4f7446b6
@@ -353,8 +353,13 @@ phase_UP_generate_unicharset() {
|
||||
|
||||
local box_files=$(ls ${TRAINING_DIR}/*.box)
|
||||
UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset"
|
||||
run_command unicharset_extractor --output_unicharset "${UNICHARSET_FILE}" \
|
||||
--norm_mode "${NORM_MODE}" ${box_files}
|
||||
if [[ "${NORM_MODE}" == "2" ]] && [[ "${LANG_IS_RTL}" == "0" ]] ; then
|
||||
run_command unicharset_extractor --output_unicharset "${UNICHARSET_FILE}" \
|
||||
--norm_mode "${NORM_MODE}" ${TRAINING_TEXT}
|
||||
else
|
||||
run_command unicharset_extractor --output_unicharset "${UNICHARSET_FILE}" \
|
||||
--norm_mode "${NORM_MODE}" ${box_files}
|
||||
fi
|
||||
check_file_readable ${UNICHARSET_FILE}
|
||||
|
||||
XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights"
|
||||
|
Loading…
Reference in New Issue
Block a user