From 832c6edb971ab427318ad64cd49b6eb87d04bdbc Mon Sep 17 00:00:00 2001 From: Shree Date: Fri, 14 Jun 2019 09:25:54 +0000 Subject: [PATCH 1/3] Allow saving of box/tiff pairs during base tesseract training --- src/training/tesstrain_utils.sh | 36 ++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/src/training/tesstrain_utils.sh b/src/training/tesstrain_utils.sh index 9b0efb6b..e053b902 100644 --- a/src/training/tesstrain_utils.sh +++ b/src/training/tesstrain_utils.sh @@ -259,7 +259,7 @@ generate_font_image() { common_args+=" --char_spacing=${CHAR_SPACING} --exposure=${EXPOSURE}" common_args+=" --outputbase=${outbase} --max_pages=${MAX_PAGES}" if $DISTORT_IMAGE; then - common_args+=" --distort_image " + common_args+=" --distort_image --invert=false" fi # add --writing_mode=vertical-upright to common_args if the font is @@ -326,6 +326,17 @@ phase_I_generate_image() { check_file_readable ${outbase}.box ${outbase}.tif done done + if $SAVE_BOX_TIFF && ( ! $LINEDATA ) ; then + tlog "\n=== Saving box/tiff pairs for training data ===" + for f in "${TRAINING_DIR}/${LANG_CODE}".*.box; do + tlog "Moving ${f} to ${OUTPUT_DIR}" + cp "${f}" "${OUTPUT_DIR}" + done + for f in "${TRAINING_DIR}/${LANG_CODE}".*.tif; do + tlog "Moving ${f} to ${OUTPUT_DIR}" + cp "${f}" "${OUTPUT_DIR}" + done + fi } # Phase UP : Generate (U)nicharset and (P)roperties file. @@ -386,7 +397,7 @@ phase_D_generate_dawg() { # Punctuation DAWG # -r arguments to wordlist2dawg denote RTL reverse policy - # (see Trie::RTLReversePolicy enum in tesseract/src/dict/trie.h). + # (see Trie::RTLReversePolicy enum in third_party/tesseract/dict/trie.h). # We specify 0/RRP_DO_NO_REVERSE when generating number DAWG, # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS, # 2/RRP_FORCE_REVERSE for the punctuation DAWG. @@ -562,17 +573,18 @@ make__lstmdata() { --output_dir "${OUTPUT_DIR}" --lang "${LANG_CODE}" \ "${pass_through}" "${lang_is_rtl}" - if $SAVE_BOX_TIFF; then + if $SAVE_BOX_TIFF ; then tlog "\n=== Saving box/tiff pairs for training data ===" - for f in "${TRAINING_DIR}/${LANG_CODE}".*.box; do - tlog "Moving ${f} to ${OUTPUT_DIR}" - mv "${f}" "${OUTPUT_DIR}" - done - for f in "${TRAINING_DIR}/${LANG_CODE}".*.tif; do - tlog "Moving ${f} to ${OUTPUT_DIR}" - mv "${f}" "${OUTPUT_DIR}" - done - fi + for f in "${TRAINING_DIR}/${LANG_CODE}".*.box; do + tlog "Moving ${f} to ${OUTPUT_DIR}" + mv "${f}" "${OUTPUT_DIR}" + done + for f in "${TRAINING_DIR}/${LANG_CODE}".*.tif; do + tlog "Moving ${f} to ${OUTPUT_DIR}" + mv "${f}" "${OUTPUT_DIR}" + done + fi + tlog "\n=== Moving lstmf files for training data ===" for f in "${TRAINING_DIR}/${LANG_CODE}".*.lstmf; do tlog "Moving ${f} to ${OUTPUT_DIR}" From 45cdf741ae9347f1b345b55419780b3705dd5be5 Mon Sep 17 00:00:00 2001 From: Shree Date: Fri, 14 Jun 2019 09:32:41 +0000 Subject: [PATCH 2/3] Allow saving of box/tiff pairs during base tesseract training --- src/training/tesstrain_utils.sh | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/training/tesstrain_utils.sh b/src/training/tesstrain_utils.sh index e053b902..51c370e0 100644 --- a/src/training/tesstrain_utils.sh +++ b/src/training/tesstrain_utils.sh @@ -397,7 +397,7 @@ phase_D_generate_dawg() { # Punctuation DAWG # -r arguments to wordlist2dawg denote RTL reverse policy - # (see Trie::RTLReversePolicy enum in third_party/tesseract/dict/trie.h). + # (see Trie::RTLReversePolicy enum in tesseract/src/dict/trie.h). # We specify 0/RRP_DO_NO_REVERSE when generating number DAWG, # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS, # 2/RRP_FORCE_REVERSE for the punctuation DAWG. @@ -573,17 +573,18 @@ make__lstmdata() { --output_dir "${OUTPUT_DIR}" --lang "${LANG_CODE}" \ "${pass_through}" "${lang_is_rtl}" - if $SAVE_BOX_TIFF ; then + if $SAVE_BOX_TIFF; then tlog "\n=== Saving box/tiff pairs for training data ===" - for f in "${TRAINING_DIR}/${LANG_CODE}".*.box; do - tlog "Moving ${f} to ${OUTPUT_DIR}" - mv "${f}" "${OUTPUT_DIR}" - done - for f in "${TRAINING_DIR}/${LANG_CODE}".*.tif; do - tlog "Moving ${f} to ${OUTPUT_DIR}" - mv "${f}" "${OUTPUT_DIR}" - done - fi + for f in "${TRAINING_DIR}/${LANG_CODE}".*.box; do + tlog "Moving ${f} to ${OUTPUT_DIR}" + mv "${f}" "${OUTPUT_DIR}" + done + for f in "${TRAINING_DIR}/${LANG_CODE}".*.tif; do + tlog "Moving ${f} to ${OUTPUT_DIR}" + mv "${f}" "${OUTPUT_DIR}" + done + fi + done tlog "\n=== Moving lstmf files for training data ===" for f in "${TRAINING_DIR}/${LANG_CODE}".*.lstmf; do From 6fa458794979d6b170642852e78a62844ec6169e Mon Sep 17 00:00:00 2001 From: Shree Date: Fri, 14 Jun 2019 09:35:39 +0000 Subject: [PATCH 3/3] Allow saving of box/tiff pairs during base tesseract training --- src/training/tesstrain_utils.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/src/training/tesstrain_utils.sh b/src/training/tesstrain_utils.sh index 51c370e0..ef8457e2 100644 --- a/src/training/tesstrain_utils.sh +++ b/src/training/tesstrain_utils.sh @@ -584,7 +584,6 @@ make__lstmdata() { mv "${f}" "${OUTPUT_DIR}" done fi - done tlog "\n=== Moving lstmf files for training data ===" for f in "${TRAINING_DIR}/${LANG_CODE}".*.lstmf; do