/****************************************************************** * File: docqual.cpp (Formerly docqual.c) * Description: Document Quality Metrics * Author: Phil Cheatle * Created: Mon May 9 11:27:28 BST 1994 * * (C) Copyright 1994, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at ** http://www.apache.org/licenses/LICENSE-2.0 ** Unless required by applicable law or agreed to in writing, software ** distributed under the License is distributed on an "AS IS" BASIS, ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** See the License for the specific language governing permissions and ** limitations under the License. * **********************************************************************/ #ifdef _MSC_VER #pragma warning(disable:4244) // Conversion warnings #endif #include "mfcpch.h" #include #include "docqual.h" #include "tstruct.h" #include "tfacep.h" #include "reject.h" #include "tessvars.h" #include "genblob.h" #include "secname.h" #include "globals.h" #include "tesseractclass.h" #define EXTERN EXTERN STRING_VAR (outlines_odd, "%| ", "Non standard number of outlines"); EXTERN STRING_VAR (outlines_2, "ij!?%\":;", "Non standard number of outlines"); EXTERN BOOL_VAR (docqual_excuse_outline_errs, FALSE, "Allow outline errs in unrejection?"); EXTERN BOOL_VAR (tessedit_good_quality_unrej, TRUE, "Reduce rejection on good docs"); EXTERN BOOL_VAR (tessedit_use_reject_spaces, TRUE, "Reject spaces?"); EXTERN double_VAR (tessedit_reject_doc_percent, 65.00, "%rej allowed before rej whole doc"); EXTERN double_VAR (tessedit_reject_block_percent, 45.00, "%rej allowed before rej whole block"); EXTERN double_VAR (tessedit_reject_row_percent, 40.00, "%rej allowed before rej whole row"); EXTERN double_VAR (tessedit_whole_wd_rej_row_percent, 70.00, "%of row rejects in whole word rejects which prevents whole row rejection"); EXTERN BOOL_VAR (tessedit_preserve_blk_rej_perfect_wds, TRUE, "Only rej partially rejected words in block rejection"); EXTERN BOOL_VAR (tessedit_preserve_row_rej_perfect_wds, TRUE, "Only rej partially rejected words in row rejection"); EXTERN BOOL_VAR (tessedit_dont_blkrej_good_wds, FALSE, "Use word segmentation quality metric"); EXTERN BOOL_VAR (tessedit_dont_rowrej_good_wds, FALSE, "Use word segmentation quality metric"); EXTERN INT_VAR (tessedit_preserve_min_wd_len, 2, "Only preserve wds longer than this"); EXTERN BOOL_VAR (tessedit_row_rej_good_docs, TRUE, "Apply row rejection to good docs"); EXTERN double_VAR (tessedit_good_doc_still_rowrej_wd, 1.1, "rej good doc wd if more than this fraction rejected"); EXTERN BOOL_VAR (tessedit_reject_bad_qual_wds, TRUE, "Reject all bad quality wds"); EXTERN BOOL_VAR (tessedit_debug_doc_rejection, FALSE, "Page stats"); EXTERN BOOL_VAR (tessedit_debug_quality_metrics, FALSE, "Output data to debug file"); EXTERN BOOL_VAR (bland_unrej, FALSE, "unrej potential with no checks"); EXTERN double_VAR (quality_rowrej_pc, 1.1, "good_quality_doc gte good char limit"); EXTERN BOOL_VAR (unlv_tilde_crunching, TRUE, "Mark v.bad words for tilde crunch"); EXTERN BOOL_VAR (crunch_early_merge_tess_fails, TRUE, "Before word crunch?"); EXTERN BOOL_EVAR (crunch_early_convert_bad_unlv_chs, FALSE, "Take out ~^ early?"); EXTERN double_VAR (crunch_terrible_rating, 80.0, "crunch rating lt this"); EXTERN BOOL_VAR (crunch_terrible_garbage, TRUE, "As it says"); EXTERN double_VAR (crunch_poor_garbage_cert, -9.0, "crunch garbage cert lt this"); EXTERN double_VAR (crunch_poor_garbage_rate, 60, "crunch garbage rating lt this"); EXTERN double_VAR (crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this"); EXTERN double_VAR (crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this"); EXTERN BOOL_VAR (crunch_pot_garbage, TRUE, "POTENTIAL crunch garbage"); EXTERN double_VAR (crunch_del_rating, 60, "POTENTIAL crunch rating lt this"); EXTERN double_VAR (crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this"); EXTERN double_VAR (crunch_del_min_ht, 0.7, "Del if word ht lt xht x this"); EXTERN double_VAR (crunch_del_max_ht, 3.0, "Del if word ht gt xht x this"); EXTERN double_VAR (crunch_del_min_width, 3.0, "Del if word width lt xht x this"); EXTERN double_VAR (crunch_del_high_word, 1.5, "Del if word gt xht x this above bl"); EXTERN double_VAR (crunch_del_low_word, 0.5, "Del if word gt xht x this below bl"); EXTERN double_VAR (crunch_small_outlines_size, 0.6, "Small if lt xht x this"); EXTERN INT_VAR (crunch_rating_max, 10, "For adj length in rating per ch"); EXTERN INT_VAR (crunch_pot_indicators, 1, "How many potential indicators needed"); EXTERN BOOL_VAR (crunch_leave_ok_strings, TRUE, "Dont touch sensible strings"); EXTERN BOOL_VAR (crunch_accept_ok, TRUE, "Use acceptability in okstring"); EXTERN BOOL_VAR (crunch_leave_accept_strings, FALSE, "Dont pot crunch sensible strings"); EXTERN BOOL_VAR (crunch_include_numerals, FALSE, "Fiddle alpha figures"); EXTERN INT_VAR (crunch_leave_lc_strings, 4, "Dont crunch words with long lower case strings"); EXTERN INT_VAR (crunch_leave_uc_strings, 4, "Dont crunch words with long lower case strings"); EXTERN INT_VAR (crunch_long_repetitions, 3, "Crunch words with long repetitions"); EXTERN INT_VAR (crunch_debug, 0, "As it says"); static BOOL8 crude_match_blobs(PBLOB *blob1, PBLOB *blob2); static void unrej_good_chs(WERD_RES *word, ROW *row); /************************************************************************* * word_blob_quality() * How many blobs in the outword are identical to those of the inword? * ASSUME blobs in both initial word and outword are in ascending order of * left hand blob edge. *************************************************************************/ inT16 word_blob_quality( //Blob seg changes WERD_RES *word, ROW *row) { WERD *bln_word; //BL norm init word TWERD *tessword; //tess format WERD *init_word; //BL norm init word PBLOB_IT outword_it; PBLOB_IT initial_it; inT16 i; inT16 init_blobs_left; inT16 match_count = 0; BOOL8 matched; TBOX out_box; PBLOB *test_blob; DENORM denorm; float bln_xht; if (word->word->gblob_list ()->empty ()) return 0; //xht used for blnorm bln_xht = bln_x_height / word->denorm.scale (); bln_word = make_bln_copy(word->word, row, NULL, bln_xht, &denorm); /* NOTE: Need to convert to tess format and back again to ensure that the same float -> int rounding of coords is done to source wd as out wd before comparison */ tessword = make_tess_word(bln_word, NULL); // Convert word. init_word = make_ed_word(tessword, bln_word); delete bln_word; delete_word(tessword); if (init_word == NULL) { // Conversion failed. return 0; } initial_it.set_to_list(init_word->blob_list()); init_blobs_left = initial_it.length(); outword_it.set_to_list(word->outword->blob_list()); for (outword_it.mark_cycle_pt(); !outword_it.cycled_list(); outword_it.forward()) { out_box = outword_it.data()->bounding_box(); // Skip any initial blobs LEFT of current outword blob. while (!initial_it.at_last() && (initial_it.data()->bounding_box().left() < out_box.left())) { initial_it.forward(); init_blobs_left--; } /* See if current outword blob matches any initial blob with the same left coord. (Normally only one but possibly more - in unknown order) */ i = 0; matched = FALSE; do { test_blob = initial_it.data_relative (i++); matched = crude_match_blobs (test_blob, outword_it.data ()); if (matched) match_count++; } while (!matched && (init_blobs_left - i > 0) && (i < 129) && !initial_it.at_last() && test_blob->bounding_box().left() == out_box.left()); } delete init_word; return match_count; } /************************************************************************* * crude_match_blobs() * Check bounding boxes are the same and the number of outlines are the same. *************************************************************************/ static BOOL8 crude_match_blobs(PBLOB *blob1, PBLOB *blob2) { TBOX box1 = blob1->bounding_box(); TBOX box2 = blob2->bounding_box(); if (box1.contains(box2) && box2.contains(box1) && (blob1->out_list()->length() == blob1->out_list()->length())) return TRUE; else return FALSE; } inT16 word_outline_errs(WERD_RES *word) { PBLOB_IT outword_it; inT16 i = 0; inT16 err_count = 0; outword_it.set_to_list(word->outword->blob_list()); for (outword_it.mark_cycle_pt(); !outword_it.cycled_list(); outword_it.forward()) { err_count += count_outline_errs(word->best_choice->unichar_string()[i], outword_it.data()->out_list()->length()); i++; } return err_count; } /************************************************************************* * word_char_quality() * Combination of blob quality and outline quality - how many good chars are * there? - I.e chars which pass the blob AND outline tests. *************************************************************************/ void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count) { WERD *bln_word; // BL norm init word TWERD *tessword; // tess format WERD *init_word; // BL norm init word PBLOB_IT outword_it; PBLOB_IT initial_it; inT16 i; inT16 init_blobs_left; BOOL8 matched; TBOX out_box; PBLOB *test_blob; DENORM denorm; float bln_xht; inT16 j = 0; *match_count = 0; *accepted_match_count = 0; if (word->word->gblob_list ()->empty ()) return; // xht used for blnorm bln_xht = bln_x_height / word->denorm.scale(); bln_word = make_bln_copy(word->word, row, NULL, bln_xht, &denorm); /* NOTE: Need to convert to tess format and back again to ensure that the same float -> int rounding of coords is done to source wd as out wd before comparison */ tessword = make_tess_word(bln_word, NULL); // Convert word. init_word = make_ed_word(tessword, bln_word); delete bln_word; delete_word(tessword); if (init_word == NULL) return; initial_it.set_to_list(init_word->blob_list()); init_blobs_left = initial_it.length(); outword_it.set_to_list(word->outword->blob_list()); for (outword_it.mark_cycle_pt(); !outword_it.cycled_list(); outword_it.forward()) { out_box = outword_it.data()->bounding_box(); /* Skip any initial blobs LEFT of current outword blob */ while (!initial_it.at_last() && (initial_it.data()->bounding_box().left() < out_box.left())) { initial_it.forward(); init_blobs_left--; } /* See if current outword blob matches any initial blob with the same left coord. (Normally only one but possibly more - in unknown order) */ i = 0; matched = FALSE; do { test_blob = initial_it.data_relative(i++); matched = crude_match_blobs(test_blob, outword_it.data()); if (matched && (count_outline_errs (word->best_choice->unichar_string()[j], outword_it.data ()->out_list ()->length ()) == 0)) { (*match_count)++; if (word->reject_map[j].accepted ()) (*accepted_match_count)++; } } while (!matched && (init_blobs_left - i > 0) && (i < 129) && !initial_it.at_last() && test_blob->bounding_box().left() == out_box.left()); j++; } delete init_word; } /************************************************************************* * unrej_good_chs() * Unreject POTENTIAL rejects if the blob passes the blob and outline checks *************************************************************************/ static void unrej_good_chs(WERD_RES *word, ROW *row) { WERD *bln_word; // BL norm init word TWERD *tessword; // tess format WERD *init_word; // BL norm init word PBLOB_IT outword_it; PBLOB_IT initial_it; inT16 i; inT16 init_blobs_left; BOOL8 matched; TBOX out_box; PBLOB *test_blob; DENORM denorm; float bln_xht; inT16 j = 0; if (word->word->gblob_list ()->empty ()) return; // xht used for blnorm bln_xht = bln_x_height / word->denorm.scale (); bln_word = make_bln_copy(word->word, row, NULL, bln_xht, &denorm); /* NOTE: Need to convert to tess format and back again to ensure that the same float -> int rounding of coords is done to source wd as out wd before comparison */ tessword = make_tess_word(bln_word, NULL); // Convert word init_word = make_ed_word(tessword, bln_word); delete bln_word; delete_word(tessword); if (init_word == NULL) return; initial_it.set_to_list (init_word->blob_list ()); init_blobs_left = initial_it.length (); outword_it.set_to_list (word->outword->blob_list ()); for (outword_it.mark_cycle_pt (); !outword_it.cycled_list (); outword_it.forward ()) { out_box = outword_it.data ()->bounding_box (); /* Skip any initial blobs LEFT of current outword blob */ while (!initial_it.at_last () && (initial_it.data ()->bounding_box ().left () < out_box.left ())) { initial_it.forward (); init_blobs_left--; } /* See if current outword blob matches any initial blob with the same left coord. (Normally only one but possibly more - in unknown order) */ i = 0; matched = FALSE; do { test_blob = initial_it.data_relative (i++); matched = crude_match_blobs (test_blob, outword_it.data ()); if (matched && (word->reject_map[j].accept_if_good_quality ()) && (docqual_excuse_outline_errs || (count_outline_errs (word->best_choice->unichar_string()[j], outword_it.data ()->out_list ()-> length ()) == 0))) word->reject_map[j].setrej_quality_accept (); } while (!matched && (init_blobs_left - i > 0) && (i < 129) && !initial_it.at_last () && test_blob->bounding_box ().left () == out_box.left ()); j++; } delete init_word; } void print_boxes(WERD *word) { PBLOB_IT it; TBOX box; it.set_to_list (word->blob_list ()); for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) { box = it.data ()->bounding_box (); box.print (); } } inT16 count_outline_errs(char c, inT16 outline_count) { int expected_outline_count; if (STRING (outlines_odd).contains (c)) return 0; //Dont use this char else if (STRING (outlines_2).contains (c)) expected_outline_count = 2; else expected_outline_count = 1; return abs (outline_count - expected_outline_count); } namespace tesseract { void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc) { if ((tessedit_good_quality_unrej && good_quality_doc)) unrej_good_quality_words(page_res_it); doc_and_block_rejection(page_res_it, good_quality_doc); page_res_it.restart_page (); while (page_res_it.word () != NULL) { insert_rej_cblobs(page_res_it.word()); page_res_it.forward(); } if (unlv_tilde_crunching) { tilde_crunch(page_res_it); tilde_delete(page_res_it); } } /************************************************************************* * unrej_good_quality_words() * Accept potential rejects in words which pass the following checks: * - Contains a potential reject * - Word looks like a sensible alpha word. * - Word segmentation is the same as the original image * - All characters have the expected number of outlines * NOTE - the rejection counts are recalculated after unrejection * - CANT do it in a single pass without a bit of fiddling * - keep it simple but inefficient *************************************************************************/ void Tesseract::unrej_good_quality_words( //unreject potential PAGE_RES_IT &page_res_it) { WERD_RES *word; ROW_RES *current_row; BLOCK_RES *current_block; int i; page_res_it.restart_page (); while (page_res_it.word () != NULL) { check_debug_pt (page_res_it.word (), 100); if (bland_unrej) { word = page_res_it.word (); for (i = 0; i < word->reject_map.length (); i++) { if (word->reject_map[i].accept_if_good_quality ()) word->reject_map[i].setrej_quality_accept (); } page_res_it.forward (); } else if ((page_res_it.row ()->char_count > 0) && ((page_res_it.row ()->rej_count / (float) page_res_it.row ()->char_count) <= quality_rowrej_pc)) { word = page_res_it.word (); if (word->reject_map.quality_recoverable_rejects () && (tessedit_unrej_any_wd || acceptable_word_string (word->best_choice->unichar_string().string(), word->best_choice->unichar_lengths().string()) != AC_UNACCEPTABLE)) { unrej_good_chs (word, page_res_it.row ()->row); } page_res_it.forward (); } else { /* Skip to end of dodgy row */ current_row = page_res_it.row (); while ((page_res_it.word () != NULL) && (page_res_it.row () == current_row)) page_res_it.forward (); } check_debug_pt (page_res_it.word (), 110); } page_res_it.restart_page (); page_res_it.page_res->char_count = 0; page_res_it.page_res->rej_count = 0; current_block = NULL; current_row = NULL; while (page_res_it.word () != NULL) { if (current_block != page_res_it.block ()) { current_block = page_res_it.block (); current_block->char_count = 0; current_block->rej_count = 0; } if (current_row != page_res_it.row ()) { current_row = page_res_it.row (); current_row->char_count = 0; current_row->rej_count = 0; current_row->whole_word_rej_count = 0; } page_res_it.rej_stat_word (); page_res_it.forward (); } } /************************************************************************* * doc_and_block_rejection() * * If the page has too many rejects - reject all of it. * If any block has too many rejects - reject all words in the block *************************************************************************/ void Tesseract::doc_and_block_rejection( //reject big chunks PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc) { inT16 block_no = 0; inT16 row_no = 0; BLOCK_RES *current_block; ROW_RES *current_row; BOOL8 rej_word; BOOL8 prev_word_rejected; inT16 char_quality; inT16 accepted_char_quality; if ((page_res_it.page_res->rej_count * 100.0 / page_res_it.page_res->char_count) > tessedit_reject_doc_percent) { reject_whole_page(page_res_it); #ifndef SECURE_NAMES if (tessedit_debug_doc_rejection) { tprintf ("REJECT ALL #chars: %d #Rejects: %d; \n", page_res_it.page_res->char_count, page_res_it.page_res->rej_count); } #endif } else { #ifndef SECURE_NAMES if (tessedit_debug_doc_rejection) tprintf ("NO PAGE REJECTION #chars: %d # Rejects: %d; \n", page_res_it.page_res->char_count, page_res_it.page_res->rej_count); #endif /* Walk blocks testing for block rejection */ page_res_it.restart_page (); while (page_res_it.word () != NULL) { current_block = page_res_it.block(); block_no = current_block->block->index(); if ((page_res_it.block ()->char_count > 0) && ((page_res_it.block ()->rej_count * 100.0 / page_res_it.block ()->char_count) > tessedit_reject_block_percent)) { #ifndef SECURE_NAMES if (tessedit_debug_block_rejection) tprintf ("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n", block_no, page_res_it.block ()->char_count, page_res_it.block ()->rej_count); #endif prev_word_rejected = FALSE; while ((page_res_it.word () != NULL) && (page_res_it.block () == current_block)) { if (tessedit_preserve_blk_rej_perfect_wds) { rej_word = (page_res_it.word ()->reject_map.reject_count () > 0) || (page_res_it.word ()->reject_map.length () < tessedit_preserve_min_wd_len); if (rej_word && tessedit_dont_blkrej_good_wds && !(page_res_it.word ()->reject_map.length () < tessedit_preserve_min_wd_len) && (acceptable_word_string (page_res_it.word()->best_choice->unichar_string().string(), page_res_it.word ()->best_choice->unichar_lengths().string()) != AC_UNACCEPTABLE)) { word_char_quality (page_res_it.word (), page_res_it.row ()->row, &char_quality, &accepted_char_quality); rej_word = char_quality != page_res_it.word ()->reject_map.length (); } } else rej_word = TRUE; if (rej_word) { /* Reject spacing if both current and prev words are rejected. NOTE - this is NOT restricted to FUZZY spaces. - When tried this generated more space errors. */ if (tessedit_use_reject_spaces && prev_word_rejected && (page_res_it.prev_row () == page_res_it.row ()) && (page_res_it.word ()->word->space () == 1)) page_res_it.word ()->reject_spaces = TRUE; page_res_it.word ()->reject_map.rej_word_block_rej (); } prev_word_rejected = rej_word; page_res_it.forward (); } } else { #ifndef SECURE_NAMES if (tessedit_debug_block_rejection) tprintf ("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n", block_no, page_res_it.block ()->char_count, page_res_it.block ()->rej_count); #endif /* Walk rows in block testing for row rejection */ row_no = 0; while ((page_res_it.word () != NULL) && (page_res_it.block () == current_block)) { current_row = page_res_it.row (); row_no++; /* Reject whole row if: fraction of chars on row which are rejected exceed a limit AND fraction rejects which occur in WHOLE WERD rejects is LESS THAN a limit */ if ((page_res_it.row ()->char_count > 0) && ((page_res_it.row ()->rej_count * 100.0 / page_res_it.row ()->char_count) > tessedit_reject_row_percent) && ((page_res_it.row ()->whole_word_rej_count * 100.0 / page_res_it.row ()->rej_count) < tessedit_whole_wd_rej_row_percent)) { #ifndef SECURE_NAMES if (tessedit_debug_block_rejection) tprintf ("REJECTING ROW %d #chars: %d; #Rejects: %d\n", row_no, page_res_it.row ()->char_count, page_res_it.row ()->rej_count); #endif prev_word_rejected = FALSE; while ((page_res_it.word () != NULL) && (page_res_it.row () == current_row)) { /* Preserve words on good docs unless they are mostly rejected*/ if (!tessedit_row_rej_good_docs && good_quality_doc) { rej_word = page_res_it.word ()->reject_map. reject_count () / (float) page_res_it.word ()->reject_map. length () > tessedit_good_doc_still_rowrej_wd; } /* Preserve perfect words anyway */ else if (tessedit_preserve_row_rej_perfect_wds) { rej_word = (page_res_it.word ()->reject_map. reject_count () > 0) || (page_res_it.word ()->reject_map. length () < tessedit_preserve_min_wd_len); if (rej_word && tessedit_dont_rowrej_good_wds && !(page_res_it.word ()->reject_map. length () < tessedit_preserve_min_wd_len) && (acceptable_word_string (page_res_it.word ()->best_choice-> unichar_string().string(), page_res_it.word ()->best_choice-> unichar_lengths().string()) != AC_UNACCEPTABLE)) { word_char_quality (page_res_it.word (), page_res_it.row ()->row, &char_quality, &accepted_char_quality); rej_word = char_quality != page_res_it.word ()->reject_map.length (); } } else rej_word = TRUE; if (rej_word) { /* Reject spacing if both current and prev words are rejected. NOTE - this is NOT restricted to FUZZY spaces. - When tried this generated more space errors. */ if (tessedit_use_reject_spaces && prev_word_rejected && (page_res_it.prev_row () == page_res_it.row ()) && (page_res_it.word ()->word->space () == 1)) page_res_it.word ()->reject_spaces = TRUE; page_res_it.word ()->reject_map. rej_word_row_rej(); } prev_word_rejected = rej_word; page_res_it.forward (); } } else { #ifndef SECURE_NAMES if (tessedit_debug_block_rejection) tprintf ("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n", row_no, page_res_it.row ()->char_count, page_res_it.row ()->rej_count); #endif while ((page_res_it.word () != NULL) && (page_res_it.row () == current_row)) page_res_it.forward (); } } } } } } } // namespace tesseract /************************************************************************* * reject_whole_page() * Dont believe any of it - set the reject map to 00..00 in all words * *************************************************************************/ void reject_whole_page(PAGE_RES_IT &page_res_it) { page_res_it.restart_page (); while (page_res_it.word () != NULL) { page_res_it.word ()->reject_map.rej_word_doc_rej (); page_res_it.forward (); } //whole page is rejected page_res_it.page_res->rejected = TRUE; } namespace tesseract { void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) { WERD_RES *word; GARBAGE_LEVEL garbage_level; PAGE_RES_IT copy_it; BOOL8 prev_potential_marked = FALSE; BOOL8 found_terrible_word = FALSE; BOOL8 ok_dict_word; page_res_it.restart_page (); while (page_res_it.word () != NULL) { word = page_res_it.word (); if (crunch_early_convert_bad_unlv_chs) convert_bad_unlv_chs(word); if (crunch_early_merge_tess_fails) merge_tess_fails(word); if (word->reject_map.accept_count () != 0) { found_terrible_word = FALSE; //Forget earlier potential crunches prev_potential_marked = FALSE; } else { ok_dict_word = safe_dict_word(*(word->best_choice)); garbage_level = garbage_word (word, ok_dict_word); if ((garbage_level != G_NEVER_CRUNCH) && (terrible_word_crunch (word, garbage_level))) { if (crunch_debug > 0) { tprintf ("T CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().string()); } word->unlv_crunch_mode = CR_KEEP_SPACE; if (prev_potential_marked) { while (copy_it.word () != word) { if (crunch_debug > 0) { tprintf ("P1 CRUNCHING: \"%s\"\n", copy_it.word()->best_choice->unichar_string().string()); } copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE; copy_it.forward (); } prev_potential_marked = FALSE; } found_terrible_word = TRUE; } else if ((garbage_level != G_NEVER_CRUNCH) && (potential_word_crunch (word, garbage_level, ok_dict_word))) { if (found_terrible_word) { if (crunch_debug > 0) { tprintf ("P2 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().string()); } word->unlv_crunch_mode = CR_KEEP_SPACE; } else if (!prev_potential_marked) { copy_it = page_res_it; prev_potential_marked = TRUE; if (crunch_debug > 1) { tprintf ("P3 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().string()); } } } else { found_terrible_word = FALSE; //Forget earlier potential crunches prev_potential_marked = FALSE; if (crunch_debug > 2) { tprintf ("NO CRUNCH: \"%s\"\n", word->best_choice->unichar_string().string()); } } } page_res_it.forward (); } } } // namespace tesseract BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) { float rating_per_ch; int adjusted_len; int crunch_mode = 0; if ((word->best_choice->unichar_string().length () == 0) || (strspn (word->best_choice->unichar_string().string(), " ") == word->best_choice->unichar_string().length ())) crunch_mode = 1; else { adjusted_len = word->reject_map.length (); if (adjusted_len > crunch_rating_max) adjusted_len = crunch_rating_max; rating_per_ch = word->best_choice->rating () / adjusted_len; if (rating_per_ch > crunch_terrible_rating) crunch_mode = 2; else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) crunch_mode = 3; else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) && (garbage_level != G_OK)) crunch_mode = 4; else if ((rating_per_ch > crunch_poor_garbage_rate) && (garbage_level != G_OK)) crunch_mode = 5; } if (crunch_mode > 0) { if (crunch_debug > 2) { tprintf ("Terrible_word_crunch (%d) on \"%s\"\n", crunch_mode, word->best_choice->unichar_string().string()); } return TRUE; } else return FALSE; } namespace tesseract { BOOL8 Tesseract::potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word) { float rating_per_ch; int adjusted_len; const char *str = word->best_choice->unichar_string().string(); const char *lengths = word->best_choice->unichar_lengths().string(); BOOL8 word_crunchable; int poor_indicator_count = 0; word_crunchable = !crunch_leave_accept_strings || (word->reject_map.length () < 3) || ((acceptable_word_string (str, lengths) == AC_UNACCEPTABLE) && !ok_dict_word); adjusted_len = word->reject_map.length (); if (adjusted_len > 10) adjusted_len = 10; rating_per_ch = word->best_choice->rating () / adjusted_len; if (rating_per_ch > crunch_pot_poor_rate) { if (crunch_debug > 2) { tprintf ("Potential poor rating on \"%s\"\n", word->best_choice->unichar_string().string()); } poor_indicator_count++; } if (word_crunchable && (word->best_choice->certainty () < crunch_pot_poor_cert)) { if (crunch_debug > 2) { tprintf ("Potential poor cert on \"%s\"\n", word->best_choice->unichar_string().string()); } poor_indicator_count++; } if (garbage_level != G_OK) { if (crunch_debug > 2) { tprintf ("Potential garbage on \"%s\"\n", word->best_choice->unichar_string().string()); } poor_indicator_count++; } return (poor_indicator_count >= crunch_pot_indicators); } } // namespace tesseract namespace tesseract { void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) { WERD_RES *word; PAGE_RES_IT copy_it; BOOL8 deleting_from_bol = FALSE; BOOL8 marked_delete_point = FALSE; inT16 debug_delete_mode; CRUNCH_MODE delete_mode; inT16 x_debug_delete_mode; CRUNCH_MODE x_delete_mode; page_res_it.restart_page (); while (page_res_it.word () != NULL) { word = page_res_it.word (); delete_mode = word_deletable (word, debug_delete_mode); if (delete_mode != CR_NONE) { if (word->word->flag (W_BOL) || deleting_from_bol) { if (crunch_debug > 0) { tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode, word->best_choice->unichar_string().string()); } word->unlv_crunch_mode = delete_mode; deleting_from_bol = TRUE; } else if (word->word->flag (W_EOL)) { if (marked_delete_point) { while (copy_it.word () != word) { x_delete_mode = word_deletable (copy_it.word (), x_debug_delete_mode); if (crunch_debug > 0) { tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n", x_debug_delete_mode, copy_it.word()->best_choice->unichar_string().string()); } copy_it.word ()->unlv_crunch_mode = x_delete_mode; copy_it.forward (); } } if (crunch_debug > 0) { tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode, word->best_choice->unichar_string().string()); } word->unlv_crunch_mode = delete_mode; deleting_from_bol = FALSE; marked_delete_point = FALSE; } else { if (!marked_delete_point) { copy_it = page_res_it; marked_delete_point = TRUE; } } } else { deleting_from_bol = FALSE; //Forget earlier potential crunches marked_delete_point = FALSE; } /* The following step has been left till now as the tess fails are used to determine if the word is deletable. */ if (!crunch_early_merge_tess_fails) merge_tess_fails(word); page_res_it.forward (); } } void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) { int i; UNICHAR_ID unichar_dash = unicharset.unichar_to_id("-"); UNICHAR_ID unichar_space = unicharset.unichar_to_id(" "); UNICHAR_ID unichar_tilde = unicharset.unichar_to_id("~"); UNICHAR_ID unichar_pow = unicharset.unichar_to_id("^"); bool modified = false; for (i = 0; i < word_res->reject_map.length(); ++i) { if (word_res->best_choice->unichar_id(i) == unichar_tilde) { word_res->best_choice->set_unichar_id(unichar_dash, i); modified = true; if (word_res->reject_map[i].accepted ()) word_res->reject_map[i].setrej_unlv_rej (); } if (word_res->best_choice->unichar_id(i) == unichar_pow) { word_res->best_choice->set_unichar_id(unichar_space, i); modified = true; if (word_res->reject_map[i].accepted ()) word_res->reject_map[i].setrej_unlv_rej (); } } if (modified) { word_res->best_choice->populate_unichars(unicharset); } } // Change pairs of tess failures to a single one void Tesseract::merge_tess_fails(WERD_RES *word_res) { PBLOB_IT blob_it; //blobs int len = word_res->best_choice->length(); bool modified = false; ASSERT_HOST (word_res->reject_map.length () == len); ASSERT_HOST (word_res->outword->blob_list ()->length () == len); UNICHAR_ID unichar_space = unicharset.unichar_to_id(" "); blob_it = word_res->outword->blob_list (); int i = 0; while (i < word_res->best_choice->length()-1) { if ((word_res->best_choice->unichar_id(i) == unichar_space) && (word_res->best_choice->unichar_id(i+1) == unichar_space)) { modified = true; word_res->best_choice->remove_unichar_id(i); word_res->reject_map.remove_pos (i); merge_blobs (blob_it.data_relative (1), blob_it.data ()); delete blob_it.extract (); //get rid of spare } else { i++; } blob_it.forward (); } len = word_res->best_choice->length(); ASSERT_HOST (word_res->reject_map.length () == len); ASSERT_HOST (word_res->outword->blob_list ()->length () == len); if (modified) { word_res->best_choice->populate_unichars(unicharset); } } GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) { enum STATES { JUNK, FIRST_UPPER, FIRST_LOWER, FIRST_NUM, SUBSEQUENT_UPPER, SUBSEQUENT_LOWER, SUBSEQUENT_NUM }; const char *str = word->best_choice->unichar_string().string(); const char *lengths = word->best_choice->unichar_lengths().string(); STATES state = JUNK; int len = 0; int isolated_digits = 0; int isolated_alphas = 0; int bad_char_count = 0; int tess_rejs = 0; int dodgy_chars = 0; int ok_chars; UNICHAR_ID last_char = -1; int alpha_repetition_count = 0; int longest_alpha_repetition_count = 0; int longest_lower_run_len = 0; int lower_string_count = 0; int longest_upper_run_len = 0; int upper_string_count = 0; int total_alpha_count = 0; int total_digit_count = 0; for (; *str != '\0'; str += *(lengths++)) { len++; if (unicharset.get_isupper (str, *lengths)) { total_alpha_count++; switch (state) { case SUBSEQUENT_UPPER: case FIRST_UPPER: state = SUBSEQUENT_UPPER; upper_string_count++; if (longest_upper_run_len < upper_string_count) longest_upper_run_len = upper_string_count; if (last_char == unicharset.unichar_to_id(str, *lengths)) { alpha_repetition_count++; if (longest_alpha_repetition_count < alpha_repetition_count) { longest_alpha_repetition_count = alpha_repetition_count; } } else { last_char = unicharset.unichar_to_id(str, *lengths); alpha_repetition_count = 1; } break; case FIRST_NUM: isolated_digits++; default: state = FIRST_UPPER; last_char = unicharset.unichar_to_id(str, *lengths); alpha_repetition_count = 1; upper_string_count = 1; break; } } else if (unicharset.get_islower (str, *lengths)) { total_alpha_count++; switch (state) { case SUBSEQUENT_LOWER: case FIRST_LOWER: state = SUBSEQUENT_LOWER; lower_string_count++; if (longest_lower_run_len < lower_string_count) longest_lower_run_len = lower_string_count; if (last_char == unicharset.unichar_to_id(str, *lengths)) { alpha_repetition_count++; if (longest_alpha_repetition_count < alpha_repetition_count) { longest_alpha_repetition_count = alpha_repetition_count; } } else { last_char = unicharset.unichar_to_id(str, *lengths); alpha_repetition_count = 1; } break; case FIRST_NUM: isolated_digits++; default: state = FIRST_LOWER; last_char = unicharset.unichar_to_id(str, *lengths); alpha_repetition_count = 1; lower_string_count = 1; break; } } else if (unicharset.get_isdigit (str, *lengths)) { total_digit_count++; switch (state) { case FIRST_NUM: state = SUBSEQUENT_NUM; case SUBSEQUENT_NUM: break; case FIRST_UPPER: case FIRST_LOWER: isolated_alphas++; default: state = FIRST_NUM; break; } } else { if (*lengths == 1 && *str == ' ') tess_rejs++; else bad_char_count++; switch (state) { case FIRST_NUM: isolated_digits++; break; case FIRST_UPPER: case FIRST_LOWER: isolated_alphas++; default: break; } state = JUNK; } } switch (state) { case FIRST_NUM: isolated_digits++; break; case FIRST_UPPER: case FIRST_LOWER: isolated_alphas++; default: break; } if (crunch_include_numerals) { total_alpha_count += total_digit_count - isolated_digits; } if (crunch_leave_ok_strings && (len >= 4) && (2 * (total_alpha_count - isolated_alphas) > len) && (longest_alpha_repetition_count < crunch_long_repetitions)) { if ((crunch_accept_ok && (acceptable_word_string (str, lengths) != AC_UNACCEPTABLE)) || (longest_lower_run_len > crunch_leave_lc_strings) || (longest_upper_run_len > crunch_leave_uc_strings)) return G_NEVER_CRUNCH; } if ((word->reject_map.length () > 1) && (strpbrk (str, " ") == NULL) && ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) || (word->best_choice->permuter () == FREQ_DAWG_PERM) || (word->best_choice->permuter () == USER_DAWG_PERM) || (word->best_choice->permuter () == NUMBER_PERM) || (acceptable_word_string (str, lengths) != AC_UNACCEPTABLE) || ok_dict_word)) return G_OK; ok_chars = len - bad_char_count - isolated_digits - isolated_alphas - tess_rejs; if (crunch_debug > 3) { tprintf ("garbage_word: \"%s\"\n", word->best_choice->unichar_string().string()); tprintf ("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n", len, bad_char_count, isolated_digits, isolated_alphas, tess_rejs); } if ((bad_char_count == 0) && (tess_rejs == 0) && ((len > isolated_digits + isolated_alphas) || (len <= 2))) return G_OK; if ((tess_rejs > ok_chars) || ((tess_rejs > 0) && ((bad_char_count + tess_rejs) * 2 > len))) return G_TERRIBLE; if (len > 4) { dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits + isolated_alphas; if ((dodgy_chars > 5) || ((dodgy_chars / (float) len) > 0.5)) return G_DODGY; else return G_OK; } else { dodgy_chars = 2 * tess_rejs + bad_char_count; if (((len == 4) && (dodgy_chars > 2)) || ((len == 3) && (dodgy_chars > 2)) || (dodgy_chars >= len)) return G_DODGY; else return G_OK; } } } // namespace tesseract /************************************************************************* * word_deletable() * DELETE WERDS AT ENDS OF ROWS IF * Word is crunched && * ( string length = 0 OR * > 50% of chars are "|" (before merging) OR * certainty < -10 OR * rating /char > 60 OR * TOP of word is more than 0.5 xht BELOW baseline OR * BOTTOM of word is more than 0.5 xht ABOVE xht OR * length of word < 3xht OR * height of word < 0.7 xht OR * height of word > 3.0 xht OR * >75% of the outline BBs have longest dimension < 0.5xht *************************************************************************/ CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode) { int word_len = word->reject_map.length (); float rating_per_ch; TBOX box; //BB of word if (word->unlv_crunch_mode == CR_NONE) { delete_mode = 0; return CR_NONE; } if (word_len == 0) { delete_mode = 1; return CR_DELETE; } box = word->outword->bounding_box (); if (box.height () < crunch_del_min_ht * bln_x_height) { delete_mode = 4; return CR_DELETE; } if (noise_outlines (word->outword)) { delete_mode = 5; return CR_DELETE; } if ((failure_count (word) * 1.5) > word_len) { delete_mode = 2; return CR_LOOSE_SPACE; } if (word->best_choice->certainty () < crunch_del_cert) { delete_mode = 7; return CR_LOOSE_SPACE; } rating_per_ch = word->best_choice->rating () / word_len; if (rating_per_ch > crunch_del_rating) { delete_mode = 8; return CR_LOOSE_SPACE; } if (box.top () < bln_baseline_offset - crunch_del_low_word * bln_x_height) { delete_mode = 9; return CR_LOOSE_SPACE; } if (box.bottom () > bln_baseline_offset + crunch_del_high_word * bln_x_height) { delete_mode = 10; return CR_LOOSE_SPACE; } if (box.height () > crunch_del_max_ht * bln_x_height) { delete_mode = 11; return CR_LOOSE_SPACE; } if (box.width () < crunch_del_min_width * bln_x_height) { delete_mode = 3; return CR_LOOSE_SPACE; } delete_mode = 0; return CR_NONE; } inT16 failure_count(WERD_RES *word) { const char *str = word->best_choice->unichar_string().string(); int tess_rejs = 0; for (; *str != '\0'; str++) { if (*str == ' ') tess_rejs++; } return tess_rejs; } BOOL8 noise_outlines(WERD *word) { PBLOB_IT blob_it; OUTLINE_IT outline_it; TBOX box; //BB of outline inT16 outline_count = 0; inT16 small_outline_count = 0; inT16 max_dimension; float small_limit = bln_x_height * crunch_small_outlines_size; blob_it.set_to_list (word->blob_list ()); for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { outline_it.set_to_list (blob_it.data ()->out_list ()); for (outline_it.mark_cycle_pt (); !outline_it.cycled_list (); outline_it.forward ()) { outline_count++; box = outline_it.data ()->bounding_box (); if (box.height () > box.width ()) max_dimension = box.height (); else max_dimension = box.width (); if (max_dimension < small_limit) small_outline_count++; } } return (small_outline_count >= outline_count); } /************************************************************************* * insert_rej_cblobs() * Put rejected word blobs back into the outword. * NOTE!!! AFTER THIS THE CHOICES LIST WILL NOT HAVE THE CORRECT NUMBER * OF ELEMENTS. *************************************************************************/ namespace tesseract { void Tesseract::insert_rej_cblobs(WERD_RES *word) { PBLOB_IT blob_it; //blob iterator PBLOB_IT rej_blob_it; const STRING *word_str; const STRING *word_lengths; int old_len; int rej_len; char new_str[512 * UNICHAR_LEN]; char new_lengths[512]; REJMAP new_map; int i = 0; //new_str index int j = 0; //old_str index int i_offset = 0; //new_str offset int j_offset = 0; //old_str offset int new_len; gblob_sort_list (word->outword->rej_blob_list (), TRUE); rej_blob_it.set_to_list (word->outword->rej_blob_list ()); if (rej_blob_it.empty ()) return; rej_len = rej_blob_it.length (); blob_it.set_to_list (word->outword->blob_list ()); word_str = &(word->best_choice->unichar_string()); word_lengths = &(word->best_choice->unichar_lengths()); old_len = word->best_choice->length(); ASSERT_HOST (word->reject_map.length () == old_len); ASSERT_HOST (blob_it.length () == old_len); if ((old_len + rej_len) > 511) return; //Word is garbage anyway prevent abort new_map.initialise (old_len + rej_len); while (!rej_blob_it.empty ()) { if ((j >= old_len) || (rej_blob_it.data ()->bounding_box ().left () <= blob_it.data ()->bounding_box ().left ())) { /* Insert reject blob */ if (j >= old_len) blob_it.add_to_end (rej_blob_it.extract ()); else blob_it.add_before_stay_put (rej_blob_it.extract ()); if (!rej_blob_it.empty ()) rej_blob_it.forward (); new_str[i_offset] = ' '; new_lengths[i] = 1; new_map[i].setrej_rej_cblob (); i_offset += new_lengths[i++]; } else { strncpy(new_str + i_offset, &(*word_str)[j_offset], (*word_lengths)[j]); new_lengths[i] = (*word_lengths)[j]; new_map[i] = word->reject_map[j]; i_offset += new_lengths[i++]; j_offset += (*word_lengths)[j++]; blob_it.forward (); } } /* Add any extra normal blobs to strings */ while (j < word_lengths->length ()) { strncpy(new_str + i_offset, &(*word_str)[j_offset], (*word_lengths)[j]); new_lengths[i] = (*word_lengths)[j]; new_map[i] = word->reject_map[j]; i_offset += new_lengths[i++]; j_offset += (*word_lengths)[j++]; } new_str[i_offset] = '\0'; new_lengths[i] = 0; /* tprintf( "\nOld len %d; New len %d; New str \"%s\"; New map \"%s\"\n", old_len, i, new_str, new_map ); */ ASSERT_HOST (i == blob_it.length ()); ASSERT_HOST (i == old_len + rej_len); word->reject_map = new_map; // Update word->best_choice if needed. if (strcmp(new_str, word->best_choice->unichar_string().string()) != 0 || strcmp(new_lengths, word->best_choice->unichar_lengths().string()) != 0) { WERD_CHOICE *new_choice = new WERD_CHOICE(new_str, new_lengths, word->best_choice->rating(), word->best_choice->certainty(), word->best_choice->permuter(), getDict().getUnicharset()); new_choice->populate_unichars(getDict().getUnicharset()); delete word->best_choice; word->best_choice = new_choice; } new_len = word->best_choice->length(); ASSERT_HOST (word->reject_map.length () == new_len); ASSERT_HOST (word->outword->blob_list ()->length () == new_len); } } // namespace tesseract