mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-27 20:59:36 +08:00
08defee46e
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@450 d0cd1f9f-072b-0410-8dd7-cf729c803f20
989 lines
33 KiB
C++
989 lines
33 KiB
C++
/******************************************************************
 * File:        fixspace.cpp  (Formerly fixspace.c)
 * Description: Implements a pass over the page res, exploring the alternative
 *              spacing possibilities, trying to use context to improve the
 *              word spacing
 * Author:      Phil Cheatle
 * Created:     Thu Oct 21 11:38:43 BST 1993
 *
 * (C) Copyright 1993, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/
|
|
|
|
#include "mfcpch.h"
|
|
#include <ctype.h>
|
|
#include "reject.h"
|
|
#include "statistc.h"
|
|
#include "genblob.h"
|
|
#include "control.h"
|
|
#include "fixspace.h"
|
|
#include "tessvars.h"
|
|
#include "tessbox.h"
|
|
#include "secname.h"
|
|
#include "globals.h"
|
|
#include "tesseractclass.h"
|
|
|
|
#define EXTERN
|
|
|
|
EXTERN BOOL_VAR (fixsp_check_for_fp_noise_space, TRUE,
|
|
"Try turning noise to space in fixed pitch");
|
|
EXTERN BOOL_VAR (fixsp_fp_eval, TRUE, "Use alternate evaluation for fp");
|
|
EXTERN BOOL_VAR (fixsp_noise_score_fixing, TRUE, "More sophisticated?");
|
|
EXTERN INT_VAR (fixsp_non_noise_limit, 1,
|
|
"How many non-noise blbs either side?");
|
|
EXTERN double_VAR (fixsp_small_outlines_size, 0.28, "Small if lt xht x this");
|
|
|
|
EXTERN BOOL_VAR (fixsp_ignore_punct, TRUE, "In uniform spacing calc");
|
|
EXTERN BOOL_VAR (fixsp_numeric_fix, TRUE, "Try to deal with numeric punct");
|
|
EXTERN BOOL_VAR (fixsp_prefer_joined_1s, TRUE, "Arbitrary boost");
|
|
EXTERN BOOL_VAR (tessedit_test_uniform_wd_spacing, FALSE,
|
|
"Limit context word spacing");
|
|
EXTERN BOOL_VAR (tessedit_prefer_joined_punct, FALSE,
|
|
"Reward punctation joins");
|
|
EXTERN INT_VAR (fixsp_done_mode, 1, "What constitues done for spacing");
|
|
EXTERN INT_VAR (debug_fix_space_level, 0, "Contextual fixspace debug");
|
|
EXTERN STRING_VAR (numeric_punctuation, ".,",
|
|
"Punct. chs expected WITHIN numbers");
|
|
|
|
#define PERFECT_WERDS 999
|
|
#define MAXSPACING 128 /*max expected spacing in pix */
|
|
|
|
namespace tesseract {
|
|
/**
|
|
* @name fix_fuzzy_spaces()
|
|
* Walk over the page finding sequences of words joined by fuzzy spaces. Extract
|
|
* them as a sublist, process the sublist to find the optimal arrangement of
|
|
* spaces then replace the sublist in the ROW_RES.
|
|
*
|
|
* @param monitor progress monitor
|
|
* @param word_count count of words in doc
|
|
* @param[out] page_res
|
|
*/
|
|
void Tesseract::fix_fuzzy_spaces(volatile ETEXT_DESC *monitor,
|
|
inT32 word_count,
|
|
PAGE_RES *page_res) {
|
|
BLOCK_RES_IT block_res_it; //iterators
|
|
ROW_RES_IT row_res_it;
|
|
WERD_RES_IT word_res_it_from;
|
|
WERD_RES_IT word_res_it_to;
|
|
WERD_RES *word_res;
|
|
WERD_RES_LIST fuzzy_space_words;
|
|
inT16 new_length;
|
|
BOOL8 prevent_null_wd_fixsp; //DONT process blobless wds
|
|
inT32 word_index; //current word
|
|
|
|
block_res_it.set_to_list (&page_res->block_res_list);
|
|
word_index = 0;
|
|
for (block_res_it.mark_cycle_pt ();
|
|
!block_res_it.cycled_list (); block_res_it.forward ()) {
|
|
row_res_it.set_to_list (&block_res_it.data ()->row_res_list);
|
|
for (row_res_it.mark_cycle_pt ();
|
|
!row_res_it.cycled_list (); row_res_it.forward ()) {
|
|
word_res_it_from.set_to_list (&row_res_it.data ()->word_res_list);
|
|
while (!word_res_it_from.at_last ()) {
|
|
word_res = word_res_it_from.data ();
|
|
while (!word_res_it_from.at_last () &&
|
|
!(word_res->combination ||
|
|
word_res_it_from.data_relative (1)->word->flag (W_FUZZY_NON) ||
|
|
word_res_it_from.data_relative (1)->word->flag (W_FUZZY_SP))) {
|
|
fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
|
|
block_res_it.data()->block);
|
|
word_res = word_res_it_from.forward ();
|
|
word_index++;
|
|
if (monitor != NULL) {
|
|
monitor->ocr_alive = TRUE;
|
|
monitor->progress = 90 + 5 * word_index / word_count;
|
|
}
|
|
}
|
|
|
|
if (!word_res_it_from.at_last ()) {
|
|
word_res_it_to = word_res_it_from;
|
|
prevent_null_wd_fixsp =
|
|
word_res->word->gblob_list ()->empty ();
|
|
if (check_debug_pt (word_res, 60))
|
|
debug_fix_space_level.set_value (10);
|
|
word_res_it_to.forward ();
|
|
word_index++;
|
|
if (monitor != NULL) {
|
|
monitor->ocr_alive = TRUE;
|
|
monitor->progress = 90 + 5 * word_index / word_count;
|
|
}
|
|
while (!word_res_it_to.at_last () &&
|
|
(word_res_it_to.data_relative (1)->word->flag (W_FUZZY_NON) ||
|
|
word_res_it_to.data_relative (1)->word->flag (W_FUZZY_SP))) {
|
|
if (check_debug_pt (word_res, 60))
|
|
debug_fix_space_level.set_value (10);
|
|
if (word_res->word->gblob_list ()->empty ())
|
|
prevent_null_wd_fixsp = TRUE;
|
|
word_res = word_res_it_to.forward ();
|
|
}
|
|
if (check_debug_pt (word_res, 60))
|
|
debug_fix_space_level.set_value (10);
|
|
if (word_res->word->gblob_list ()->empty ())
|
|
prevent_null_wd_fixsp = TRUE;
|
|
if (prevent_null_wd_fixsp) {
|
|
word_res_it_from = word_res_it_to;
|
|
} else {
|
|
fuzzy_space_words.assign_to_sublist(&word_res_it_from,
|
|
&word_res_it_to);
|
|
fix_fuzzy_space_list(fuzzy_space_words,
|
|
row_res_it.data()->row,
|
|
block_res_it.data()->block);
|
|
new_length = fuzzy_space_words.length ();
|
|
word_res_it_from.add_list_before (&fuzzy_space_words);
|
|
for (; (!word_res_it_from.at_last () && (new_length > 0)); new_length--) {
|
|
word_res_it_from.forward ();
|
|
}
|
|
}
|
|
if (test_pt)
|
|
debug_fix_space_level.set_value (0);
|
|
}
|
|
fix_sp_fp_word(word_res_it_from, row_res_it.data ()->row,
|
|
block_res_it.data()->block);
|
|
//Last word in row
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
 * Searches for the best spacing of a run of fuzzy-spaced words.
 *
 * Starting from the words as given, repeatedly re-classifies the candidate
 * word list, scores it with eval_word_spacing() and asks
 * transform_to_next_perm() for the next candidate. The search stops when a
 * candidate scores PERFECT_WERDS or the candidate list becomes empty.
 *
 * @param[in,out] best_perm on entry the original words; on exit the highest
 *                scoring arrangement found (unchanged if nothing scored higher)
 * @param row  row passed through to the word classifier
 * @param block block passed through to the word classifier
 */
void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm,
                                     ROW *row,
                                     BLOCK* block) {
  inT16 best_score;
  WERD_RES_LIST current_perm;
  inT16 current_score;
  BOOL8 improved = FALSE;

  best_score = eval_word_spacing(best_perm);  // default score
  dump_words(best_perm, best_score, 1, improved);

  // An already perfect list needs no search; current_perm then stays empty.
  if (best_score != PERFECT_WERDS)
    initialise_search(best_perm, current_perm);

  while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
    match_current_words(current_perm, row, block);
    current_score = eval_word_spacing(current_perm);
    dump_words(current_perm, current_score, 2, improved);
    if (current_score > best_score) {
      best_perm.clear();
      best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
      best_score = current_score;
      improved = TRUE;
    }
    if (current_score < PERFECT_WERDS)
      transform_to_next_perm(current_perm);
  }
  dump_words(best_perm, best_score, 3, improved);
}
|
|
|
|
} // namespace tesseract
|
|
|
|
/**
 * Seeds new_list with a copy of every word in src_list that is not itself a
 * combination. Each copy has its combination and part_of_combo flags cleared
 * and is appended in source order.
 */
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
  WERD_RES_IT from_it(&src_list);
  WERD_RES_IT to_it(&new_list);

  for (from_it.mark_cycle_pt(); !from_it.cycled_list(); from_it.forward()) {
    WERD_RES *original = from_it.data();
    if (original->combination)
      continue;  // combinations are not carried over

    WERD_RES *duplicate = new WERD_RES(*original);
    duplicate->combination = FALSE;
    duplicate->part_of_combo = FALSE;
    to_it.add_after_then_move(duplicate);
  }
}
|
|
|
|
|
|
namespace tesseract {
|
|
/**
 * Runs classify_word_pass2() on every word in the list that still needs it:
 * words that are not part of a combination and have no outword yet.
 */
void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
                                    BLOCK* block) {
  WERD_RES_IT it(&words);

  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
    WERD_RES *candidate = it.data();
    if (candidate->part_of_combo || (candidate->outword != NULL))
      continue;  // absorbed into a combo, or already classified
    classify_word_pass2(candidate, block, row);
  }
}
|
|
|
|
|
|
/**
|
|
* @name eval_word_spacing()
|
|
* The basic measure is the number of characters in contextually confirmed
|
|
* words. (I.e the word is done)
|
|
* If all words are contextually confirmed the evaluation is deemed perfect.
|
|
*
|
|
* Some fiddles are done to handle "1"s as these are VERY frequent causes of
|
|
* fuzzy spaces. The problem with the basic measure is that "561 63" would score
|
|
* the same as "56163", though given our knowledge that the space is fuzzy, and
|
|
* that there is a "1" next to the fuzzy space, we need to ensure that "56163"
|
|
* is prefered.
|
|
*
|
|
* The solution is to NOT COUNT the score of any word which has a digit at one
|
|
* end and a "1Il" as the character the other side of the space.
|
|
*
|
|
* Conversly, any character next to a "1" within a word is counted as a positive
|
|
* score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of
|
|
* the "1" joined). "56163" would score 7 - all chars in a numeric word + 2
|
|
* sides of a "1" joined.
|
|
*
|
|
* The joined 1 rule is applied to any word REGARDLESS of contextual
|
|
* confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contexutally
|
|
* confirmed. The only score is from the joined 1. "PS7a713/7a" scores 2.
|
|
*
|
|
*/
|
|
inT16 Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
|
|
WERD_RES_IT word_res_it(&word_res_list);
|
|
inT16 total_score = 0;
|
|
inT16 word_count = 0;
|
|
inT16 done_word_count = 0;
|
|
inT16 word_len;
|
|
inT16 i;
|
|
inT16 offset;
|
|
WERD_RES *word; //current word
|
|
inT16 prev_word_score = 0;
|
|
BOOL8 prev_word_done = FALSE;
|
|
BOOL8 prev_char_1 = FALSE; //prev ch a "1/I/l"?
|
|
BOOL8 prev_char_digit = FALSE; //prev ch 2..9 or 0
|
|
BOOL8 current_char_1 = FALSE;
|
|
BOOL8 current_word_ok_so_far;
|
|
STRING punct_chars = "!\"`',.:;";
|
|
BOOL8 prev_char_punct = FALSE;
|
|
BOOL8 current_char_punct = FALSE;
|
|
BOOL8 word_done = FALSE;
|
|
|
|
do {
|
|
word = word_res_it.data ();
|
|
word_done = fixspace_thinks_word_done (word);
|
|
word_count++;
|
|
if (word->tess_failed) {
|
|
total_score += prev_word_score;
|
|
if (prev_word_done)
|
|
done_word_count++;
|
|
prev_word_score = 0;
|
|
prev_char_1 = FALSE;
|
|
prev_char_digit = FALSE;
|
|
prev_word_done = FALSE;
|
|
}
|
|
else {
|
|
/*
|
|
Can we add the prev word score and potentially count this word?
|
|
Yes IF it didnt end in a 1 when the first char of this word is a digit
|
|
AND it didnt end in a digit when the first char of this word is a 1
|
|
*/
|
|
word_len = word->reject_map.length ();
|
|
current_word_ok_so_far = FALSE;
|
|
if (!((prev_char_1 &&
|
|
digit_or_numeric_punct (word, 0)) ||
|
|
(prev_char_digit &&
|
|
((word_done &&
|
|
(word->best_choice->unichar_lengths().string()[0] == 1 &&
|
|
word->best_choice->unichar_string()[0] == '1')) ||
|
|
(!word_done &&
|
|
STRING(conflict_set_I_l_1).contains(word->best_choice->unichar_string ()[0])))))) {
|
|
total_score += prev_word_score;
|
|
if (prev_word_done)
|
|
done_word_count++;
|
|
current_word_ok_so_far = word_done;
|
|
}
|
|
|
|
if ((current_word_ok_so_far) &&
|
|
(!tessedit_test_uniform_wd_spacing ||
|
|
((word->best_choice->permuter () == NUMBER_PERM) ||
|
|
uniformly_spaced (word)))) {
|
|
prev_word_done = TRUE;
|
|
prev_word_score = word_len;
|
|
}
|
|
else {
|
|
prev_word_done = FALSE;
|
|
prev_word_score = 0;
|
|
}
|
|
|
|
if (fixsp_prefer_joined_1s) {
|
|
/* Add 1 to total score for every joined 1 regardless of context and
|
|
rejtn */
|
|
|
|
for (i = 0, prev_char_1 = FALSE; i < word_len; i++) {
|
|
current_char_1 = word->best_choice->unichar_string()[i] == '1';
|
|
if (prev_char_1 || (current_char_1 && (i > 0)))
|
|
total_score++;
|
|
prev_char_1 = current_char_1;
|
|
}
|
|
}
|
|
|
|
/* Add 1 to total score for every joined punctuation regardless of context
|
|
and rejtn */
|
|
if (tessedit_prefer_joined_punct) {
|
|
for (i = 0, offset = 0, prev_char_punct = FALSE; i < word_len;
|
|
offset += word->best_choice->unichar_lengths()[i++]) {
|
|
current_char_punct =
|
|
punct_chars.contains (word->best_choice->unichar_string()[offset]);
|
|
if (prev_char_punct || (current_char_punct && (i > 0)))
|
|
total_score++;
|
|
prev_char_punct = current_char_punct;
|
|
}
|
|
}
|
|
prev_char_digit = digit_or_numeric_punct (word, word_len - 1);
|
|
for (i = 0, offset = 0; i < word_len - 1;
|
|
offset += word->best_choice->unichar_lengths()[i++]);
|
|
prev_char_1 =
|
|
((word_done
|
|
&& (word->best_choice->unichar_string()[offset] == '1'))
|
|
|| (!word_done
|
|
&& STRING(conflict_set_I_l_1).contains(
|
|
word->best_choice->unichar_string()[offset])));
|
|
}
|
|
/* Find next word */
|
|
do
|
|
word_res_it.forward ();
|
|
while (word_res_it.data ()->part_of_combo);
|
|
}
|
|
while (!word_res_it.at_first ());
|
|
total_score += prev_word_score;
|
|
if (prev_word_done)
|
|
done_word_count++;
|
|
if (done_word_count == word_count)
|
|
return PERFECT_WERDS;
|
|
else
|
|
return total_score;
|
|
}
|
|
|
|
|
|
/**
 * Tests the character at char_position in word's best choice.
 *
 * Returns TRUE when the unicharset reports the character as a digit, or when
 * fixsp_numeric_fix is set, the word's permuter is NUMBER_PERM and the
 * character's first byte is listed in numeric_punctuation.
 *
 * The character is located by summing the unichar byte lengths of the
 * characters that precede it.
 */
BOOL8 Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
  int index = 0;
  int byte_offset = 0;

  // Convert the character position into a byte offset.
  while (index < char_position) {
    byte_offset += word->best_choice->unichar_lengths()[index];
    ++index;
  }

  if (unicharset.get_isdigit(
          word->best_choice->unichar_string().string() + byte_offset,
          word->best_choice->unichar_lengths()[index]))
    return TRUE;

  return (fixsp_numeric_fix &&
          (word->best_choice->permuter() == NUMBER_PERM) &&
          STRING(numeric_punctuation).contains(
              word->best_choice->unichar_string().string()[byte_offset]));
}
|
|
} // namespace tesseract
|
|
|
|
|
|
/**
|
|
* @name transform_to_next_perm()
|
|
* Examines the current word list to find the smallest word gap size. Then walks
|
|
* the word list closing any gaps of this size by either inserted new
|
|
* combination words, or extending existing ones.
|
|
*
|
|
* The routine COULD be limited to stop it building words longer than N blobs.
|
|
*
|
|
* If there are no more gaps then it DELETES the entire list and returns the
|
|
* empty list to cause termination.
|
|
*/
|
|
void transform_to_next_perm(WERD_RES_LIST &words) {
|
|
WERD_RES_IT word_it(&words);
|
|
WERD_RES_IT prev_word_it(&words);
|
|
WERD_RES *word;
|
|
WERD_RES *prev_word;
|
|
WERD_RES *combo;
|
|
WERD *copy_word;
|
|
inT16 prev_right = -1;
|
|
TBOX box;
|
|
inT16 gap;
|
|
inT16 min_gap = MAX_INT16;
|
|
|
|
for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
|
|
word = word_it.data ();
|
|
if (!word->part_of_combo) {
|
|
box = word->word->bounding_box ();
|
|
if (prev_right >= 0) {
|
|
gap = box.left () - prev_right;
|
|
if (gap < min_gap)
|
|
min_gap = gap;
|
|
}
|
|
prev_right = box.right ();
|
|
}
|
|
}
|
|
if (min_gap < MAX_INT16) {
|
|
prev_right = -1; //back to start
|
|
word_it.set_to_list (&words);
|
|
//cant use cycle pt due to inserted combos at start of list
|
|
for (; (prev_right < 0) || !word_it.at_first (); word_it.forward ()) {
|
|
word = word_it.data ();
|
|
if (!word->part_of_combo) {
|
|
box = word->word->bounding_box ();
|
|
if (prev_right >= 0) {
|
|
gap = box.left () - prev_right;
|
|
if (gap <= min_gap) {
|
|
prev_word = prev_word_it.data ();
|
|
if (prev_word->combination)
|
|
combo = prev_word;
|
|
else {
|
|
/* Make a new combination and insert before the first word being joined */
|
|
copy_word = new WERD;
|
|
*copy_word = *(prev_word->word);
|
|
//deep copy
|
|
combo = new WERD_RES (copy_word);
|
|
combo->combination = TRUE;
|
|
combo->x_height = prev_word->x_height;
|
|
prev_word->part_of_combo = TRUE;
|
|
prev_word_it.add_before_then_move (combo);
|
|
}
|
|
combo->word->set_flag (W_EOL, word->word->flag (W_EOL));
|
|
if (word->combination) {
|
|
combo->word->join_on (word->word);
|
|
//Move blbs to combo
|
|
//old combo no longer needed
|
|
delete word_it.extract ();
|
|
}
|
|
else {
|
|
//Cpy current wd to combo
|
|
combo->copy_on (word);
|
|
word->part_of_combo = TRUE;
|
|
}
|
|
combo->done = FALSE;
|
|
if (combo->outword != NULL) {
|
|
delete combo->outword;
|
|
delete combo->best_choice;
|
|
delete combo->raw_choice;
|
|
combo->outword = NULL;
|
|
combo->best_choice = NULL;
|
|
combo->raw_choice = NULL;
|
|
}
|
|
}
|
|
else
|
|
//catch up
|
|
prev_word_it = word_it;
|
|
}
|
|
prev_right = box.right ();
|
|
}
|
|
}
|
|
}
|
|
else
|
|
words.clear (); //signal termination
|
|
}
|
|
|
|
|
|
/**
 * Debug trace of a word permutation; silent unless debug_fix_space_level > 0.
 *
 * mode 1 also remembers the text of the permutation (in a function-static
 * string) so that a later call can report "before => after".
 * With debug_fix_space_level > 1 every call prints the words with a label
 * chosen by mode (1 EXTRACTED, 2 TESTED, 3 RETURNED). Otherwise a call prints
 * only when improved is set. Words flagged part_of_combo are never listed.
 * All printing is compiled out when SECURE_NAMES is defined.
 */
void dump_words(WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved) {
  WERD_RES_IT it(&perm);
  static STRING initial_str;

  if (!(debug_fix_space_level > 0))
    return;

  if (mode == 1) {
    // Capture the starting text for a later "FIX SPACING" report.
    initial_str = "";
    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
      if (it.data()->part_of_combo)
        continue;
      initial_str += it.data()->best_choice->unichar_string();
      initial_str += ' ';
    }
  }

#ifndef SECURE_NAMES
  const BOOL8 verbose = (debug_fix_space_level > 1);
  if (!verbose && !improved)
    return;

  // Heading.
  if (verbose) {
    if (mode == 1)
      tprintf("EXTRACTED (%d): \"", score);
    else if (mode == 2)
      tprintf("TESTED (%d): \"", score);
    else if (mode == 3)
      tprintf("RETURNED (%d): \"", score);
  } else {
    tprintf("FIX SPACING \"%s\" => \"", initial_str.string());
  }

  // One "text/permuter" entry per visible word.
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
    if (it.data()->part_of_combo)
      continue;
    tprintf("%s/%1d ",
            it.data()->best_choice->unichar_string().string(),
            (int)it.data()->best_choice->permuter());
  }
  tprintf("\"\n");
#endif
}
|
|
|
|
|
|
/**
|
|
* @name uniformly_spaced()
|
|
* Return true if one of the following are true:
|
|
* - All inter-char gaps are the same width
|
|
* - The largest gap is no larger than twice the mean/median of the others
|
|
* - The largest gap is < 64/5 = 13 and all others are <= 0
|
|
* **** REMEMBER - WE'RE NOW WORKING WITH A BLN WERD !!!
|
|
*/
|
|
BOOL8 uniformly_spaced(WERD_RES *word) {
|
|
PBLOB_IT blob_it;
|
|
TBOX box;
|
|
inT16 prev_right = -MAX_INT16;
|
|
inT16 gap;
|
|
inT16 max_gap = -MAX_INT16;
|
|
inT16 max_gap_count = 0;
|
|
STATS gap_stats (0, MAXSPACING);
|
|
BOOL8 result;
|
|
const ROW *row = word->denorm.row ();
|
|
float max_non_space;
|
|
float normalised_max_nonspace;
|
|
inT16 i = 0;
|
|
inT16 offset = 0;
|
|
STRING punct_chars = "\"`',.:;";
|
|
|
|
blob_it.set_to_list (word->outword->blob_list ());
|
|
|
|
for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
|
|
box = blob_it.data ()->bounding_box ();
|
|
if ((prev_right > -MAX_INT16) &&
|
|
(!fixsp_ignore_punct ||
|
|
(!punct_chars.contains (word->best_choice->unichar_string()
|
|
[offset - word->best_choice->unichar_lengths()[i - 1]]) &&
|
|
!punct_chars.contains (word->best_choice->unichar_string()[offset])))) {
|
|
gap = box.left () - prev_right;
|
|
if (gap < max_gap)
|
|
gap_stats.add (gap, 1);
|
|
else if (gap == max_gap)
|
|
max_gap_count++;
|
|
else {
|
|
if (max_gap_count > 0)
|
|
gap_stats.add (max_gap, max_gap_count);
|
|
max_gap = gap;
|
|
max_gap_count = 1;
|
|
}
|
|
}
|
|
prev_right = box.right ();
|
|
offset += word->best_choice->unichar_lengths()[i++];
|
|
}
|
|
|
|
max_non_space = (row->space () + 3 * row->kern ()) / 4;
|
|
normalised_max_nonspace = max_non_space * bln_x_height / row->x_height ();
|
|
|
|
result = ((gap_stats.get_total () == 0) ||
|
|
(max_gap <= normalised_max_nonspace) ||
|
|
((gap_stats.get_total () > 2) &&
|
|
(max_gap <= 2 * gap_stats.median ())) ||
|
|
((gap_stats.get_total () <= 2) &&
|
|
(max_gap <= 2 * gap_stats.mean ())));
|
|
#ifndef SECURE_NAMES
|
|
if ((debug_fix_space_level > 1)) {
|
|
if (result)
|
|
tprintf
|
|
("ACCEPT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d total=%d mean=%f median=%f\n",
|
|
word->best_choice->unichar_string().string (), normalised_max_nonspace,
|
|
max_gap, max_gap_count, gap_stats.get_total (), gap_stats.mean (),
|
|
gap_stats.median ());
|
|
else
|
|
tprintf
|
|
("REJECT SPACING FOR: \"%s\" norm_maxnon = %f max=%d maxcount=%d total=%d mean=%f median=%f\n",
|
|
word->best_choice->unichar_string().string (), normalised_max_nonspace,
|
|
max_gap, max_gap_count, gap_stats.get_total (), gap_stats.mean (),
|
|
gap_stats.median ());
|
|
}
|
|
#endif
|
|
|
|
return result;
|
|
}
|
|
|
|
|
|
/**
 * Decides whether a word counts as "done" for spacing purposes.
 *
 * A word already flagged done always qualifies. Otherwise the standard pass 2
 * conditions for mode 5 in set_done() in reject.c are used, BUT the word is
 * NOT rejected for being ambiguous - for spacing we dont care whether we have
 * of/at on/an etc. Concretely, all of the following must hold:
 * - fixsp_done_mode > 0;
 * - the word is tess_accepted, or mode is 2 with no rejected characters,
 *   or mode is 3;
 * - the best choice text contains no space;
 * - the best choice permuter is one of the dawg permuters or NUMBER_PERM.
 */
BOOL8 fixspace_thinks_word_done(WERD_RES *word) {
  if (word->done)
    return TRUE;

  if (!(fixsp_done_mode > 0))
    return FALSE;

  const BOOL8 acceptable =
    (word->tess_accepted ||
     ((fixsp_done_mode == 2) && (word->reject_map.reject_count() == 0)) ||
     (fixsp_done_mode == 3));
  if (!acceptable)
    return FALSE;

  if (strchr(word->best_choice->unichar_string().string(), ' ') != NULL)
    return FALSE;

  if ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
      (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
      (word->best_choice->permuter() == USER_DAWG_PERM) ||
      (word->best_choice->permuter() == NUMBER_PERM))
    return TRUE;

  return FALSE;
}
|
|
|
|
|
|
namespace tesseract {
|
|
/**
|
|
* @name fix_sp_fp_word()
|
|
* Test the current word to see if it can be split by deleting noise blobs. If
|
|
* so, do the business.
|
|
* Return with the iterator pointing to the same place if the word is unchanged,
|
|
* or the last of the replacement words.
|
|
*/
|
|
void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row,
|
|
BLOCK* block) {
|
|
WERD_RES *word_res;
|
|
WERD_RES_LIST sub_word_list;
|
|
WERD_RES_IT sub_word_list_it(&sub_word_list);
|
|
inT16 blob_index;
|
|
inT16 new_length;
|
|
float junk;
|
|
|
|
word_res = word_res_it.data ();
|
|
if (!fixsp_check_for_fp_noise_space ||
|
|
word_res->word->flag (W_REP_CHAR) ||
|
|
word_res->combination ||
|
|
word_res->part_of_combo || !word_res->word->flag (W_DONT_CHOP))
|
|
return;
|
|
|
|
blob_index = worst_noise_blob (word_res, &junk);
|
|
if (blob_index < 0)
|
|
return;
|
|
|
|
#ifndef SECURE_NAMES
|
|
if (debug_fix_space_level > 1) {
|
|
tprintf ("FP fixspace working on \"%s\"\n",
|
|
word_res->best_choice->unichar_string().string());
|
|
}
|
|
#endif
|
|
gblob_sort_list ((PBLOB_LIST *) word_res->word->rej_cblob_list (), FALSE);
|
|
sub_word_list_it.add_after_stay_put (word_res_it.extract ());
|
|
fix_noisy_space_list(sub_word_list, row, block);
|
|
new_length = sub_word_list.length ();
|
|
word_res_it.add_list_before (&sub_word_list);
|
|
for (; (!word_res_it.at_last () && (new_length > 1)); new_length--) {
|
|
word_res_it.forward ();
|
|
}
|
|
}
|
|
|
|
/**
 * Searches for the best way of splitting a single word at its noise blobs.
 *
 * A deep copy of the word is split with break_noisiest_blob_word(), the
 * pieces are re-classified and scored with fp_eval_word_spacing(), and the
 * process repeats. The search stops when a candidate scores PERFECT_WERDS or
 * break_noisiest_blob_word() empties the candidate list.
 *
 * @param[in,out] best_perm on entry a list holding the word to examine; on
 *                exit the highest scoring split found (unchanged if no split
 *                scored higher than the original)
 * @param row  row passed through to the word classifier
 * @param block block passed through to the word classifier
 */
void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
                                     BLOCK* block) {
  inT16 best_score;
  WERD_RES_IT best_perm_it(&best_perm);
  WERD_RES_LIST current_perm;
  WERD_RES_IT current_perm_it(&current_perm);
  WERD_RES *old_word_res;
  WERD_RES *new_word_res;
  inT16 current_score;
  BOOL8 improved = FALSE;

  // default score
  best_score = fp_eval_word_spacing(best_perm);

  dump_words(best_perm, best_score, 1, improved);

  new_word_res = new WERD_RES;
  old_word_res = best_perm_it.data();
  // Kludge to force deep copy
  old_word_res->combination = TRUE;
  *new_word_res = *old_word_res;  // deep copy
  // Undo kludge on both the source and the copy
  old_word_res->combination = FALSE;
  new_word_res->combination = FALSE;
  current_perm_it.add_to_end(new_word_res);

  break_noisiest_blob_word(current_perm);

  while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
    match_current_words(current_perm, row, block);
    current_score = fp_eval_word_spacing(current_perm);
    dump_words(current_perm, current_score, 2, improved);
    if (current_score > best_score) {
      best_perm.clear();
      best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
      best_score = current_score;
      improved = TRUE;
    }
    if (current_score < PERFECT_WERDS)
      break_noisiest_blob_word(current_perm);
  }
  dump_words(best_perm, best_score, 3, improved);
}
|
|
} // namespace tesseract
|
|
|
|
|
|
/**
|
|
* break_noisiest_blob_word()
|
|
* Find the word with the blob which looks like the worst noise.
|
|
* Break the word into two, deleting the noise blob.
|
|
*/
|
|
void break_noisiest_blob_word(WERD_RES_LIST &words) {
|
|
WERD_RES_IT word_it(&words);
|
|
WERD_RES_IT worst_word_it;
|
|
float worst_noise_score = 9999;
|
|
int worst_blob_index = -1; //noisiest blb of noisiest wd
|
|
int blob_index; //of wds noisiest blb
|
|
float noise_score; //of wds noisiest blb
|
|
WERD_RES *word_res;
|
|
C_BLOB_IT blob_it;
|
|
C_BLOB_IT rej_cblob_it;
|
|
C_BLOB_LIST new_blob_list;
|
|
C_BLOB_IT new_blob_it;
|
|
C_BLOB_IT new_rej_cblob_it;
|
|
WERD *new_word;
|
|
inT16 start_of_noise_blob;
|
|
inT16 i;
|
|
|
|
for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
|
|
blob_index = worst_noise_blob (word_it.data (), &noise_score);
|
|
if ((blob_index > -1) && (worst_noise_score > noise_score)) {
|
|
worst_noise_score = noise_score;
|
|
worst_blob_index = blob_index;
|
|
worst_word_it = word_it;
|
|
}
|
|
}
|
|
if (worst_blob_index < 0) {
|
|
words.clear (); //signal termination
|
|
return;
|
|
}
|
|
|
|
/* Now split the worst_word_it */
|
|
|
|
word_res = worst_word_it.data ();
|
|
|
|
/* Move blobs before noise blob to a new bloblist */
|
|
|
|
new_blob_it.set_to_list (&new_blob_list);
|
|
blob_it.set_to_list (word_res->word->cblob_list ());
|
|
for (i = 0; i < worst_blob_index; i++, blob_it.forward ()) {
|
|
new_blob_it.add_after_then_move (blob_it.extract ());
|
|
}
|
|
start_of_noise_blob = blob_it.data ()->bounding_box ().left ();
|
|
delete blob_it.extract (); //throw out noise blb
|
|
|
|
new_word = new WERD (&new_blob_list, word_res->word);
|
|
new_word->set_flag (W_EOL, FALSE);
|
|
word_res->word->set_flag (W_BOL, FALSE);
|
|
word_res->word->set_blanks (1);//After break
|
|
|
|
new_rej_cblob_it.set_to_list (new_word->rej_cblob_list ());
|
|
rej_cblob_it.set_to_list (word_res->word->rej_cblob_list ());
|
|
for (;
|
|
(!rej_cblob_it.empty () &&
|
|
(rej_cblob_it.data ()->bounding_box ().left () <
|
|
start_of_noise_blob)); rej_cblob_it.forward ()) {
|
|
new_rej_cblob_it.add_after_then_move (rej_cblob_it.extract ());
|
|
}
|
|
|
|
worst_word_it.add_before_then_move (new WERD_RES (new_word));
|
|
|
|
word_res->done = FALSE;
|
|
if (word_res->outword != NULL) {
|
|
delete word_res->outword;
|
|
delete word_res->best_choice;
|
|
delete word_res->raw_choice;
|
|
word_res->outword = NULL;
|
|
word_res->best_choice = NULL;
|
|
word_res->raw_choice = NULL;
|
|
}
|
|
}
|
|
|
|
|
|
/**
 * Finds the blob of a word that most looks like noise.
 *
 * Every blob of the normalised outword gets a score: accepted characters are
 * pinned at the non-noise limit (0.8 * bln_x_height), the rest are scored by
 * blob_noise_score(). Only blobs with at least fixsp_non_noise_limit
 * non-noise blobs on each side are candidates, and a candidate must score
 * below bln_x_height * fixsp_small_outlines_size.
 *
 * @param word_res word to examine (at most 512 blobs; asserted)
 * @param[out] worst_noise_score score of the returned blob. It is only
 *             written once the candidate range has been established, so it is
 *             not set on the early -1 returns.
 * @return index of the noisiest candidate blob, or -1 if the word has fewer
 *         than 5 blobs or no blob qualifies.
 */
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) {
  PBLOB_IT blob_it;
  float noise_score[512];
  int i;
  int non_noise_count;
  float small_limit = bln_x_height * fixsp_small_outlines_size;
  float non_noise_limit = bln_x_height * 0.8;

  // normalised
  blob_it.set_to_list(word_res->outword->blob_list());
  inT16 blob_count = blob_it.length();
  ASSERT_HOST(blob_count <= 512);
  if (blob_count < 5)
    return -1;                     // too short to split

  /* Get the noise scores for all blobs */

#ifndef SECURE_NAMES
  if (debug_fix_space_level > 5)
    tprintf("FP fixspace Noise metrics for \"%s\": ",
            word_res->best_choice->unichar_string().string());
#endif

  for (i = 0; i < blob_count; i++, blob_it.forward()) {
    noise_score[i] = word_res->reject_map[i].accepted()
                       ? non_noise_limit
                       : blob_noise_score(blob_it.data());

    if (debug_fix_space_level > 5)
      tprintf("%1.1f ", noise_score[i]);
  }
  if (debug_fix_space_level > 5)
    tprintf("\n");

  /* Now find the worst one which is far enough away from the end of the word */

  // Skip in from the left past the required number of non-noise blobs.
  non_noise_count = 0;
  i = 0;
  while ((i < blob_count) && (non_noise_count < fixsp_non_noise_limit)) {
    if (noise_score[i] >= non_noise_limit)
      non_noise_count++;
    i++;
  }
  if (non_noise_count < fixsp_non_noise_limit)
    return -1;
  int first_candidate = i;         // 1st contender

  // Likewise in from the right.
  non_noise_count = 0;
  i = blob_count - 1;
  while ((i >= 0) && (non_noise_count < fixsp_non_noise_limit)) {
    if (noise_score[i] >= non_noise_limit)
      non_noise_count++;
    i--;
  }
  if (non_noise_count < fixsp_non_noise_limit)
    return -1;
  int last_candidate = i;          // last contender

  if (first_candidate > last_candidate)
    return -1;

  // Lowest score strictly below small_limit wins; earliest blob on ties.
  *worst_noise_score = small_limit;
  int worst_index = -1;
  for (i = first_candidate; i <= last_candidate; i++) {
    if (noise_score[i] < *worst_noise_score) {
      worst_index = i;
      *worst_noise_score = noise_score[i];
    }
  }
  return worst_index;
}
|
|
|
|
|
|
/**
 * Scores how noise-like a blob is; smaller means more like noise.
 *
 * The base score is the largest width or height found among the bounding
 * boxes of the blob's outlines. With fixsp_noise_score_fixing set, the score
 * is doubled for blobs with more than 5 outlines and halved for blobs whose
 * bounding box lies wholly above bln_baseline_offset * 4 or wholly below
 * bln_baseline_offset / 2.
 */
float blob_noise_score(PBLOB *blob) {
  OUTLINE_IT outline_it;
  TBOX box;                        // BB of outline, later of the blob
  inT16 outline_count = 0;
  inT16 largest_outline_dimension = 0;

  outline_it.set_to_list(blob->out_list());
  for (outline_it.mark_cycle_pt();
       !outline_it.cycled_list(); outline_it.forward()) {
    outline_count++;
    box = outline_it.data()->bounding_box();
    inT16 longer_side =
      (box.height() > box.width()) ? box.height() : box.width();
    if (largest_outline_dimension < longer_side)
      largest_outline_dimension = longer_side;
  }

  if (!fixsp_noise_score_fixing)
    return largest_outline_dimension;

  if (outline_count > 5)
    largest_outline_dimension *= 2;  // penalise LOTS of blobs

  box = blob->bounding_box();
  if ((box.bottom() > bln_baseline_offset * 4) ||
      (box.top() < bln_baseline_offset / 2))
    largest_outline_dimension /= 2;  // Lax blob is if high or low

  return largest_outline_dimension;
}
|
|
|
|
|
|
/**
 * Debug dump of a word: its bounding box and, unless SECURE_NAMES is defined,
 * its best choice text, blob counts, reject map and accepted/done flags.
 * The per-character reject map detail is switched off by a local constant.
 */
void fixspace_dbg(WERD_RES *word) {
  TBOX box = word->word->bounding_box();
  BOOL8 show_map_detail = FALSE;   // flip to TRUE for per-char map dumps

  box.print();
#ifndef SECURE_NAMES
  tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
  tprintf("Blob count: %d (word); %d/%d (outword)\n",
          word->word->gblob_list()->length(),
          word->outword->gblob_list()->length(),
          word->outword->rej_blob_list()->length());
  word->reject_map.print(debug_fp);
  tprintf("\n");

  if (show_map_detail) {
    tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
    inT16 pos = 0;
    while (word->best_choice->unichar_string()[pos] != '\0') {
      tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[pos]);
      word->reject_map[pos].full_print(debug_fp);
      pos++;
    }
  }

  tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
  tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
#endif
}
|
|
|
|
|
|
/**
|
|
* fp_eval_word_spacing()
|
|
* Evaluation function for fixed pitch word lists.
|
|
*
|
|
* Basically, count the number of "nice" characters - those which are in tess
|
|
* acceptable words or in dict words and are not rejected.
|
|
* Penalise any potential noise chars
|
|
*/
|
|
namespace tesseract {
|
|
inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
|
|
WERD_RES_IT word_it(&word_res_list);
|
|
WERD_RES *word;
|
|
PBLOB_IT blob_it;
|
|
inT16 word_length;
|
|
inT16 score = 0;
|
|
inT16 i;
|
|
float small_limit = bln_x_height * fixsp_small_outlines_size;
|
|
|
|
if (!fixsp_fp_eval)
|
|
return (eval_word_spacing (word_res_list));
|
|
|
|
for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
|
|
word = word_it.data ();
|
|
word_length = word->reject_map.length();
|
|
if ((word->done ||
|
|
word->tess_accepted) ||
|
|
(word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
|
|
(word->best_choice->permuter() == FREQ_DAWG_PERM) ||
|
|
(word->best_choice->permuter() == USER_DAWG_PERM) ||
|
|
(safe_dict_word(*(word->best_choice)) > 0)) {
|
|
blob_it.set_to_list(word->outword->blob_list());
|
|
UNICHAR_ID space = getDict().getUnicharset().unichar_to_id(" ");
|
|
for (i = 0; i < word->best_choice->length(); ++i, blob_it.forward()) {
|
|
if (word->best_choice->unichar_id(i) == space ||
|
|
(blob_noise_score(blob_it.data()) < small_limit)) {
|
|
score -= 1; // penalise possibly erroneous non-space
|
|
} else if (word->reject_map[i].accepted()) {
|
|
score++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (score < 0)
|
|
score = 0;
|
|
return score;
|
|
}
|
|
} // namespace tesseract
|