mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 14:41:36 +08:00
4ddb3e5941
During our initial chopping for each word, pay attention to whether a dangerous ambiguity (like rn <-> m) would lead us to a dictionary word. If so, make sure that blob gets chopped so that we can evaluate said dictionary word during the segmentation search. Large accuracy improvement, especially on English printed books (~9%). git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@713 d0cd1f9f-072b-0410-8dd7-cf729c803f20
1055 lines
34 KiB
C++
1055 lines
34 KiB
C++
/* -*-C-*-
|
|
********************************************************************************
|
|
*
|
|
* File: chopper.c (Formerly chopper.c)
|
|
* Description:
|
|
* Author: Mark Seaman, OCR Technology
|
|
* Created: Fri Oct 16 14:37:00 1987
|
|
* Modified: Tue Jul 30 16:18:52 1991 (Mark Seaman) marks@hpgrlt
|
|
* Language: C
|
|
* Package: N/A
|
|
* Status: Reusable Software Component
|
|
*
|
|
* (c) Copyright 1987, Hewlett-Packard Company.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
*
|
|
**************************************************************************/
|
|
|
|
/*----------------------------------------------------------------------
|
|
I n c l u d e s
|
|
----------------------------------------------------------------------*/
|
|
|
|
#include <math.h>
|
|
|
|
#include "chopper.h"
|
|
|
|
#include "assert.h"
|
|
#include "associate.h"
|
|
#include "callcpp.h"
|
|
#include "const.h"
|
|
#include "findseam.h"
|
|
#include "freelist.h"
|
|
#include "globals.h"
|
|
#include "makechop.h"
|
|
#include "render.h"
|
|
#include "pageres.h"
|
|
#include "permute.h"
|
|
#include "seam.h"
|
|
#include "stopper.h"
|
|
#include "structures.h"
|
|
#include "unicharset.h"
|
|
#include "wordclass.h"
|
|
#include "wordrec.h"
|
|
|
|
// Include automatically generated configuration file if running autoconf.
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "config_auto.h"
|
|
#endif
|
|
|
|
/*----------------------------------------------------------------------
|
|
F u n c t i o n s
|
|
----------------------------------------------------------------------*/
|
|
/**
|
|
* @name preserve_outline_tree
|
|
*
|
|
* Copy the list of outlines.
|
|
*/
|
|
void preserve_outline(EDGEPT *start) {
|
|
EDGEPT *srcpt;
|
|
|
|
if (start == NULL)
|
|
return;
|
|
srcpt = start;
|
|
do {
|
|
srcpt->flags[1] = 1;
|
|
srcpt = srcpt->next;
|
|
}
|
|
while (srcpt != start);
|
|
srcpt->flags[1] = 2;
|
|
}
|
|
|
|
|
|
/**************************************************************************/
|
|
void preserve_outline_tree(TESSLINE *srcline) {
|
|
TESSLINE *outline;
|
|
|
|
for (outline = srcline; outline != NULL; outline = outline->next) {
|
|
preserve_outline (outline->loop);
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* @name restore_outline_tree
|
|
*
|
|
* Copy the list of outlines.
|
|
*/
|
|
EDGEPT *restore_outline(EDGEPT *start) {
|
|
EDGEPT *srcpt;
|
|
EDGEPT *real_start;
|
|
EDGEPT *deadpt;
|
|
|
|
if (start == NULL)
|
|
return NULL;
|
|
srcpt = start;
|
|
do {
|
|
if (srcpt->flags[1] == 2)
|
|
break;
|
|
srcpt = srcpt->next;
|
|
}
|
|
while (srcpt != start);
|
|
real_start = srcpt;
|
|
do {
|
|
if (srcpt->flags[1] == 0) {
|
|
deadpt = srcpt;
|
|
srcpt = srcpt->next;
|
|
srcpt->prev = deadpt->prev;
|
|
deadpt->prev->next = srcpt;
|
|
deadpt->prev->vec.x = srcpt->pos.x - deadpt->prev->pos.x;
|
|
deadpt->prev->vec.y = srcpt->pos.y - deadpt->prev->pos.y;
|
|
delete deadpt;
|
|
}
|
|
else
|
|
srcpt = srcpt->next;
|
|
}
|
|
while (srcpt != real_start);
|
|
return real_start;
|
|
}
|
|
|
|
|
|
/******************************************************************************/
|
|
void restore_outline_tree(TESSLINE *srcline) {
|
|
TESSLINE *outline;
|
|
|
|
for (outline = srcline; outline != NULL; outline = outline->next) {
|
|
outline->loop = restore_outline (outline->loop);
|
|
outline->start = outline->loop->pos;
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* @name attempt_blob_chop
|
|
*
|
|
* Try to split this blob in two, linking the new blob in after it. Check to make sure that
|
|
* it was successful.
|
|
*/
|
|
namespace tesseract {
|
|
// Tries to chop blob in two. A new TBLOB is linked in directly after blob,
// a seam is chosen and applied, and the result is validated.
//
//   word         word the blob belongs to (latin_script and blobs are read)
//   blob         blob to chop
//   blob_number  index of blob, forwarded to test_insert_seam()
//   italic_blob  forwarded to divisible_blob() and apply_seam()
//   seam_list    existing seams, consulted by any_shared_split_points()
//                and test_insert_seam()
//
// Returns the applied seam on success. On failure the seam (if any) is
// undone and deleted, blob->next is reset to its previous value, the
// outlines are restored when repair_unchopped_blobs is set, and NULL is
// returned.
SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number,
                                 bool italic_blob, SEAMS seam_list) {
  TBLOB *const saved_next = blob->next;

  if (repair_unchopped_blobs)
    preserve_outline_tree(blob->outlines);

  // Link an empty blob in after the one being chopped.
  TBLOB *new_blob = new TBLOB;
  new_blob->next = saved_next;
  new_blob->outlines = NULL;
  blob->next = new_blob;

  // Seam selection: optional division first, then the regular seam picker,
  // then (latin script only) division as a last resort.
  SEAM *seam = NULL;
  TPOINT location;
  if (prioritize_division && divisible_blob(blob, italic_blob, &location))
    seam = new_seam(0.0f, location, NULL, NULL, NULL);
  if (seam == NULL)
    seam = pick_good_seam(blob);
  if (seam == NULL && word->latin_script &&
      divisible_blob(blob, italic_blob, &location)) {
    // The blob can simply be divided into outlines.
    seam = new_seam(0.0f, location, NULL, NULL, NULL);
  }

  if (chop_debug) {
    if (seam == NULL)
      cprintf ("\n** no seam picked *** \n");
    else
      print_seam ("Good seam picked=", seam);
  }

  if (seam != NULL)
    apply_seam(blob, new_blob, italic_blob, seam);

  // Checks run in this order and stop at the first one that fails.
  const bool chop_ok =
      seam != NULL &&
      blob->outlines != NULL &&
      new_blob->outlines != NULL &&
      !total_containment(blob, new_blob) &&
      !check_blob(new_blob) &&
      check_seam_order(blob, seam) &&
      check_seam_order(new_blob, seam) &&
      !any_shared_split_points(seam_list, seam) &&
      test_insert_seam(seam_list, blob_number, blob, word->blobs);
  if (chop_ok)
    return seam;

  // Roll everything back.
  blob->next = saved_next;
  if (seam == NULL) {
    delete new_blob;
  } else {
    undo_seam(blob, new_blob, seam);
    delete_seam(seam);
#ifndef GRAPHICS_DISABLED
    if (chop_debug) {
      if (chop_debug > 2)
        display_blob(blob, Red);
      cprintf ("\n** seam being removed ** \n");
    }
#endif
  }

  if (repair_unchopped_blobs)
    restore_outline_tree(blob->outlines);
  return NULL;
}
|
|
|
|
|
|
// Walks blob_number links into word->blobs and tries to chop the blob
// found there; returns whatever attempt_blob_chop() returns.
SEAM *Wordrec::chop_numbered_blob(TWERD *word, inT32 blob_number,
                                  bool italic_blob, SEAMS seam_list) {
  TBLOB *target = word->blobs;
  inT16 steps = 0;
  while (steps < blob_number) {
    target = target->next;
    ++steps;
  }
  return attempt_blob_chop(word, target, blob_number, italic_blob, seam_list);
}
|
|
|
|
|
|
// Scans the blobs of word_res->chopped_word from the front and chops the
// first one for which a chop succeeds. A blob is a candidate when
// divisible_blob() accepts it, or when its denormalized bounding box
// overlaps more than one of the given boxes (overlap_fraction > 0.125)
// without being almost_equal (tolerance 3) to any of them.
//
// On success returns the seam and leaves the index of the chopped blob in
// *blob_number; otherwise sets *blob_number to -1 and returns NULL.
SEAM *Wordrec::chop_overlapping_blob(const GenericVector<TBOX>& boxes,
                                     WERD_RES *word_res, inT32 *blob_number,
                                     bool italic_blob, SEAMS seam_list) {
  TWERD *word = word_res->chopped_word;

  *blob_number = 0;
  for (TBLOB *blob = word->blobs; blob != NULL;
       *blob_number = *blob_number + 1, blob = blob->next) {
    // Map the corners of the blob's box through the denorm transform.
    TBOX norm_box = blob->bounding_box();
    TPOINT topleft, botright;
    topleft.x = norm_box.left();
    topleft.y = norm_box.top();
    botright.x = norm_box.right();
    botright.y = norm_box.bottom();

    TPOINT original_topleft, original_botright;
    word_res->denorm.DenormTransform(topleft, &original_topleft);
    word_res->denorm.DenormTransform(botright, &original_botright);

    TBOX original_box(original_topleft.x, original_botright.y,
                      original_botright.x, original_topleft.y);

    // Compare against every supplied box.
    int num_overlap = 0;
    bool almost_equal_box = false;
    for (int i = 0; i < boxes.size(); i++) {
      const TBOX &candidate = boxes[i];
      if (original_box.overlap_fraction(candidate) > 0.125)
        num_overlap++;
      if (original_box.almost_equal(candidate, 3))
        almost_equal_box = true;
    }

    TPOINT location;
    bool wants_chop = divisible_blob(blob, italic_blob, &location);
    if (!wants_chop)
      wants_chop = !almost_equal_box && num_overlap > 1;
    if (!wants_chop)
      continue;

    SEAM *seam = attempt_blob_chop(word, blob, *blob_number,
                                   italic_blob, seam_list);
    if (seam != NULL)
      return seam;
  }

  *blob_number = -1;
  return NULL;
}
|
|
|
|
} // namespace tesseract
|
|
|
|
|
|
/**
|
|
* @name any_shared_split_points
|
|
*
|
|
* Return true if any of the splits share a point with this one.
|
|
*/
|
|
int any_shared_split_points(SEAMS seam_list, SEAM *seam) {
|
|
int length;
|
|
int index;
|
|
|
|
length = array_count (seam_list);
|
|
for (index = 0; index < length; index++)
|
|
if (shared_split_points ((SEAM *) array_value (seam_list, index), seam))
|
|
return TRUE;
|
|
return FALSE;
|
|
}
|
|
|
|
|
|
/**
|
|
* @name check_blob
|
|
*
|
|
* @return true if blob has a non whole outline.
|
|
*/
|
|
int check_blob(TBLOB *blob) {
|
|
TESSLINE *outline;
|
|
EDGEPT *edgept;
|
|
|
|
for (outline = blob->outlines; outline != NULL; outline = outline->next) {
|
|
edgept = outline->loop;
|
|
do {
|
|
if (edgept == NULL)
|
|
break;
|
|
edgept = edgept->next;
|
|
}
|
|
while (edgept != outline->loop);
|
|
if (edgept == NULL)
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
|
|
namespace tesseract {
|
|
/**
|
|
* @name improve_one_blob
|
|
*
|
|
* Start with the current word of blobs and its classification. Find
|
|
* the worst blobs and try to divide it up to improve the ratings.
|
|
*/
|
|
bool Wordrec::improve_one_blob(WERD_RES *word_res,
|
|
BLOB_CHOICE_LIST_VECTOR *char_choices,
|
|
inT32 *blob_number,
|
|
SEAMS *seam_list,
|
|
DANGERR *fixpt,
|
|
bool split_next_to_fragment,
|
|
BlamerBundle *blamer_bundle) {
|
|
TWERD* word = word_res->chopped_word;
|
|
TBLOB *blob;
|
|
inT16 x = 0;
|
|
float rating_ceiling = MAX_FLOAT32;
|
|
BLOB_CHOICE_LIST *answer;
|
|
BLOB_CHOICE_IT answer_it;
|
|
SEAM *seam;
|
|
|
|
do {
|
|
*blob_number = select_blob_to_split_from_fixpt(fixpt);
|
|
bool split_point_from_dict = (*blob_number != -1);
|
|
if (split_point_from_dict) {
|
|
fixpt->clear();
|
|
} else {
|
|
*blob_number = select_blob_to_split(*char_choices, rating_ceiling,
|
|
split_next_to_fragment);
|
|
}
|
|
if (chop_debug)
|
|
cprintf("blob_number = %d\n", *blob_number);
|
|
if (*blob_number == -1)
|
|
return false;
|
|
|
|
// TODO(rays) it may eventually help to allow italic_blob to be true,
|
|
seam = chop_numbered_blob(word, *blob_number, false, *seam_list);
|
|
if (seam != NULL)
|
|
break;
|
|
/* Must split null blobs */
|
|
answer = char_choices->get(*blob_number);
|
|
if (answer == NULL)
|
|
return false;
|
|
answer_it.set_to_list(answer);
|
|
if (!split_point_from_dict) {
|
|
// We chopped the worst rated blob, try something else next time.
|
|
rating_ceiling = answer_it.data()->rating();
|
|
}
|
|
} while (true);
|
|
/* Split OK */
|
|
for (blob = word->blobs; x < *blob_number; x++) {
|
|
blob = blob->next;
|
|
}
|
|
|
|
*seam_list =
|
|
insert_seam (*seam_list, *blob_number, seam, blob, word->blobs);
|
|
|
|
delete char_choices->get(*blob_number);
|
|
|
|
answer = classify_blob(blob, word_res->denorm, "improve 1:", Red,
|
|
blamer_bundle);
|
|
char_choices->insert(answer, *blob_number);
|
|
|
|
answer = classify_blob(blob->next, word_res->denorm, "improve 2:", Yellow,
|
|
blamer_bundle);
|
|
char_choices->set(answer, *blob_number + 1);
|
|
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* @name modify_blob_choice
|
|
*
|
|
* Takes a blob and its chop index, converts that chop index to a
|
|
* unichar_id, and stores the chop index in place of the blob's
|
|
* original unichar_id.
|
|
*/
|
|
void Wordrec::modify_blob_choice(BLOB_CHOICE_LIST *answer,
|
|
int chop_index) {
|
|
char chop_index_string[2];
|
|
if (chop_index <= 9) {
|
|
snprintf(chop_index_string, sizeof(chop_index_string), "%d", chop_index);
|
|
} else {
|
|
chop_index_string[0] = static_cast<char>('A' - 10 + chop_index);
|
|
chop_index_string[1] = '\0';
|
|
}
|
|
UNICHAR_ID unichar_id = unicharset.unichar_to_id(chop_index_string);
|
|
if (unichar_id == INVALID_UNICHAR_ID) {
|
|
// If the word is very long, we might exhaust the possibilities.
|
|
unichar_id = 1;
|
|
}
|
|
BLOB_CHOICE_IT answer_it(answer);
|
|
BLOB_CHOICE *modified_blob =
|
|
new BLOB_CHOICE(unichar_id,
|
|
answer_it.data()->rating(),
|
|
answer_it.data()->certainty(),
|
|
answer_it.data()->fontinfo_id(),
|
|
answer_it.data()->fontinfo_id2(),
|
|
answer_it.data()->script_id(),
|
|
answer_it.data()->min_xheight(),
|
|
answer_it.data()->max_xheight(),
|
|
answer_it.data()->adapted());
|
|
answer->clear();
|
|
answer_it.set_to_list(answer);
|
|
answer_it.add_after_then_move(modified_blob);
|
|
}
|
|
|
|
|
|
/**
|
|
* @name chop_one_blob
|
|
*
|
|
* Start with the current one-blob word and its classification. Find
|
|
* the worst blobs and try to divide it up to improve the ratings.
|
|
* Used for testing chopper.
|
|
*/
|
|
bool Wordrec::chop_one_blob(TWERD *word,
|
|
BLOB_CHOICE_LIST_VECTOR *char_choices,
|
|
inT32 *blob_number,
|
|
SEAMS *seam_list,
|
|
int *right_chop_index) {
|
|
TBLOB *blob;
|
|
inT16 x = 0;
|
|
float rating_ceiling = MAX_FLOAT32;
|
|
BLOB_CHOICE_LIST *answer;
|
|
BLOB_CHOICE_IT answer_it;
|
|
SEAM *seam;
|
|
UNICHAR_ID unichar_id = 0;
|
|
int left_chop_index = 0;
|
|
|
|
do {
|
|
*blob_number = select_blob_to_split(*char_choices, rating_ceiling, false);
|
|
if (chop_debug)
|
|
cprintf("blob_number = %d\n", *blob_number);
|
|
if (*blob_number == -1)
|
|
return false;
|
|
seam = chop_numbered_blob(word, *blob_number, true, *seam_list);
|
|
if (seam != NULL)
|
|
break;
|
|
/* Must split null blobs */
|
|
answer = char_choices->get(*blob_number);
|
|
if (answer == NULL)
|
|
return false;
|
|
answer_it.set_to_list(answer);
|
|
rating_ceiling = answer_it.data()->rating(); // try a different blob
|
|
} while (true);
|
|
/* Split OK */
|
|
for (blob = word->blobs; x < *blob_number; x++) {
|
|
blob = blob->next;
|
|
}
|
|
if (chop_debug) {
|
|
tprintf("Chop made blob1:");
|
|
blob->bounding_box().print();
|
|
tprintf("and blob2:");
|
|
blob->next->bounding_box().print();
|
|
}
|
|
*seam_list = insert_seam(*seam_list, *blob_number, seam, blob, word->blobs);
|
|
|
|
answer = char_choices->get(*blob_number);
|
|
answer_it.set_to_list(answer);
|
|
unichar_id = answer_it.data()->unichar_id();
|
|
float rating = answer_it.data()->rating() / exp(1.0);
|
|
left_chop_index = atoi(unicharset.id_to_unichar(unichar_id));
|
|
|
|
delete char_choices->get(*blob_number);
|
|
// combine confidence w/ serial #
|
|
answer = fake_classify_blob(0, rating, -rating);
|
|
modify_blob_choice(answer, left_chop_index);
|
|
char_choices->insert(answer, *blob_number);
|
|
|
|
answer = fake_classify_blob(0, rating - 0.125f, -rating);
|
|
modify_blob_choice(answer, ++*right_chop_index);
|
|
char_choices->set(answer, *blob_number + 1);
|
|
return true;
|
|
}
|
|
|
|
|
|
bool Wordrec::chop_one_blob2(const GenericVector<TBOX>& boxes,
|
|
WERD_RES *word_res,
|
|
SEAMS *seam_list) {
|
|
inT32 blob_number;
|
|
inT16 x = 0;
|
|
TBLOB *blob;
|
|
SEAM *seam;
|
|
|
|
seam = chop_overlapping_blob(boxes, word_res, &blob_number,
|
|
true, *seam_list);
|
|
if (seam == NULL)
|
|
return false;
|
|
|
|
/* Split OK */
|
|
for (blob = word_res->chopped_word->blobs; x < blob_number; x++) {
|
|
blob = blob->next;
|
|
}
|
|
if (chop_debug) {
|
|
tprintf("Chop made blob1:");
|
|
blob->bounding_box().print();
|
|
tprintf("and blob2:");
|
|
blob->next->bounding_box().print();
|
|
}
|
|
*seam_list = insert_seam(*seam_list, blob_number, seam, blob,
|
|
word_res->chopped_word->blobs);
|
|
return true;
|
|
}
|
|
} // namespace tesseract
|
|
|
|
/**
|
|
* @name check_seam_order
|
|
*
|
|
* Make sure that each of the splits in this seam match to outlines
|
|
* in this blob. If any of the splits could not correspond to this
|
|
* blob then there is a problem (and FALSE should be returned to the
|
|
* caller).
|
|
*/
|
|
inT16 check_seam_order(TBLOB *blob, SEAM *seam) {
|
|
TESSLINE *outline;
|
|
TESSLINE *last_outline;
|
|
inT8 found_em[3];
|
|
|
|
if (seam->split1 == NULL || seam->split1 == NULL || blob == NULL)
|
|
return (TRUE);
|
|
|
|
found_em[0] = found_em[1] = found_em[2] = FALSE;
|
|
|
|
for (outline = blob->outlines; outline; outline = outline->next) {
|
|
if (!found_em[0] &&
|
|
((seam->split1 == NULL) ||
|
|
is_split_outline (outline, seam->split1))) {
|
|
found_em[0] = TRUE;
|
|
}
|
|
if (!found_em[1] &&
|
|
((seam->split2 == NULL) ||
|
|
is_split_outline (outline, seam->split2))) {
|
|
found_em[1] = TRUE;
|
|
}
|
|
if (!found_em[2] &&
|
|
((seam->split3 == NULL) ||
|
|
is_split_outline (outline, seam->split3))) {
|
|
found_em[2] = TRUE;
|
|
}
|
|
last_outline = outline;
|
|
}
|
|
|
|
if (!found_em[0] || !found_em[1] || !found_em[2])
|
|
return (FALSE);
|
|
else
|
|
return (TRUE);
|
|
}
|
|
|
|
namespace tesseract {
|
|
/**
|
|
* @name chop_word_main
|
|
*
|
|
* Classify the blobs in this word and permute the results. Find the
|
|
* worst blob in the word and chop it up. Continue this process until
|
|
* a good answer has been found or all the blobs have been chopped up
|
|
* enough. Return the word level ratings.
|
|
*/
|
|
BLOB_CHOICE_LIST_VECTOR *Wordrec::chop_word_main(WERD_RES *word) {
|
|
TBLOB *blob;
|
|
int index;
|
|
int did_chopping;
|
|
STATE state;
|
|
BLOB_CHOICE_LIST *match_result;
|
|
MATRIX *ratings = NULL;
|
|
DANGERR fixpt; /*dangerous ambig */
|
|
inT32 bit_count; //no of bits
|
|
|
|
BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
|
|
BLOB_CHOICE_LIST_VECTOR *best_char_choices = new BLOB_CHOICE_LIST_VECTOR();
|
|
|
|
did_chopping = 0;
|
|
for (blob = word->chopped_word->blobs, index = 0;
|
|
blob != NULL; blob = blob->next, index++) {
|
|
match_result = classify_blob(blob, word->denorm, "chop_word:", Green,
|
|
word->blamer_bundle);
|
|
if (match_result == NULL)
|
|
cprintf("Null classifier output!\n");
|
|
*char_choices += match_result;
|
|
}
|
|
bit_count = index - 1;
|
|
set_n_ones(&state, char_choices->length() - 1);
|
|
bool acceptable = false;
|
|
bool replaced = false;
|
|
bool best_choice_updated =
|
|
getDict().permute_characters(*char_choices, word->best_choice,
|
|
word->raw_choice);
|
|
if (best_choice_updated &&
|
|
getDict().AcceptableChoice(char_choices, word->best_choice, &fixpt,
|
|
CHOPPER_CALLER, &replaced)) {
|
|
acceptable = true;
|
|
}
|
|
if (replaced)
|
|
update_blob_classifications(word->chopped_word, *char_choices);
|
|
CopyCharChoices(*char_choices, best_char_choices);
|
|
if (!acceptable) { // do more work to find a better choice
|
|
did_chopping = 1;
|
|
|
|
bool best_choice_acceptable = false;
|
|
if (chop_enable)
|
|
improve_by_chopping(word,
|
|
char_choices,
|
|
&state,
|
|
best_char_choices,
|
|
&fixpt,
|
|
&best_choice_acceptable);
|
|
if (chop_debug)
|
|
print_seams ("Final seam list:", word->seam_array);
|
|
|
|
if (word->blamer_bundle != NULL &&
|
|
!ChoiceIsCorrect(*word->uch_set, word->best_choice,
|
|
word->blamer_bundle->truth_text)) {
|
|
set_chopper_blame(word);
|
|
}
|
|
|
|
// The force_word_assoc is almost redundant to enable_assoc. However,
|
|
// it is not conditioned on the dict behavior. For CJK, we need to force
|
|
// the associator to be invoked. When we figure out the exact behavior
|
|
// of dict on CJK, we can remove the flag if it turns out to be redundant.
|
|
if ((wordrec_enable_assoc && !best_choice_acceptable) || force_word_assoc) {
|
|
ratings = word_associator(false, word, &state, best_char_choices,
|
|
&fixpt, &state);
|
|
}
|
|
}
|
|
best_char_choices = rebuild_current_state(word, &state, best_char_choices,
|
|
ratings);
|
|
|
|
// If after running only the chopper best_choice is incorrect and no blame
|
|
// has been yet set, blame the classifier if best_choice is classifier's
|
|
// top choice and is a dictionary word (i.e. language model could not have
|
|
// helped). Otherwise blame the tradeoff between the classifier and
|
|
// the old language model (permuters).
|
|
if (word->blamer_bundle != NULL &&
|
|
word->blamer_bundle->incorrect_result_reason == IRR_CORRECT &&
|
|
ratings == NULL && // only the chopper was run
|
|
!ChoiceIsCorrect(*word->uch_set, word->best_choice,
|
|
word->blamer_bundle->truth_text)) {
|
|
if (word->best_choice != NULL &&
|
|
Dict::valid_word_permuter(word->best_choice->permuter(), false)) {
|
|
// Find out whether best choice is a top choice.
|
|
word->blamer_bundle->best_choice_is_dict_and_top_choice = true;
|
|
for (int i = 0; i < word->best_choice->length(); ++i) {
|
|
BLOB_CHOICE_IT blob_choice_it(best_char_choices->get(i));
|
|
ASSERT_HOST(!blob_choice_it.empty());
|
|
BLOB_CHOICE *first_choice = NULL;
|
|
for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
|
|
blob_choice_it.forward()) { // find first non-fragment choice
|
|
if (!(getDict().getUnicharset().get_fragment(
|
|
blob_choice_it.data()->unichar_id()))) {
|
|
first_choice = blob_choice_it.data();
|
|
break;
|
|
}
|
|
}
|
|
ASSERT_HOST(first_choice != NULL);
|
|
if (first_choice->unichar_id() != word->best_choice->unichar_id(i)) {
|
|
word->blamer_bundle->best_choice_is_dict_and_top_choice = false;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
STRING debug;
|
|
if (word->blamer_bundle->best_choice_is_dict_and_top_choice) {
|
|
debug = "Best choice is: incorrect, top choice, dictionary word";
|
|
debug += " with permuter ";
|
|
debug += word->best_choice->permuter_name();
|
|
} else {
|
|
debug = "Classifier/Old LM tradeoff is to blame";
|
|
}
|
|
word->blamer_bundle->SetBlame(
|
|
word->blamer_bundle->best_choice_is_dict_and_top_choice ?
|
|
IRR_CLASSIFIER : IRR_CLASS_OLD_LM_TRADEOFF,
|
|
debug, word->best_choice, wordrec_debug_blamer);
|
|
}
|
|
|
|
if (word->blamer_bundle != NULL && this->fill_lattice_ != NULL) {
|
|
if (ratings == NULL) {
|
|
ratings = word_associator(true, word, NULL, NULL, NULL, NULL);
|
|
}
|
|
CallFillLattice(*ratings, getDict().getBestChoices(),
|
|
*word->uch_set, word->blamer_bundle);
|
|
}
|
|
if (ratings != NULL) {
|
|
if (wordrec_debug_level > 0) {
|
|
tprintf("Final Ratings Matrix:\n");
|
|
ratings->print(getDict().getUnicharset());
|
|
}
|
|
ratings->delete_matrix_pointers();
|
|
delete ratings;
|
|
}
|
|
getDict().FilterWordChoices();
|
|
// TODO(antonova, eger): check that FilterWordChoices() does not filter
|
|
// out anything useful for word bigram or phrase search.
|
|
// TODO(antonova, eger): when implementing word bigram and phrase search
|
|
// we will need to think carefully about how to replace a word with its
|
|
// alternative choice.
|
|
// In particular it might be required to save the segmentation state
|
|
// associated with the word, so that best_char_choices could be updated
|
|
// by rebuild_current_state() correctly.
|
|
if (save_alt_choices) SaveAltChoices(getDict().getBestChoices(), word);
|
|
char_choices->delete_data_pointers();
|
|
delete char_choices;
|
|
|
|
return best_char_choices;
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
* @name improve_by_chopping
|
|
*
|
|
* Start with the current word of blobs and its classification. Find
|
|
* the worst blobs and try to divide them up to improve the ratings.
|
|
* As long as ratings are produced by the new blob splitting. When
|
|
* all the splitting has been accomplished all the ratings memory is
|
|
* reclaimed.
|
|
*/
|
|
void Wordrec::improve_by_chopping(WERD_RES *word,
|
|
BLOB_CHOICE_LIST_VECTOR *char_choices,
|
|
STATE *best_state,
|
|
BLOB_CHOICE_LIST_VECTOR *best_char_choices,
|
|
DANGERR *fixpt,
|
|
bool *best_choice_acceptable) {
|
|
inT32 blob_number;
|
|
float old_best;
|
|
bool updated_best_choice = false;
|
|
|
|
while (1) { // improvement loop
|
|
old_best = word->best_choice->rating();
|
|
if (improve_one_blob(word, char_choices,
|
|
&blob_number, &word->seam_array,
|
|
fixpt, (fragments_guide_chopper &&
|
|
word->best_choice->fragment_mark()),
|
|
word->blamer_bundle)) {
|
|
getDict().LogNewSplit(blob_number);
|
|
updated_best_choice =
|
|
getDict().permute_characters(*char_choices, word->best_choice,
|
|
word->raw_choice);
|
|
|
|
if (old_best > word->best_choice->rating()) {
|
|
set_n_ones(best_state, char_choices->length() - 1);
|
|
} else {
|
|
insert_new_chunk(best_state, blob_number, char_choices->length() - 2);
|
|
fixpt->clear();
|
|
}
|
|
|
|
if (chop_debug)
|
|
print_state("best state = ",
|
|
best_state, count_blobs(word->chopped_word->blobs) - 1);
|
|
} else {
|
|
break;
|
|
}
|
|
|
|
// Check if we should break from the loop.
|
|
bool done = false;
|
|
bool replaced = false;
|
|
if ((updated_best_choice &&
|
|
(*best_choice_acceptable =
|
|
getDict().AcceptableChoice(char_choices, word->best_choice,
|
|
fixpt, CHOPPER_CALLER, &replaced))) ||
|
|
char_choices->length() >= MAX_NUM_CHUNKS) {
|
|
done = true;
|
|
}
|
|
if (replaced) update_blob_classifications(word->chopped_word,
|
|
*char_choices);
|
|
if (updated_best_choice) CopyCharChoices(*char_choices, best_char_choices);
|
|
if (done) break;
|
|
}
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* select_blob_to_split
|
|
*
|
|
* These are the results of the last classification. Find a likely
|
|
* place to apply splits. If none, return -1.
|
|
**********************************************************************/
|
|
inT16 Wordrec::select_blob_to_split(const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
|
float rating_ceiling,
|
|
bool split_next_to_fragment) {
|
|
BLOB_CHOICE_IT blob_choice_it;
|
|
BLOB_CHOICE *blob_choice;
|
|
BLOB_CHOICE_IT temp_it;
|
|
int x;
|
|
float worst = -MAX_FLOAT32;
|
|
int worst_index = -1;
|
|
float worst_near_fragment = -MAX_FLOAT32;
|
|
int worst_index_near_fragment = -1;
|
|
const CHAR_FRAGMENT **fragments = NULL;
|
|
|
|
if (chop_debug) {
|
|
if (rating_ceiling < MAX_FLOAT32)
|
|
cprintf("rating_ceiling = %8.4f\n", rating_ceiling);
|
|
else
|
|
cprintf("rating_ceiling = No Limit\n");
|
|
}
|
|
|
|
if (split_next_to_fragment && char_choices.length() > 0) {
|
|
fragments = new const CHAR_FRAGMENT *[char_choices.length()];
|
|
if (char_choices.get(0) != NULL) {
|
|
temp_it.set_to_list(char_choices.get(0));
|
|
fragments[0] = getDict().getUnicharset().get_fragment(
|
|
temp_it.data()->unichar_id());
|
|
} else {
|
|
fragments[0] = NULL;
|
|
}
|
|
}
|
|
|
|
for (x = 0; x < char_choices.length(); ++x) {
|
|
if (char_choices.get(x) == NULL) {
|
|
if (fragments != NULL) {
|
|
delete[] fragments;
|
|
}
|
|
return x;
|
|
} else {
|
|
blob_choice_it.set_to_list(char_choices.get(x));
|
|
blob_choice = blob_choice_it.data();
|
|
// Populate fragments for the following position.
|
|
if (split_next_to_fragment && x+1 < char_choices.length()) {
|
|
if (char_choices.get(x+1) != NULL) {
|
|
temp_it.set_to_list(char_choices.get(x+1));
|
|
fragments[x+1] = getDict().getUnicharset().get_fragment(
|
|
temp_it.data()->unichar_id());
|
|
} else {
|
|
fragments[x+1] = NULL;
|
|
}
|
|
}
|
|
if (blob_choice->rating() < rating_ceiling &&
|
|
blob_choice->certainty() < tessedit_certainty_threshold) {
|
|
// Update worst and worst_index.
|
|
if (blob_choice->rating() > worst) {
|
|
worst_index = x;
|
|
worst = blob_choice->rating();
|
|
}
|
|
if (split_next_to_fragment) {
|
|
// Update worst_near_fragment and worst_index_near_fragment.
|
|
bool expand_following_fragment =
|
|
(x + 1 < char_choices.length() &&
|
|
fragments[x+1] != NULL && !fragments[x+1]->is_beginning());
|
|
bool expand_preceding_fragment =
|
|
(x > 0 && fragments[x-1] != NULL && !fragments[x-1]->is_ending());
|
|
if ((expand_following_fragment || expand_preceding_fragment) &&
|
|
blob_choice->rating() > worst_near_fragment) {
|
|
worst_index_near_fragment = x;
|
|
worst_near_fragment = blob_choice->rating();
|
|
if (chop_debug) {
|
|
cprintf("worst_index_near_fragment=%d"
|
|
" expand_following_fragment=%d"
|
|
" expand_preceding_fragment=%d\n",
|
|
worst_index_near_fragment,
|
|
expand_following_fragment,
|
|
expand_preceding_fragment);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (fragments != NULL) {
|
|
delete[] fragments;
|
|
}
|
|
// TODO(daria): maybe a threshold of badness for
|
|
// worst_near_fragment would be useful.
|
|
return worst_index_near_fragment != -1 ?
|
|
worst_index_near_fragment : worst_index;
|
|
}
|
|
|
|
/**********************************************************************
|
|
* select_blob_to_split_from_fixpt
|
|
*
|
|
* Given the fix point from a dictionary search, if there is a single
|
|
* dangerous blob that maps to multiple characters, return that blob
|
|
* index as a place we need to split. If none, return -1.
|
|
**********************************************************************/
|
|
inT16 Wordrec::select_blob_to_split_from_fixpt(DANGERR *fixpt) {
|
|
if (!fixpt)
|
|
return -1;
|
|
for (int i = 0; i < fixpt->size(); i++) {
|
|
if ((*fixpt)[i].begin == (*fixpt)[i].end &&
|
|
(*fixpt)[i].dangerous &&
|
|
(*fixpt)[i].correct_is_ngram) {
|
|
return (*fixpt)[i].begin;
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
/**********************************************************************
|
|
* set_chopper_blame
|
|
*
|
|
* Check whether chops were made at all the character bounding box boundaries
|
|
* in word->truth_word. If not - blame the chopper for an incorrect answer.
|
|
**********************************************************************/
|
|
void Wordrec::set_chopper_blame(WERD_RES *word) {
|
|
BlamerBundle *blamer_bundle = word->blamer_bundle;
|
|
assert(blamer_bundle != NULL);
|
|
if (blamer_bundle->NoTruth() || !(blamer_bundle->truth_has_char_boxes) ||
|
|
word->chopped_word->blobs == NULL) {
|
|
return;
|
|
}
|
|
STRING debug;
|
|
bool missing_chop = false;
|
|
TBLOB * curr_blob = word->chopped_word->blobs;
|
|
int b = 0;
|
|
inT16 truth_x;
|
|
while (b < blamer_bundle->truth_word.length() && curr_blob != NULL) {
|
|
truth_x = blamer_bundle->norm_truth_word.BlobBox(b).right();
|
|
if (curr_blob->bounding_box().right() <
|
|
(truth_x - blamer_bundle->norm_box_tolerance)) {
|
|
curr_blob = curr_blob->next;
|
|
continue; // encountered an extra chop, keep looking
|
|
} else if (curr_blob->bounding_box().right() >
|
|
(truth_x + blamer_bundle->norm_box_tolerance)) {
|
|
missing_chop = true;
|
|
break;
|
|
} else {
|
|
curr_blob = curr_blob->next;
|
|
++b;
|
|
}
|
|
}
|
|
if (missing_chop || b < blamer_bundle->norm_truth_word.length()) {
|
|
STRING debug;
|
|
char debug_buffer[256];
|
|
if (missing_chop) {
|
|
sprintf(debug_buffer, "Detected missing chop (tolerance=%d) at ",
|
|
blamer_bundle->norm_box_tolerance);
|
|
debug += debug_buffer;
|
|
curr_blob->bounding_box().append_debug(&debug);
|
|
debug.add_str_int("\nNo chop for truth at x=", truth_x);
|
|
} else {
|
|
debug.add_str_int("Missing chops for last ",
|
|
blamer_bundle->norm_truth_word.length()-b);
|
|
debug += " truth box(es)";
|
|
}
|
|
debug += "\nMaximally chopped word boxes:\n";
|
|
for (curr_blob = word->chopped_word->blobs; curr_blob != NULL;
|
|
curr_blob = curr_blob->next) {
|
|
const TBOX &tbox = curr_blob->bounding_box();
|
|
sprintf(debug_buffer, "(%d,%d)->(%d,%d)\n",
|
|
tbox.left(), tbox.bottom(), tbox.right(), tbox.top());
|
|
debug += debug_buffer;
|
|
}
|
|
debug += "Truth bounding boxes:\n";
|
|
for (b = 0; b < blamer_bundle->norm_truth_word.length(); ++b) {
|
|
const TBOX &tbox = blamer_bundle->norm_truth_word.BlobBox(b);
|
|
sprintf(debug_buffer, "(%d,%d)->(%d,%d)\n",
|
|
tbox.left(), tbox.bottom(), tbox.right(), tbox.top());
|
|
debug += debug_buffer;
|
|
}
|
|
blamer_bundle->SetBlame(IRR_CHOPPER, debug, word->best_choice,
|
|
wordrec_debug_blamer);
|
|
}
|
|
}
|
|
|
|
/**********************************************************************
|
|
* word_associator
|
|
*
|
|
* Reassociate and classify the blobs in a word. Continue this process
|
|
* until a good answer is found or all the possibilities have been tried.
|
|
**********************************************************************/
|
|
MATRIX *Wordrec::word_associator(bool only_create_ratings_matrix,
|
|
WERD_RES *word,
|
|
STATE *state,
|
|
BLOB_CHOICE_LIST_VECTOR *best_char_choices,
|
|
DANGERR *fixpt,
|
|
STATE *best_state) {
|
|
CHUNKS_RECORD chunks_record;
|
|
BLOB_WEIGHTS blob_weights;
|
|
int x;
|
|
int num_chunks;
|
|
BLOB_CHOICE_IT blob_choice_it;
|
|
|
|
num_chunks = array_count(word->seam_array) + 1;
|
|
|
|
TBLOB* blobs = word->chopped_word->blobs;
|
|
chunks_record.ratings = record_piece_ratings(blobs);
|
|
chunks_record.chunks = blobs;
|
|
chunks_record.word_res = word;
|
|
chunks_record.splits = word->seam_array;
|
|
chunks_record.chunk_widths = blobs_widths(blobs);
|
|
chunks_record.char_widths = blobs_widths(blobs);
|
|
/* Save chunk weights */
|
|
for (x = 0; x < num_chunks; x++) {
|
|
BLOB_CHOICE_LIST* choices = get_piece_rating(chunks_record.ratings, blobs,
|
|
chunks_record.word_res->denorm,
|
|
word->seam_array, x, x,
|
|
word->blamer_bundle);
|
|
blob_choice_it.set_to_list(choices);
|
|
//This is done by Jetsoft. Divide by zero is possible.
|
|
if (blob_choice_it.data()->certainty() == 0) {
|
|
blob_weights[x]=0;
|
|
} else {
|
|
blob_weights[x] =
|
|
-(inT16) (10 * blob_choice_it.data()->rating() /
|
|
blob_choice_it.data()->certainty());
|
|
}
|
|
}
|
|
chunks_record.weights = blob_weights;
|
|
|
|
if (chop_debug)
|
|
chunks_record.ratings->print(getDict().getUnicharset());
|
|
|
|
if (!only_create_ratings_matrix) {
|
|
if (enable_new_segsearch) {
|
|
SegSearch(&chunks_record, word->best_choice,
|
|
best_char_choices, word->raw_choice,
|
|
state, word->blamer_bundle);
|
|
} else {
|
|
best_first_search(&chunks_record, best_char_choices, word,
|
|
state, fixpt, best_state);
|
|
}
|
|
}
|
|
|
|
free_widths(chunks_record.chunk_widths);
|
|
free_widths(chunks_record.char_widths);
|
|
return chunks_record.ratings;
|
|
}
|
|
} // namespace tesseract
|
|
|
|
|
|
/**********************************************************************
|
|
* total_containment
|
|
*
|
|
* Check to see if one of these outlines is totally contained within
|
|
* the bounding box of the other.
|
|
**********************************************************************/
|
|
inT16 total_containment(TBLOB *blob1, TBLOB *blob2) {
|
|
TBOX box1 = blob1->bounding_box();
|
|
TBOX box2 = blob2->bounding_box();
|
|
return box1.contains(box2) || box2.contains(box1);
|
|
}
|