mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-05 02:47:00 +08:00
d11dc049e3
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1015 d0cd1f9f-072b-0410-8dd7-cf729c803f20
492 lines
15 KiB
C++
492 lines
15 KiB
C++
/**********************************************************************
|
|
* File: werd.cpp (Formerly word.c)
|
|
* Description: Code for the WERD class.
|
|
* Author: Ray Smith
|
|
* Created: Tue Oct 08 14:32:12 BST 1991
|
|
*
|
|
* (C) Copyright 1991, Hewlett-Packard Ltd.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
*
|
|
**********************************************************************/
|
|
|
|
#include "blckerr.h"
|
|
#include "helpers.h"
|
|
#include "linlsq.h"
|
|
#include "werd.h"
|
|
|
|
// Include automatically generated configuration file if running autoconf.
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "config_auto.h"
|
|
#endif
|
|
|
|
#define FIRST_COLOUR ScrollView::RED //< first rainbow colour
|
|
#define LAST_COLOUR ScrollView::AQUAMARINE //< last rainbow colour
|
|
#define CHILD_COLOUR ScrollView::BROWN //< colour of children
|
|
|
|
const ERRCODE CANT_SCALE_EDGESTEPS =
|
|
"Attempted to scale an edgestep format word";
|
|
|
|
ELIST2IZE(WERD)
|
|
|
|
/**
|
|
* WERD::WERD
|
|
*
|
|
* Constructor to build a WERD from a list of C_BLOBs.
|
|
* blob_list The C_BLOBs (in word order) are not copied;
|
|
* we take its elements and put them in our lists.
|
|
* blank_count blanks in front of the word
|
|
* text correct text, outlives this WERD
|
|
*/
|
|
WERD::WERD(C_BLOB_LIST *blob_list, uinT8 blank_count, const char *text)
|
|
: blanks(blank_count),
|
|
flags(0),
|
|
script_id_(0),
|
|
correct(text) {
|
|
C_BLOB_IT start_it = blob_list;
|
|
C_BLOB_IT end_it = blob_list;
|
|
C_BLOB_IT rej_cblob_it = &rej_cblobs;
|
|
C_OUTLINE_IT c_outline_it;
|
|
inT16 inverted_vote = 0;
|
|
inT16 non_inverted_vote = 0;
|
|
|
|
// Move blob_list's elements into cblobs.
|
|
while (!end_it.at_last())
|
|
end_it.forward();
|
|
cblobs.assign_to_sublist(&start_it, &end_it);
|
|
|
|
/*
|
|
Set white on black flag for the WERD, moving any duff blobs onto the
|
|
rej_cblobs list.
|
|
First, walk the cblobs checking the inverse flag for each outline of each
|
|
cblob. If a cblob has inconsistent flag settings for its different
|
|
outlines, move the blob to the reject list. Otherwise, increment the
|
|
appropriate w-on-b or b-on-w vote for the word.
|
|
|
|
Now set the inversion flag for the WERD by maximum vote.
|
|
|
|
Walk the blobs again, moving any blob whose inversion flag does not agree
|
|
with the concencus onto the reject list.
|
|
*/
|
|
start_it.set_to_list(&cblobs);
|
|
if (start_it.empty())
|
|
return;
|
|
for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) {
|
|
BOOL8 reject_blob = FALSE;
|
|
BOOL8 blob_inverted;
|
|
|
|
c_outline_it.set_to_list(start_it.data()->out_list());
|
|
blob_inverted = c_outline_it.data()->flag(COUT_INVERSE);
|
|
for (c_outline_it.mark_cycle_pt();
|
|
!c_outline_it.cycled_list() && !reject_blob;
|
|
c_outline_it.forward()) {
|
|
reject_blob = c_outline_it.data()->flag(COUT_INVERSE) != blob_inverted;
|
|
}
|
|
if (reject_blob) {
|
|
rej_cblob_it.add_after_then_move(start_it.extract());
|
|
} else {
|
|
if (blob_inverted)
|
|
inverted_vote++;
|
|
else
|
|
non_inverted_vote++;
|
|
}
|
|
}
|
|
|
|
flags.set_bit(W_INVERSE, (inverted_vote > non_inverted_vote));
|
|
|
|
start_it.set_to_list(&cblobs);
|
|
if (start_it.empty())
|
|
return;
|
|
for (start_it.mark_cycle_pt(); !start_it.cycled_list(); start_it.forward()) {
|
|
c_outline_it.set_to_list(start_it.data()->out_list());
|
|
if (c_outline_it.data()->flag(COUT_INVERSE) != flags.bit(W_INVERSE))
|
|
rej_cblob_it.add_after_then_move(start_it.extract());
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* WERD::WERD
|
|
*
|
|
* Constructor to build a WERD from a list of C_BLOBs.
|
|
* The C_BLOBs are not copied so the source list is emptied.
|
|
*/
|
|
|
|
WERD::WERD(C_BLOB_LIST * blob_list, //< In word order
|
|
WERD * clone) //< Source of flags
|
|
: flags(clone->flags),
|
|
script_id_(clone->script_id_),
|
|
correct(clone->correct) {
|
|
C_BLOB_IT start_it = blob_list; // iterator
|
|
C_BLOB_IT end_it = blob_list; // another
|
|
|
|
while (!end_it.at_last ())
|
|
end_it.forward (); //move to last
|
|
((C_BLOB_LIST *) (&cblobs))->assign_to_sublist (&start_it, &end_it);
|
|
//move to our list
|
|
blanks = clone->blanks;
|
|
// fprintf(stderr,"Wrong constructor!!!!\n");
|
|
}
|
|
|
|
// Construct a WERD from a single_blob and clone the flags from this.
|
|
// W_BOL and W_EOL flags are set according to the given values.
|
|
WERD* WERD::ConstructFromSingleBlob(bool bol, bool eol, C_BLOB* blob) {
|
|
C_BLOB_LIST temp_blobs;
|
|
C_BLOB_IT temp_it(&temp_blobs);
|
|
temp_it.add_after_then_move(blob);
|
|
WERD* blob_word = new WERD(&temp_blobs, this);
|
|
blob_word->set_flag(W_BOL, bol);
|
|
blob_word->set_flag(W_EOL, eol);
|
|
return blob_word;
|
|
}
|
|
|
|
/**
|
|
* WERD::bounding_box
|
|
*
|
|
* Return the bounding box of the WERD.
|
|
* This is quite a mess to compute!
|
|
* ORIGINALLY, REJECT CBLOBS WERE EXCLUDED, however, this led to bugs when the
|
|
* words on the row were re-sorted. The original words were built with reject
|
|
* blobs included. The FUZZY SPACE flags were set accordingly. If ALL the
|
|
* blobs in a word are rejected the BB for the word is NULL, causing the sort
|
|
* to screw up, leading to the erroneous possibility of the first word in a
|
|
* row being marked as FUZZY space.
|
|
*/
|
|
|
|
TBOX WERD::bounding_box() {
|
|
TBOX box; // box being built
|
|
C_BLOB_IT rej_cblob_it = &rej_cblobs; // rejected blobs
|
|
|
|
for (rej_cblob_it.mark_cycle_pt(); !rej_cblob_it.cycled_list();
|
|
rej_cblob_it.forward()) {
|
|
box += rej_cblob_it.data()->bounding_box();
|
|
}
|
|
|
|
C_BLOB_IT it = &cblobs; // blobs of WERD
|
|
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
|
|
box += it.data()->bounding_box();
|
|
}
|
|
return box;
|
|
}
|
|
|
|
|
|
/**
|
|
* WERD::move
|
|
*
|
|
* Reposition WERD by vector
|
|
* NOTE!! REJECT CBLOBS ARE NOT MOVED
|
|
*/
|
|
|
|
void WERD::move(const ICOORD vec) {
|
|
C_BLOB_IT cblob_it(&cblobs); // cblob iterator
|
|
|
|
for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward())
|
|
cblob_it.data()->move(vec);
|
|
}
|
|
|
|
/**
|
|
* WERD::join_on
|
|
*
|
|
* Join other word onto this one. Delete the old word.
|
|
*/
|
|
|
|
void WERD::join_on(WERD* other) {
|
|
C_BLOB_IT blob_it(&cblobs);
|
|
C_BLOB_IT src_it(&other->cblobs);
|
|
C_BLOB_IT rej_cblob_it(&rej_cblobs);
|
|
C_BLOB_IT src_rej_it(&other->rej_cblobs);
|
|
|
|
while (!src_it.empty()) {
|
|
blob_it.add_to_end(src_it.extract());
|
|
src_it.forward();
|
|
}
|
|
while (!src_rej_it.empty()) {
|
|
rej_cblob_it.add_to_end(src_rej_it.extract());
|
|
src_rej_it.forward();
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* WERD::copy_on
|
|
*
|
|
* Copy blobs from other word onto this one.
|
|
*/
|
|
|
|
void WERD::copy_on(WERD* other) {
|
|
bool reversed = other->bounding_box().left() < bounding_box().left();
|
|
C_BLOB_IT c_blob_it(&cblobs);
|
|
C_BLOB_LIST c_blobs;
|
|
|
|
c_blobs.deep_copy(&other->cblobs, &C_BLOB::deep_copy);
|
|
if (reversed) {
|
|
c_blob_it.add_list_before(&c_blobs);
|
|
} else {
|
|
c_blob_it.move_to_last();
|
|
c_blob_it.add_list_after(&c_blobs);
|
|
}
|
|
if (!other->rej_cblobs.empty()) {
|
|
C_BLOB_IT rej_c_blob_it(&rej_cblobs);
|
|
C_BLOB_LIST new_rej_c_blobs;
|
|
|
|
new_rej_c_blobs.deep_copy(&other->rej_cblobs, &C_BLOB::deep_copy);
|
|
if (reversed) {
|
|
rej_c_blob_it.add_list_before(&new_rej_c_blobs);
|
|
} else {
|
|
rej_c_blob_it.move_to_last();
|
|
rej_c_blob_it.add_list_after(&new_rej_c_blobs);
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* WERD::print
|
|
*
|
|
* Display members
|
|
*/
|
|
|
|
void WERD::print() {
|
|
tprintf("Blanks= %d\n", blanks);
|
|
bounding_box().print();
|
|
tprintf("Flags = %d = 0%o\n", flags.val, flags.val);
|
|
tprintf(" W_SEGMENTED = %s\n", flags.bit(W_SEGMENTED) ? "TRUE" : "FALSE ");
|
|
tprintf(" W_ITALIC = %s\n", flags.bit(W_ITALIC) ? "TRUE" : "FALSE ");
|
|
tprintf(" W_BOL = %s\n", flags.bit(W_BOL) ? "TRUE" : "FALSE ");
|
|
tprintf(" W_EOL = %s\n", flags.bit(W_EOL) ? "TRUE" : "FALSE ");
|
|
tprintf(" W_NORMALIZED = %s\n",
|
|
flags.bit(W_NORMALIZED) ? "TRUE" : "FALSE ");
|
|
tprintf(" W_SCRIPT_HAS_XHEIGHT = %s\n",
|
|
flags.bit(W_SCRIPT_HAS_XHEIGHT) ? "TRUE" : "FALSE ");
|
|
tprintf(" W_SCRIPT_IS_LATIN = %s\n",
|
|
flags.bit(W_SCRIPT_IS_LATIN) ? "TRUE" : "FALSE ");
|
|
tprintf(" W_DONT_CHOP = %s\n", flags.bit(W_DONT_CHOP) ? "TRUE" : "FALSE ");
|
|
tprintf(" W_REP_CHAR = %s\n", flags.bit(W_REP_CHAR) ? "TRUE" : "FALSE ");
|
|
tprintf(" W_FUZZY_SP = %s\n", flags.bit(W_FUZZY_SP) ? "TRUE" : "FALSE ");
|
|
tprintf(" W_FUZZY_NON = %s\n", flags.bit(W_FUZZY_NON) ? "TRUE" : "FALSE ");
|
|
tprintf("Correct= %s\n", correct.string());
|
|
tprintf("Rejected cblob count = %d\n", rej_cblobs.length());
|
|
tprintf("Script = %d\n", script_id_);
|
|
}
|
|
|
|
|
|
/**
|
|
* WERD::plot
|
|
*
|
|
* Draw the WERD in the given colour.
|
|
*/
|
|
|
|
#ifndef GRAPHICS_DISABLED
|
|
void WERD::plot(ScrollView *window, ScrollView::Color colour) {
|
|
C_BLOB_IT it = &cblobs;
|
|
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
|
|
it.data()->plot(window, colour, colour);
|
|
}
|
|
plot_rej_blobs(window);
|
|
}
|
|
|
|
// Get the next color in the (looping) rainbow.
|
|
ScrollView::Color WERD::NextColor(ScrollView::Color colour) {
|
|
ScrollView::Color next = static_cast<ScrollView::Color>(colour + 1);
|
|
if (next >= LAST_COLOUR || next < FIRST_COLOUR)
|
|
next = FIRST_COLOUR;
|
|
return next;
|
|
}
|
|
|
|
/**
|
|
* WERD::plot
|
|
*
|
|
* Draw the WERD in rainbow colours in window.
|
|
*/
|
|
|
|
void WERD::plot(ScrollView* window) {
|
|
ScrollView::Color colour = FIRST_COLOUR;
|
|
C_BLOB_IT it = &cblobs;
|
|
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
|
|
it.data()->plot(window, colour, CHILD_COLOUR);
|
|
colour = NextColor(colour);
|
|
}
|
|
plot_rej_blobs(window);
|
|
}
|
|
|
|
|
|
/**
|
|
* WERD::plot_rej_blobs
|
|
*
|
|
* Draw the WERD rejected blobs in window - ALWAYS GREY
|
|
*/
|
|
|
|
|
|
void WERD::plot_rej_blobs(ScrollView *window) {
|
|
C_BLOB_IT it = &rej_cblobs;
|
|
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
|
|
it.data()->plot(window, ScrollView::GREY, ScrollView::GREY);
|
|
}
|
|
}
|
|
#endif // GRAPHICS_DISABLED
|
|
|
|
|
|
/**
|
|
* WERD::shallow_copy()
|
|
*
|
|
* Make a shallow copy of a word
|
|
*/
|
|
|
|
WERD *WERD::shallow_copy() {
|
|
WERD *new_word = new WERD;
|
|
|
|
new_word->blanks = blanks;
|
|
new_word->flags = flags;
|
|
new_word->dummy = dummy;
|
|
new_word->correct = correct;
|
|
return new_word;
|
|
}
|
|
|
|
|
|
/**
|
|
* WERD::operator=
|
|
*
|
|
* Assign a word, DEEP copying the blob list
|
|
*/
|
|
|
|
WERD & WERD::operator= (const WERD & source) {
|
|
this->ELIST2_LINK::operator= (source);
|
|
blanks = source.blanks;
|
|
flags = source.flags;
|
|
script_id_ = source.script_id_;
|
|
dummy = source.dummy;
|
|
correct = source.correct;
|
|
if (!cblobs.empty())
|
|
cblobs.clear();
|
|
cblobs.deep_copy(&source.cblobs, &C_BLOB::deep_copy);
|
|
|
|
if (!rej_cblobs.empty())
|
|
rej_cblobs.clear();
|
|
rej_cblobs.deep_copy(&source.rej_cblobs, &C_BLOB::deep_copy);
|
|
return *this;
|
|
}
|
|
|
|
|
|
/**
|
|
* word_comparator()
|
|
*
|
|
* word comparator used to sort a word list so that words are in increasing
|
|
* order of left edge.
|
|
*/
|
|
|
|
int word_comparator(const void *word1p, const void *word2p) {
|
|
WERD *word1 = *(WERD **)word1p;
|
|
WERD *word2 = *(WERD **)word2p;
|
|
return word1->bounding_box().left() - word2->bounding_box().left();
|
|
}
|
|
|
|
/**
|
|
* WERD::ConstructWerdWithNewBlobs()
|
|
*
|
|
* This method returns a new werd constructed using the blobs in the input
|
|
* all_blobs list, which correspond to the blobs in this werd object. The
|
|
* blobs used to construct the new word are consumed and removed from the
|
|
* input all_blobs list.
|
|
* Returns NULL if the word couldn't be constructed.
|
|
* Returns original blobs for which no matches were found in the output list
|
|
* orphan_blobs (appends).
|
|
*/
|
|
|
|
WERD* WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs,
|
|
C_BLOB_LIST* orphan_blobs) {
|
|
C_BLOB_LIST current_blob_list;
|
|
C_BLOB_IT werd_blobs_it(¤t_blob_list);
|
|
// Add the word's c_blobs.
|
|
werd_blobs_it.add_list_after(cblob_list());
|
|
|
|
// New blob list. These contain the blobs which will form the new word.
|
|
C_BLOB_LIST new_werd_blobs;
|
|
C_BLOB_IT new_blobs_it(&new_werd_blobs);
|
|
|
|
// not_found_blobs contains the list of current word's blobs for which a
|
|
// corresponding blob wasn't found in the input all_blobs list.
|
|
C_BLOB_LIST not_found_blobs;
|
|
C_BLOB_IT not_found_it(¬_found_blobs);
|
|
not_found_it.move_to_last();
|
|
|
|
werd_blobs_it.move_to_first();
|
|
for (werd_blobs_it.mark_cycle_pt(); !werd_blobs_it.cycled_list();
|
|
werd_blobs_it.forward()) {
|
|
C_BLOB* werd_blob = werd_blobs_it.extract();
|
|
TBOX werd_blob_box = werd_blob->bounding_box();
|
|
bool found = false;
|
|
// Now find the corresponding blob for this blob in the all_blobs
|
|
// list. For now, follow the inefficient method of pairwise
|
|
// comparisons. Ideally, one can pre-bucket the blobs by row.
|
|
C_BLOB_IT all_blobs_it(all_blobs);
|
|
for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list();
|
|
all_blobs_it.forward()) {
|
|
C_BLOB* a_blob = all_blobs_it.data();
|
|
// Compute the overlap of the two blobs. If major, a_blob should
|
|
// be added to the new blobs list.
|
|
TBOX a_blob_box = a_blob->bounding_box();
|
|
if (a_blob_box.null_box()) {
|
|
tprintf("Bounding box couldn't be ascertained\n");
|
|
}
|
|
if (werd_blob_box.contains(a_blob_box) ||
|
|
werd_blob_box.major_overlap(a_blob_box)) {
|
|
// Old blobs are from minimal splits, therefore are expected to be
|
|
// bigger. The new small blobs should cover a significant portion.
|
|
// This is it.
|
|
all_blobs_it.extract();
|
|
new_blobs_it.add_after_then_move(a_blob);
|
|
found = true;
|
|
}
|
|
}
|
|
if (!found) {
|
|
not_found_it.add_after_then_move(werd_blob);
|
|
} else {
|
|
delete werd_blob;
|
|
}
|
|
}
|
|
// Iterate over all not found blobs. Some of them may be due to
|
|
// under-segmentation (which is OK, since the corresponding blob is already
|
|
// in the list in that case.
|
|
not_found_it.move_to_first();
|
|
for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list();
|
|
not_found_it.forward()) {
|
|
C_BLOB* not_found = not_found_it.data();
|
|
TBOX not_found_box = not_found->bounding_box();
|
|
C_BLOB_IT existing_blobs_it(new_blobs_it);
|
|
for (existing_blobs_it.mark_cycle_pt(); !existing_blobs_it.cycled_list();
|
|
existing_blobs_it.forward()) {
|
|
C_BLOB* a_blob = existing_blobs_it.data();
|
|
TBOX a_blob_box = a_blob->bounding_box();
|
|
if ((not_found_box.major_overlap(a_blob_box) ||
|
|
a_blob_box.major_overlap(not_found_box)) &&
|
|
not_found_box.y_overlap_fraction(a_blob_box) > 0.8) {
|
|
// Already taken care of.
|
|
delete not_found_it.extract();
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if (orphan_blobs) {
|
|
C_BLOB_IT orphan_blobs_it(orphan_blobs);
|
|
orphan_blobs_it.move_to_last();
|
|
orphan_blobs_it.add_list_after(¬_found_blobs);
|
|
}
|
|
|
|
// New blobs are ready. Create a new werd object with these.
|
|
WERD* new_werd = NULL;
|
|
if (!new_werd_blobs.empty()) {
|
|
new_werd = new WERD(&new_werd_blobs, this);
|
|
} else {
|
|
// Add the blobs back to this word so that it can be reused.
|
|
C_BLOB_IT this_list_it(cblob_list());
|
|
this_list_it.add_list_after(¬_found_blobs);
|
|
}
|
|
return new_werd;
|
|
}
|