mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-30 23:49:05 +08:00
7870d67c21
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@157 d0cd1f9f-072b-0410-8dd7-cf729c803f20
194 lines
7.3 KiB
C++
194 lines
7.3 KiB
C++
/**********************************************************************
 * File:        werdit.cpp  (Formerly wordit.c)
 * Description: An iterator for passing over all the words in a document.
 * Author:      Ray Smith
 * Created:     Mon Apr 27 08:51:22 BST 1992
 *
 * (C) Copyright 1992, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/
|
|
|
|
#include "mfcpch.h"
|
|
#include "werdit.h"
|
|
|
|
#define EXTERN
|
|
|
|
//EXTERN BOOL_VAR(wordit_linearc,FALSE,"Pass poly of linearc to Tess");
|
|
|
|
/**********************************************************************
 * WERDIT::start_page
 *
 * Get ready to iterate over the page by setting the block, row and
 * word iterators.
 **********************************************************************/
|
|
|
|
void WERDIT::start_page( //set iterators
|
|
BLOCK_LIST *block_list //blocks to check
|
|
) {
|
|
block_it.set_to_list (block_list);
|
|
block_it.mark_cycle_pt ();
|
|
do {
|
|
while (block_it.data ()->row_list ()->empty ()
|
|
&& !block_it.cycled_list ()) {
|
|
block_it.forward ();
|
|
}
|
|
if (!block_it.data ()->row_list ()->empty ()) {
|
|
row_it.set_to_list (block_it.data ()->row_list ());
|
|
row_it.mark_cycle_pt ();
|
|
while (row_it.data ()->word_list ()->empty ()
|
|
&& !row_it.cycled_list ()) {
|
|
row_it.forward ();
|
|
}
|
|
if (!row_it.data ()->word_list ()->empty ()) {
|
|
word_it.set_to_list (row_it.data ()->word_list ());
|
|
word_it.mark_cycle_pt ();
|
|
}
|
|
}
|
|
}
|
|
while (!block_it.cycled_list () && row_it.data ()->word_list ()->empty ());
|
|
}
|
|
|
|
|
|
/**********************************************************************
 * WERDIT::forward
 *
 * Give the next word on the page, or NULL if none are left.
 * This code assumes all rows to be non-empty, but blocks are allowed
 * to be empty as eventually we will have non-text blocks.
 * The output is always a copy and needs to be deleted by the caller.
 **********************************************************************/
|
|
|
|
WERD *WERDIT::forward() { //use iterators
|
|
WERD *word; //actual word
|
|
// WERD *larc_word; //linearc copy
|
|
WERD *result; //output word
|
|
ROW *row; //row of word
|
|
|
|
if (word_it.cycled_list ()) {
|
|
return NULL; //finished page
|
|
}
|
|
else {
|
|
word = word_it.data ();
|
|
row = row_it.data ();
|
|
word_it.forward ();
|
|
if (word_it.cycled_list ()) {
|
|
row_it.forward (); //finished row
|
|
if (row_it.cycled_list ()) {
|
|
do {
|
|
block_it.forward (); //finished block
|
|
if (!block_it.cycled_list ()) {
|
|
row_it.set_to_list (block_it.data ()->row_list ());
|
|
row_it.mark_cycle_pt ();
|
|
}
|
|
}
|
|
//find non-empty block
|
|
while (!block_it.cycled_list ()
|
|
&& row_it.cycled_list ());
|
|
}
|
|
if (!row_it.cycled_list ()) {
|
|
word_it.set_to_list (row_it.data ()->word_list ());
|
|
word_it.mark_cycle_pt ();
|
|
}
|
|
}
|
|
|
|
// if (wordit_linearc && !word->flag(W_POLYGON))
|
|
// {
|
|
// larc_word=word->larc_copy(row->x_height());
|
|
// result=larc_word->poly_copy(row->x_height());
|
|
// delete larc_word;
|
|
// }
|
|
// else
|
|
result = word->poly_copy (row->x_height ());
|
|
return result;
|
|
}
|
|
}
|
|
|
|
|
|
/**********************************************************************
 * make_pseudo_word
 *
 * Make all the blobs inside a selection into a single word.
 * The word is always a copy and needs to be deleted by the caller.
 **********************************************************************/
|
|
|
|
WERD *make_pseudo_word( //make fake word
|
|
BLOCK_LIST *block_list, //blocks to check //block of selection
|
|
TBOX &selection_box,
|
|
BLOCK *&pseudo_block,
|
|
ROW *&pseudo_row //row of selection
|
|
) {
|
|
BLOCK_IT block_it(block_list);
|
|
BLOCK *block;
|
|
ROW_IT row_it;
|
|
ROW *row;
|
|
WERD_IT word_it;
|
|
WERD *word;
|
|
PBLOB_IT blob_it;
|
|
PBLOB *blob;
|
|
PBLOB_LIST new_blobs; //list of gathered blobs
|
|
//iterator
|
|
PBLOB_IT new_blob_it = &new_blobs;
|
|
WERD *pseudo_word; //fabricated word
|
|
WERD *poly_word; //poly copy of word
|
|
// WERD *larc_word; //linearc copy
|
|
|
|
for (block_it.mark_cycle_pt ();
|
|
!block_it.cycled_list (); block_it.forward ()) {
|
|
block = block_it.data ();
|
|
if (block->bounding_box ().overlap (selection_box)) {
|
|
pseudo_block = block;
|
|
row_it.set_to_list (block->row_list ());
|
|
for (row_it.mark_cycle_pt ();
|
|
!row_it.cycled_list (); row_it.forward ()) {
|
|
row = row_it.data ();
|
|
if (row->bounding_box ().overlap (selection_box)) {
|
|
word_it.set_to_list (row->word_list ());
|
|
for (word_it.mark_cycle_pt ();
|
|
!word_it.cycled_list (); word_it.forward ()) {
|
|
word = word_it.data ();
|
|
if (word->bounding_box ().overlap (selection_box)) {
|
|
// if (wordit_linearc && !word->flag(W_POLYGON))
|
|
// {
|
|
// larc_word=word->larc_copy(row->x_height());
|
|
// poly_word=larc_word->poly_copy(row->x_height());
|
|
// delete larc_word;
|
|
// }
|
|
// else
|
|
poly_word = word->poly_copy (row->x_height ());
|
|
blob_it.set_to_list (poly_word->blob_list ());
|
|
for (blob_it.mark_cycle_pt ();
|
|
!blob_it.cycled_list (); blob_it.forward ()) {
|
|
blob = blob_it.data ();
|
|
if (blob->bounding_box ().
|
|
overlap (selection_box)) {
|
|
new_blob_it.add_after_then_move (blob_it.
|
|
extract
|
|
());
|
|
//steal off list
|
|
pseudo_row = row;
|
|
}
|
|
}
|
|
delete poly_word; //get rid of it
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if (!new_blobs.empty ()) {
|
|
//make new word
|
|
pseudo_word = new WERD (&new_blobs, 1, NULL);
|
|
}
|
|
else
|
|
pseudo_word = NULL;
|
|
return pseudo_word;
|
|
}
|