mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-06 01:07:49 +08:00
425d593ebe
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20
312 lines
9.6 KiB
C++
312 lines
9.6 KiB
C++
/**********************************************************************
|
|
* File: pageres.h (Formerly page_res.h)
|
|
* Description: Results classes used by control.c
|
|
* Author: Phil Cheatle
|
|
* Created: Tue Sep 22 08:42:49 BST 1992
|
|
*
|
|
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
*
|
|
**********************************************************************/
|
|
#ifndef PAGERES_H
|
|
#define PAGERES_H
|
|
|
|
#include "elst.h"
|
|
#include "ocrblock.h"
|
|
#include "ocrrow.h"
|
|
#include "werd.h"
|
|
#include "ratngs.h"
|
|
#include "rejctmap.h"
|
|
#include "notdll.h"
|
|
#include "notdll.h"
|
|
|
|
/* Forward declarations */
|
|
|
|
class BLOCK_RES;
|
|
|
|
ELISTIZEH (BLOCK_RES) CLISTIZEH (BLOCK_RES)
|
|
class
|
|
ROW_RES;
|
|
|
|
ELISTIZEH (ROW_RES)
|
|
class WERD_RES;
|
|
|
|
ELISTIZEH (WERD_RES)
|
|
/*************************************************************************
|
|
* PAGE_RES - Page results
|
|
*************************************************************************/
|
|
class PAGE_RES //page result
|
|
{
|
|
public:
|
|
INT32 char_count;
|
|
INT32 rej_count;
|
|
BLOCK_RES_LIST block_res_list;
|
|
BOOL8 rejected;
|
|
|
|
PAGE_RES() {
|
|
} //empty constructor
|
|
|
|
PAGE_RES( //simple constructor
|
|
BLOCK_LIST *block_list); //real blocks
|
|
|
|
~PAGE_RES () { //destructor
|
|
}
|
|
};
|
|
|
|
/*************************************************************************
|
|
* BLOCK_RES - Block results
|
|
*************************************************************************/
|
|
|
|
class BLOCK_RES:public ELIST_LINK
|
|
//page block result
|
|
{
|
|
public:
|
|
BLOCK * block; //real block
|
|
INT32 char_count; //chars in block
|
|
INT32 rej_count; //rejected chars
|
|
INT16 font_class; //
|
|
INT16 row_count;
|
|
float x_height;
|
|
BOOL8 font_assigned; // block already
|
|
// processed
|
|
BOOL8 bold; // all bold
|
|
BOOL8 italic; // all italic
|
|
|
|
ROW_RES_LIST row_res_list;
|
|
|
|
BLOCK_RES() {
|
|
} //empty constructor
|
|
|
|
BLOCK_RES( //simple constructor
|
|
BLOCK *the_block); //real block
|
|
|
|
~BLOCK_RES () { //destructor
|
|
}
|
|
};
|
|
|
|
/*************************************************************************
|
|
* ROW_RES - Row results
|
|
*************************************************************************/
|
|
|
|
class ROW_RES:public ELIST_LINK //row result
|
|
{
|
|
public:
|
|
ROW * row; //real row
|
|
INT32 char_count; //chars in block
|
|
INT32 rej_count; //rejected chars
|
|
INT32 whole_word_rej_count; //rejs in total rej wds
|
|
WERD_RES_LIST word_res_list;
|
|
float font_class_score;
|
|
INT16 font_class; //
|
|
INT32 italic;
|
|
INT32 bold;
|
|
INT8 font1; //primary font
|
|
INT8 font1_count; //no of voters
|
|
INT8 font2; //secondary font
|
|
INT8 font2_count; //no of voters
|
|
|
|
ROW_RES() {
|
|
} //empty constructor
|
|
|
|
ROW_RES( //simple constructor
|
|
ROW *the_row); //real row
|
|
|
|
~ROW_RES () { //destructor
|
|
}
|
|
};
|
|
|
|
/*************************************************************************
|
|
* WERD_RES - Word results
|
|
*************************************************************************/
|
|
// Value stored in WERD_RES::unlv_crunch_mode.
// NOTE(review): only the enumerator names are visible in this header; what
// each mode does to a word is decided by code elsewhere.
enum CRUNCH_MODE
{
  CR_NONE,                       //initial value set by WERD_RES(WERD*)
  CR_KEEP_SPACE,
  CR_LOOSE_SPACE,
  CR_DELETE
};
|
|
|
|
class WERD_RES:public ELIST_LINK //word result
|
|
{
|
|
public:
|
|
WERD * word; //non-bln real word
|
|
WERD *outword; //bln best choice
|
|
//segmentation
|
|
DENORM denorm; //for use on outword
|
|
WERD_CHOICE *best_choice; //tess output
|
|
WERD_CHOICE *raw_choice; //top choice permuter
|
|
WERD_CHOICE *ep_choice; //ep text
|
|
REJMAP reject_map; //best_choice rejects
|
|
BOOL8 tess_failed;
|
|
/*
|
|
If tess_failed is TRUE, one of the following tests failed when Tess
|
|
returned:
|
|
- The outword blob list was not the same length as the best_choice string;
|
|
- The best_choice string contained ALL blanks;
|
|
- The best_choice string was zero length
|
|
*/
|
|
BOOL8 tess_accepted; //Tess thinks its ok?
|
|
BOOL8 tess_would_adapt; //Tess would adapt?
|
|
BOOL8 done; //ready for output?
|
|
INT8 italic;
|
|
INT8 bold;
|
|
INT8 font1; //primary font
|
|
INT8 font1_count; //no of voters
|
|
INT8 font2; //secondary font
|
|
INT8 font2_count; //no of voters
|
|
CRUNCH_MODE unlv_crunch_mode;
|
|
float x_height; //Post match estimate
|
|
float caps_height; //Post match estimate
|
|
BOOL8 guessed_x_ht;
|
|
BOOL8 guessed_caps_ht;
|
|
/*
|
|
To deal with fuzzy spaces we need to be able to combine "words" to form
|
|
combinations when we suspect that the gap is a non-space. The (new) text
|
|
ord code generates separate words for EVERY fuzzy gap - flags in the word
|
|
indicate whether the gap is below the threshold (fuzzy kern) and is thus
|
|
NOT a real word break by default, or above the threshold (fuzzy space) and
|
|
this is a real word break by default.
|
|
|
|
The WERD_RES list contains all these words PLUS "combination" words built
|
|
out of (copies of) the words split by fuzzy kerns. The separate parts have
|
|
their "part_of_combo" flag set true and should be IGNORED on a default
|
|
reading of the list.
|
|
|
|
Combination words are FOLLOWED by the sequence of part_of_combo words
|
|
which they combine.
|
|
*/
|
|
BOOL8 combination; //of two fuzzy gap wds
|
|
BOOL8 part_of_combo; //part of a combo
|
|
BOOL8 reject_spaces; //Reject spacing?
|
|
|
|
WERD_RES() {
|
|
} //empty constructor
|
|
|
|
WERD_RES( //simple constructor
|
|
WERD *the_word) { //real word
|
|
word = the_word;
|
|
outword = NULL;
|
|
best_choice = NULL;
|
|
raw_choice = NULL;
|
|
ep_choice = NULL;
|
|
tess_failed = FALSE;
|
|
tess_accepted = FALSE;
|
|
tess_would_adapt = FALSE;
|
|
done = FALSE;
|
|
unlv_crunch_mode = CR_NONE;
|
|
italic = FALSE;
|
|
bold = FALSE;
|
|
font1 = -1;
|
|
font1_count = 0;
|
|
font2 = -1;
|
|
font2_count = 0;
|
|
x_height = 0.0;
|
|
caps_height = 0.0;
|
|
guessed_x_ht = TRUE;
|
|
guessed_caps_ht = TRUE;
|
|
combination = FALSE;
|
|
part_of_combo = FALSE;
|
|
reject_spaces = FALSE;
|
|
}
|
|
WERD_RES( //constr from WERD_RES
|
|
const WERD_RES &source) {
|
|
*this = source; //see operator=
|
|
}
|
|
|
|
~WERD_RES (); //destructor
|
|
|
|
WERD_RES & operator= ( //assign word res
|
|
const WERD_RES & source); //from this
|
|
|
|
void copy_on( //copy blobs onto word
|
|
WERD_RES *word_res) { //from this word
|
|
word->set_flag (W_EOL, word_res->word->flag (W_EOL));
|
|
word->copy_on (word_res->word);
|
|
}
|
|
};
|
|
|
|
/*************************************************************************
|
|
* PAGE_RES_IT - Page results iterator
|
|
*************************************************************************/
|
|
|
|
class PAGE_RES_IT
|
|
{
|
|
public:
|
|
PAGE_RES * page_res; //page being iterated
|
|
|
|
PAGE_RES_IT() {
|
|
} //empty contructor
|
|
|
|
PAGE_RES_IT( //empty contructor
|
|
PAGE_RES *the_page_res) { //page result
|
|
page_res = the_page_res;
|
|
restart_page(); //ready to scan
|
|
}
|
|
|
|
WERD_RES *restart_page(); //get ready
|
|
|
|
WERD_RES *internal_forward( //get next word
|
|
BOOL8 new_block);
|
|
|
|
WERD_RES *forward() { //get next word
|
|
return internal_forward (FALSE);
|
|
}
|
|
|
|
WERD_RES *forward_block(); //get first word in
|
|
//next non-empty block
|
|
WERD_RES *prev_word() { //previous word
|
|
return prev_word_res;
|
|
}
|
|
ROW_RES *prev_row() { //row of prev word
|
|
return prev_row_res;
|
|
}
|
|
BLOCK_RES *prev_block() { //block of prev word
|
|
return prev_block_res;
|
|
}
|
|
WERD_RES *word() { //current word
|
|
return word_res;
|
|
}
|
|
ROW_RES *row() { //row of current word
|
|
return row_res;
|
|
}
|
|
BLOCK_RES *block() { //block of cur. word
|
|
return block_res;
|
|
}
|
|
WERD_RES *next_word() { //next word
|
|
return next_word_res;
|
|
}
|
|
ROW_RES *next_row() { //row of next word
|
|
return next_row_res;
|
|
}
|
|
BLOCK_RES *next_block() { //block of next word
|
|
return next_block_res;
|
|
}
|
|
void rej_stat_word(); //for page/block/row
|
|
|
|
private:
|
|
WERD_RES * prev_word_res; //previous word
|
|
ROW_RES *prev_row_res; //row of prev word
|
|
BLOCK_RES *prev_block_res; //block of prev word
|
|
|
|
WERD_RES *word_res; //current word
|
|
ROW_RES *row_res; //row of current word
|
|
BLOCK_RES *block_res; //block of cur. word
|
|
|
|
WERD_RES *next_word_res; //next word
|
|
ROW_RES *next_row_res; //row of next word
|
|
BLOCK_RES *next_block_res; //block of next word
|
|
|
|
BLOCK_RES_IT block_res_it; //iterators
|
|
ROW_RES_IT row_res_it;
|
|
WERD_RES_IT word_res_it;
|
|
};
|
|
#endif
|