tesseract/ccmain/output.cpp
2011-03-18 21:52:08 +00:00

515 lines
18 KiB
C++

/******************************************************************
* File: output.cpp (Formerly output.c)
* Description: Output pass
* Author: Phil Cheatle
* Created: Thu Aug 4 10:56:08 BST 1994
*
* (C) Copyright 1994, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifdef _MSC_VER
#pragma warning(disable:4244) // Conversion warnings
#endif
#include "mfcpch.h"
#include <string.h>
#include <ctype.h>
#ifdef __UNIX__
#include <assert.h>
#include <unistd.h>
#include <errno.h>
#endif
#include "helpers.h"
#include "tfacep.h"
#include "tessvars.h"
#include "control.h"
#include "secname.h"
#include "reject.h"
#include "docqual.h"
#include "output.h"
#include "bestfirst.h"
#include "globals.h"
#include "tesseractclass.h"
#define EPAPER_EXT ".ep"
#define PAGE_YSIZE 3508
#define CTRL_INSET '\024' //dc4=text inset
#define CTRL_FONT '\016' //so=font change
#define CTRL_DEFAULT '\017' //si=default font
#define CTRL_SHIFT '\022' //dc2=x shift
#define CTRL_TAB '\011' //tab
#define CTRL_NEWLINE '\012' //newline
#define CTRL_HARDLINE '\015' //cr
/**********************************************************************
* pixels_to_pts
*
* Convert an integer number of pixels to the nearest integer
* number of points.
**********************************************************************/
inT32 pixels_to_pts( //convert coords
inT32 pixels,
inT32 pix_res //resolution
) {
float pts; //converted value
pts = pixels * 72.0 / pix_res;
return (inT32) (pts + 0.5); //round it
}
namespace tesseract {
void Tesseract::output_pass( //Tess output pass //send to api
PAGE_RES_IT &page_res_it,
const TBOX *target_word_box) {
BLOCK_RES *block_of_last_word;
inT16 block_id;
BOOL8 force_eol; //During output
BLOCK *nextblock; //block of next word
WERD *nextword; //next word
page_res_it.restart_page ();
block_of_last_word = NULL;
while (page_res_it.word () != NULL) {
check_debug_pt (page_res_it.word (), 120);
if (target_word_box)
{
TBOX current_word_box=page_res_it.word ()->word->bounding_box();
FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
if (!target_word_box->contains(center_pt))
{
page_res_it.forward ();
continue;
}
}
if (tessedit_write_block_separators &&
block_of_last_word != page_res_it.block ()) {
block_of_last_word = page_res_it.block ();
block_id = block_of_last_word->block->index();
}
force_eol = (tessedit_write_block_separators &&
(page_res_it.block () != page_res_it.next_block ())) ||
(page_res_it.next_word () == NULL);
if (page_res_it.next_word () != NULL)
nextword = page_res_it.next_word ()->word;
else
nextword = NULL;
if (page_res_it.next_block () != NULL)
nextblock = page_res_it.next_block ()->block;
else
nextblock = NULL;
//regardless of tilde crunching
write_results(page_res_it,
determine_newline_type(page_res_it.word()->word,
page_res_it.block()->block,
nextword, nextblock), force_eol);
page_res_it.forward();
}
}
/*************************************************************************
* write_results()
*
* All recognition and rejection has now been done. Generate the following:
* .txt file - giving the final best choices with NO highlighting
* .raw file - giving the tesseract top choice output for each word
* .map file - showing how the .txt file has been rejected in the .ep file
* epchoice list - a list of one element per word, containing the text for the
* epaper. Reject strings are inserted.
* inset list - a list of bounding boxes of reject insets - indexed by the
* reject strings in the epchoice text.
*************************************************************************/
void Tesseract::write_results(PAGE_RES_IT &page_res_it,
char newline_type, // type of newline
BOOL8 force_eol) { // override tilde crunch?
WERD_RES *word = page_res_it.word();
STRING repetition_code;
const STRING *wordstr;
STRING wordstr_lengths;
int i;
char unrecognised = STRING (unrecognised_char)[0];
char ep_chars[32]; //Only for unlv_tilde_crunch
int ep_chars_index = 0;
char txt_chs[32]; //Only for unlv_tilde_crunch
char map_chs[32]; //Only for unlv_tilde_crunch
int txt_index = 0;
BOOL8 need_reject = FALSE;
UNICHAR_ID space = unicharset.unichar_to_id(" ");
if ((word->unlv_crunch_mode != CR_NONE ||
word->best_choice->length() == 0) &&
!tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
if ((word->unlv_crunch_mode != CR_DELETE) &&
(!stats_.tilde_crunch_written ||
((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
(word->word->space () > 0) &&
!word->word->flag (W_FUZZY_NON) &&
!word->word->flag (W_FUZZY_SP)))) {
if (!word->word->flag (W_BOL) &&
(word->word->space () > 0) &&
!word->word->flag (W_FUZZY_NON) &&
!word->word->flag (W_FUZZY_SP)) {
// Write a space to separate from preceeding good text.
txt_chs[txt_index] = ' ';
map_chs[txt_index++] = '1';
ep_chars[ep_chars_index++] = ' ';
stats_.last_char_was_tilde = false;
}
need_reject = TRUE;
}
if ((need_reject && !stats_.last_char_was_tilde) ||
(force_eol && stats_.write_results_empty_block)) {
/* Write a reject char - mark as rejected unless zero_rejection mode */
stats_.last_char_was_tilde = TRUE;
txt_chs[txt_index] = unrecognised;
if (tessedit_zero_rejection || (suspect_level == 0)) {
map_chs[txt_index++] = '1';
ep_chars[ep_chars_index++] = unrecognised;
}
else {
map_chs[txt_index++] = '0';
/*
The ep_choice string is a faked reject to allow newdiff to sync the
.etx with the .txt and .map files.
*/
ep_chars[ep_chars_index++] = CTRL_INSET; // escape code
//dummy reject
ep_chars[ep_chars_index++] = 1;
//dummy reject
ep_chars[ep_chars_index++] = 1;
//type
ep_chars[ep_chars_index++] = 2;
//dummy reject
ep_chars[ep_chars_index++] = 1;
//dummy reject
ep_chars[ep_chars_index++] = 1;
}
stats_.tilde_crunch_written = true;
stats_.last_char_was_newline = false;
stats_.write_results_empty_block = false;
}
if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
/* Add a new line output */
txt_chs[txt_index] = '\n';
map_chs[txt_index++] = '\n';
//end line
ep_chars[ep_chars_index++] = newline_type;
//Cos of the real newline
stats_.tilde_crunch_written = false;
stats_.last_char_was_newline = true;
stats_.last_char_was_tilde = false;
}
txt_chs[txt_index] = '\0';
map_chs[txt_index] = '\0';
ep_chars[ep_chars_index] = '\0'; // terminate string
word->ep_choice = new WERD_CHOICE(ep_chars, unicharset);
if (force_eol)
stats_.write_results_empty_block = true;
return;
}
/* NORMAL PROCESSING of non tilde crunched words */
stats_.tilde_crunch_written = false;
if (newline_type)
stats_.last_char_was_newline = true;
else
stats_.last_char_was_newline = false;
stats_.write_results_empty_block = force_eol; // about to write a real word
if (unlv_tilde_crunching &&
stats_.last_char_was_tilde &&
(word->word->space() == 0) &&
!(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
(word->best_choice->unichar_id(0) == space)) {
/* Prevent adjacent tilde across words - we know that adjacent tildes within
words have been removed */
word->best_choice->remove_unichar_id(0);
if (word->best_choice->blob_choices() != NULL) {
BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices());
if (!blob_choices_it.empty()) delete blob_choices_it.extract();
}
word->best_choice->populate_unichars(getDict().getUnicharset());
word->reject_map.remove_pos (0);
delete word->box_word;
word->box_word = new BoxWord;
}
if (newline_type ||
(word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
stats_.last_char_was_tilde = false;
else {
if (word->reject_map.length () > 0) {
if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
stats_.last_char_was_tilde = true;
else
stats_.last_char_was_tilde = false;
}
else if (word->word->space () > 0)
stats_.last_char_was_tilde = false;
/* else it is unchanged as there are no output chars */
}
ASSERT_HOST (word->best_choice->length() == word->reject_map.length());
set_unlv_suspects(word);
check_debug_pt (word, 120);
if (tessedit_rejection_debug) {
tprintf ("Dict word: \"%s\": %d\n",
word->best_choice->debug_string(unicharset).string(),
dict_word(*(word->best_choice)));
}
if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
repetition_code = "|^~R";
wordstr_lengths = "\001\001\001\001";
repetition_code += unicharset.id_to_unichar(get_rep_char (word));
wordstr_lengths += strlen(unicharset.id_to_unichar(get_rep_char (word)));
wordstr = &repetition_code;
} else {
if (tessedit_zero_rejection) {
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
for (i = 0; i < word->best_choice->length(); ++i) {
if (word->reject_map[i].rejected())
word->reject_map[i].setrej_minimal_rej_accept();
}
}
if (tessedit_minimal_rejection) {
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
for (i = 0; i < word->best_choice->length(); ++i) {
if ((word->best_choice->unichar_id(i) != space) &&
word->reject_map[i].rejected())
word->reject_map[i].setrej_minimal_rej_accept();
}
}
}
}
} // namespace tesseract
/**********************************************************************
* determine_newline_type
*
* Find whether we have a wrapping or hard newline.
* Return FALSE if not at end of line.
**********************************************************************/
char determine_newline_type( //test line ends
WERD *word, //word to do
BLOCK *block, //current block
WERD *next_word, //next word
BLOCK *next_block //block of next word
) {
inT16 end_gap; //to right edge
inT16 width; //of next word
TBOX word_box; //bounding
TBOX next_box; //next word
TBOX block_box; //block bounding
if (!word->flag (W_EOL))
return FALSE; //not end of line
if (next_word == NULL || next_block == NULL || block != next_block)
return CTRL_NEWLINE;
if (next_word->space () > 0)
return CTRL_HARDLINE; //it is tabbed
word_box = word->bounding_box ();
next_box = next_word->bounding_box ();
block_box = block->bounding_box ();
//gap to eol
end_gap = block_box.right () - word_box.right ();
end_gap -= (inT32) block->space ();
width = next_box.right () - next_box.left ();
// tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
// block_box.right(),word_box.right(),end_gap,
// next_box.right(),next_box.left(),width,
// end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
}
/*************************************************************************
* get_rep_char()
* Return the first accepted character from the repetition string. This is the
* character which is repeated - as determined earlier by fix_rep_char()
*************************************************************************/
namespace tesseract {
UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
int i;
for (i = 0; ((i < word->reject_map.length()) &&
(word->reject_map[i].rejected())); ++i);
if (i < word->reject_map.length()) {
return word->best_choice->unichar_id(i);
} else {
return unicharset.unichar_to_id(unrecognised_char.string());
}
}
/*************************************************************************
* SUSPECT LEVELS
*
* 0 - dont reject ANYTHING
* 1,2 - partial rejection
* 3 - BEST
*
* NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
* tessedit_minimal_rejection.
*************************************************************************/
void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
int len = word_res->reject_map.length();
const WERD_CHOICE &word = *(word_res->best_choice);
int i;
float rating_per_ch;
if (suspect_level == 0) {
for (i = 0; i < len; i++) {
if (word_res->reject_map[i].rejected())
word_res->reject_map[i].setrej_minimal_rej_accept();
}
return;
}
if (suspect_level >= 3)
return; //Use defaults
/* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
if (safe_dict_word(word) &&
(count_alphas(word) > suspect_short_words)) {
/* Unreject alphas in dictionary words */
for (i = 0; i < len; ++i) {
if (word_res->reject_map[i].rejected() &&
unicharset.get_isalpha(word.unichar_id(i)))
word_res->reject_map[i].setrej_minimal_rej_accept();
}
}
rating_per_ch = word.rating() / word_res->reject_map.length();
if (rating_per_ch >= suspect_rating_per_ch)
return; //Dont touch bad ratings
if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
/* Unreject any Tess Acceptable word - but NOT tess reject chs*/
for (i = 0; i < len; ++i) {
if (word_res->reject_map[i].rejected() &&
(!unicharset.eq(word.unichar_id(i), " ")))
word_res->reject_map[i].setrej_minimal_rej_accept();
}
}
for (i = 0; i < len; i++) {
if (word_res->reject_map[i].rejected()) {
if (word_res->reject_map[i].flag(R_DOC_REJ))
word_res->reject_map[i].setrej_minimal_rej_accept();
if (word_res->reject_map[i].flag(R_BLOCK_REJ))
word_res->reject_map[i].setrej_minimal_rej_accept();
if (word_res->reject_map[i].flag(R_ROW_REJ))
word_res->reject_map[i].setrej_minimal_rej_accept();
}
}
if (suspect_level == 2)
return;
if (!suspect_constrain_1Il ||
(word_res->reject_map.length() <= suspect_short_words)) {
for (i = 0; i < len; i++) {
if (word_res->reject_map[i].rejected()) {
if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
word_res->reject_map[i].flag(R_POSTNN_1IL)))
word_res->reject_map[i].setrej_minimal_rej_accept();
if (!suspect_constrain_1Il &&
word_res->reject_map[i].flag(R_MM_REJECT))
word_res->reject_map[i].setrej_minimal_rej_accept();
}
}
}
if ((acceptable_word_string(word.unichar_string().string(),
word.unichar_lengths().string()) !=
AC_UNACCEPTABLE) ||
acceptable_number_string(word.unichar_string().string(),
word.unichar_lengths().string())) {
if (word_res->reject_map.length() > suspect_short_words) {
for (i = 0; i < len; i++) {
if (word_res->reject_map[i].rejected() &&
(!word_res->reject_map[i].perm_rejected() ||
word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
word_res->reject_map[i].flag (R_POSTNN_1IL) ||
word_res->reject_map[i].flag (R_MM_REJECT))) {
word_res->reject_map[i].setrej_minimal_rej_accept();
}
}
}
}
}
inT16 Tesseract::count_alphas(const WERD_CHOICE &word) {
int count = 0;
for (int i = 0; i < word.length(); ++i) {
if (unicharset.get_isalpha(word.unichar_id(i)))
count++;
}
return count;
}
inT16 Tesseract::count_alphanums(const WERD_CHOICE &word) {
int count = 0;
for (int i = 0; i < word.length(); ++i) {
if (unicharset.get_isalpha(word.unichar_id(i)) ||
unicharset.get_isdigit(word.unichar_id(i)))
count++;
}
return count;
}
BOOL8 Tesseract::acceptable_number_string(const char *s,
const char *lengths) {
BOOL8 prev_digit = FALSE;
if (*lengths == 1 && *s == '(')
s++;
if (*lengths == 1 &&
((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
s++;
for (; *s != '\0'; s += *(lengths++)) {
if (unicharset.get_isdigit (s, *lengths))
prev_digit = TRUE;
else if (prev_digit &&
(*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
prev_digit = FALSE;
else if (prev_digit && *lengths == 1 &&
(*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
return TRUE;
else if (prev_digit &&
*lengths == 1 && (*s == '%') &&
(*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
(*(s + *lengths + *(lengths + 1)) == '\0'))
return TRUE;
else
return FALSE;
}
return TRUE;
}
} // namespace tesseract