/****************************************************************** * File: output.cpp (Formerly output.c) * Description: Output pass * Author: Phil Cheatle * Created: Thu Aug 4 10:56:08 BST 1994 * * (C) Copyright 1994, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at ** http://www.apache.org/licenses/LICENSE-2.0 ** Unless required by applicable law or agreed to in writing, software ** distributed under the License is distributed on an "AS IS" BASIS, ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** See the License for the specific language governing permissions and ** limitations under the License. * **********************************************************************/ #ifdef _MSC_VER #pragma warning(disable:4244) // Conversion warnings #endif #include "mfcpch.h" #include "ocrshell.h" #include #include #ifdef __UNIX__ #include #include #include #endif #include "mainblk.h" #include "tfacep.h" #include "tessvars.h" #include "control.h" #include "secname.h" #include "reject.h" #include "docqual.h" #include "output.h" #include "bestfirst.h" #include "globals.h" #include "tesseractclass.h" #define EXTERN #define EPAPER_EXT ".ep" #define PAGE_YSIZE 3508 #define CTRL_INSET '\024' //dc4=text inset #define CTRL_FONT '\016' //so=font change #define CTRL_DEFAULT '\017' //si=default font #define CTRL_SHIFT '\022' //dc2=x shift #define CTRL_TAB '\011' //tab #define CTRL_NEWLINE '\012' //newline #define CTRL_HARDLINE '\015' //cr EXTERN BOOL_EVAR (tessedit_write_block_separators, FALSE, "Write block separators in output"); EXTERN BOOL_VAR (tessedit_write_raw_output, FALSE, "Write raw stuff to name.raw"); EXTERN BOOL_EVAR (tessedit_write_output, FALSE, "Write text to name.txt"); EXTERN BOOL_EVAR (tessedit_write_ratings, FALSE, "Return ratings in IPEOCRAPI data"); EXTERN BOOL_EVAR (tessedit_write_txt_map, FALSE, "Write .txt to .etx map file"); EXTERN BOOL_EVAR (tessedit_write_rep_codes, FALSE, "Write repetition char code"); EXTERN BOOL_EVAR (tessedit_write_unlv, FALSE, "Write .unlv output file"); EXTERN STRING_EVAR (unrecognised_char, "|", "Output char for unidentified blobs"); EXTERN INT_EVAR (suspect_level, 99, "Suspect marker level"); EXTERN INT_VAR (suspect_space_level, 100, "Min suspect level for rejecting spaces"); EXTERN INT_VAR (suspect_short_words, 2, "Dont Suspect dict wds longer than this"); EXTERN BOOL_VAR (suspect_constrain_1Il, FALSE, "UNLV keep 1Il chars rejected"); EXTERN double_VAR (suspect_rating_per_ch, 999.9, "Dont touch bad rating limit"); EXTERN double_VAR (suspect_accept_rating, -999.9, "Accept good rating limit"); EXTERN BOOL_EVAR (tessedit_minimal_rejection, FALSE, "Only reject tess failures"); EXTERN BOOL_VAR (tessedit_zero_rejection, FALSE, "Dont reject ANYTHING"); EXTERN BOOL_VAR (tessedit_word_for_word, FALSE, "Make output have exactly one word per WERD"); EXTERN BOOL_VAR (tessedit_zero_kelvin_rejection, FALSE, "Dont reject ANYTHING AT ALL"); EXTERN BOOL_VAR (tessedit_consistent_reps, TRUE, "Force all rep chars the same"); FILE *txt_mapfile = NULL; //reject map FILE *unlv_file = NULL; //reject map /********************************************************************** * pixels_to_pts * * Convert an integer number of pixels to the nearest integer * number of points. **********************************************************************/ inT32 pixels_to_pts( //convert coords inT32 pixels, inT32 pix_res //resolution ) { float pts; //converted value pts = pixels * 72.0 / pix_res; return (inT32) (pts + 0.5); //round it } namespace tesseract { void Tesseract::output_pass( //Tess output pass //send to api PAGE_RES_IT &page_res_it, BOOL8 write_to_shm, TBOX *target_word_box) { BLOCK_RES *block_of_last_word; inT16 block_id; BOOL8 force_eol; //During output BLOCK *nextblock; //block of next word WERD *nextword; //next word if (tessedit_write_txt_map) txt_mapfile = open_outfile (".map"); page_res_it.restart_page (); block_of_last_word = NULL; while (page_res_it.word () != NULL) { check_debug_pt (page_res_it.word (), 120); if (target_word_box) { TBOX current_word_box=page_res_it.word ()->word->bounding_box(); FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2); if (!target_word_box->contains(center_pt)) { page_res_it.forward (); continue; } } if (tessedit_write_block_separators && block_of_last_word != page_res_it.block ()) { block_of_last_word = page_res_it.block (); block_id = block_of_last_word->block->index(); if (!wordrec_no_block) fprintf (textfile, "|^~tr%d\n", block_id); fprintf (txt_mapfile, "|^~tr%d\n", block_id); } force_eol = (tessedit_write_block_separators && (page_res_it.block () != page_res_it.next_block ())) || (page_res_it.next_word () == NULL); if (page_res_it.next_word () != NULL) nextword = page_res_it.next_word ()->word; else nextword = NULL; if (page_res_it.next_block () != NULL) nextblock = page_res_it.next_block ()->block; else nextblock = NULL; //regardless of tilde crunching write_results (page_res_it, determine_newline_type (page_res_it.word ()->word, page_res_it.block ()->block, nextword, nextblock), force_eol, write_to_shm); page_res_it.forward (); } if (write_to_shm) ocr_send_text(FALSE); if (tessedit_write_block_separators) { if (!wordrec_no_block) fprintf (textfile, "|^~tr\n"); fprintf (txt_mapfile, "|^~tr\n"); } if (tessedit_write_txt_map) { fprintf (txt_mapfile, "\n"); //because txt gets one #ifdef __UNIX__ fsync (fileno (txt_mapfile)); #endif fclose(txt_mapfile); } } /************************************************************************* * write_results() * * All recognition and rejection has now been done. Generate the following: * .txt file - giving the final best choices with NO highlighting * .raw file - giving the tesseract top choice output for each word * .map file - showing how the .txt file has been rejected in the .ep file * epchoice list - a list of one element per word, containing the text for the * epaper. Reject strings are inserted. * inset list - a list of bounding boxes of reject insets - indexed by the * reject strings in the epchoice text. *************************************************************************/ void Tesseract::write_results( //output a word //full info PAGE_RES_IT &page_res_it, char newline_type, //type of newline //override tilde crunch? BOOL8 force_eol, BOOL8 write_to_shm //send to api ) { //word to do WERD_RES *word = page_res_it.word (); // WERD_CHOICE *ep_choice; //ep format STRING repetition_code; const STRING *wordstr; STRING wordstr_lengths; int i; char unrecognised = STRING (unrecognised_char)[0]; char ep_chars[32]; //Only for unlv_tilde_crunch int ep_chars_index = 0; char txt_chs[32]; //Only for unlv_tilde_crunch char map_chs[32]; //Only for unlv_tilde_crunch int txt_index = 0; static BOOL8 tilde_crunch_written = FALSE; static BOOL8 last_char_was_newline = TRUE; static BOOL8 last_char_was_tilde = FALSE; static BOOL8 empty_block = TRUE; BOOL8 need_reject = FALSE; PBLOB_IT blob_it; //blobs UNICHAR_ID space = unicharset.unichar_to_id(" "); /* if (word->best_choice->string().length() == 0) { tprintf("No output: to output\n"); } else if (word->best_choice->string()[0]==' ') { tprintf("spaceword to output\n"); } else if (word->best_choice->string()[0]=='\0') { tprintf("null to output\n"); }*/ if (word->unlv_crunch_mode != CR_NONE && !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) { if ((word->unlv_crunch_mode != CR_DELETE) && (!tilde_crunch_written || ((word->unlv_crunch_mode == CR_KEEP_SPACE) && (word->word->space () > 0) && !word->word->flag (W_FUZZY_NON) && !word->word->flag (W_FUZZY_SP)))) { if (!word->word->flag (W_BOL) && (word->word->space () > 0) && !word->word->flag (W_FUZZY_NON) && !word->word->flag (W_FUZZY_SP)) { /* Write a space to separate from preceeding good text */ txt_chs[txt_index] = ' '; map_chs[txt_index++] = '1'; ep_chars[ep_chars_index++] = ' '; last_char_was_tilde = FALSE; } need_reject = TRUE; } if ((need_reject && !last_char_was_tilde) || (force_eol && empty_block)) { /* Write a reject char - mark as rejected unless zero_rejection mode */ last_char_was_tilde = TRUE; txt_chs[txt_index] = unrecognised; if (tessedit_zero_rejection || (suspect_level == 0)) { map_chs[txt_index++] = '1'; ep_chars[ep_chars_index++] = unrecognised; } else { map_chs[txt_index++] = '0'; /* The ep_choice string is a faked reject to allow newdiff to sync the .etx with the .txt and .map files. */ ep_chars[ep_chars_index++] = CTRL_INSET; //escape code //dummy reject ep_chars[ep_chars_index++] = 1; //dummy reject ep_chars[ep_chars_index++] = 1; //type ep_chars[ep_chars_index++] = 2; //dummy reject ep_chars[ep_chars_index++] = 1; //dummy reject ep_chars[ep_chars_index++] = 1; } tilde_crunch_written = TRUE; last_char_was_newline = FALSE; empty_block = FALSE; } if ((word->word->flag (W_EOL) && !last_char_was_newline) || force_eol) { /* Add a new line output */ txt_chs[txt_index] = '\n'; map_chs[txt_index++] = '\n'; //end line ep_chars[ep_chars_index++] = newline_type; //Cos of the real newline tilde_crunch_written = FALSE; last_char_was_newline = TRUE; last_char_was_tilde = FALSE; } txt_chs[txt_index] = '\0'; map_chs[txt_index] = '\0'; //xiaofan if (tessedit_write_output && !wordrec_no_block) fprintf (textfile, "%s", txt_chs); if (tessedit_write_txt_map) fprintf (txt_mapfile, "%s", map_chs); //terminate string ep_chars[ep_chars_index] = '\0'; word->ep_choice = new WERD_CHOICE(ep_chars, unicharset); if (force_eol) empty_block = TRUE; return; } /* NORMAL PROCESSING of non tilde crunched words */ tilde_crunch_written = FALSE; if (newline_type) last_char_was_newline = TRUE; else last_char_was_newline = FALSE; empty_block = force_eol; //About to write a real word if (unlv_tilde_crunching && last_char_was_tilde && (word->word->space() == 0) && !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) && (word->best_choice->unichar_id(0) == space)) { /* Prevent adjacent tilde across words - we know that adjacent tildes within words have been removed */ word->best_choice->remove_unichar_id(0); word->best_choice->populate_unichars(getDict().getUnicharset()); word->reject_map.remove_pos (0); blob_it = word->outword->blob_list (); delete blob_it.extract (); //get rid of reject blob } if (newline_type || (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) last_char_was_tilde = FALSE; else { if (word->reject_map.length () > 0) { if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space) last_char_was_tilde = TRUE; else last_char_was_tilde = FALSE; } else if (word->word->space () > 0) last_char_was_tilde = FALSE; /* else it is unchanged as there are no output chars */ } ASSERT_HOST (word->best_choice->length() == word->reject_map.length()); if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps) ensure_rep_chars_are_consistent(word); set_unlv_suspects(word); check_debug_pt (word, 120); if (tessedit_rejection_debug) { tprintf ("Dict word: \"%s\": %d\n", word->best_choice->debug_string(unicharset).string(), dict_word(*(word->best_choice))); } #if 0 if (tessedit_write_unlv) { write_unlv_text(word); } #endif if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) { repetition_code = "|^~R"; wordstr_lengths = "\001\001\001\001"; repetition_code += unicharset.id_to_unichar(get_rep_char (word)); wordstr_lengths += strlen(unicharset.id_to_unichar(get_rep_char (word))); wordstr = &repetition_code; } else { if (tessedit_zero_rejection) { /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ for (i = 0; i < word->best_choice->length(); ++i) { if (word->reject_map[i].rejected()) word->reject_map[i].setrej_minimal_rej_accept(); } } if (tessedit_minimal_rejection) { /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ for (i = 0; i < word->best_choice->length(); ++i) { if ((word->best_choice->unichar_id(i) != space) && word->reject_map[i].rejected()) word->reject_map[i].setrej_minimal_rej_accept(); } } } if (write_to_shm) write_shm_text (word, page_res_it.block ()->block, page_res_it.row (), *wordstr, wordstr_lengths); #if 0 if (tessedit_write_output) write_cooked_text (word->word, *wordstr, TRUE, FALSE, textfile); if (tessedit_write_raw_output) write_cooked_text (word->word, word->raw_choice->string (), TRUE, FALSE, rawfile); if (tessedit_write_txt_map) write_map(txt_mapfile, word); ep_choice = make_epaper_choice (word, newline_type); word->ep_choice = ep_choice; #endif character_count += word->best_choice->length(); word_count++; } } // namespace tesseract /********************************************************************** * make_epaper_choice * * Construct the epaper text string for a word, using the reject map to * determine whether each blob should be rejected. **********************************************************************/ #if 0 WERD_CHOICE *make_epaper_choice( //convert one word WERD_RES *word, //word to do char newline_type //type of newline ) { inT16 index = 0; //to string inT16 blobindex; //to word inT16 prevright = 0; //right of previous blob inT16 nextleft; //left of next blob PBLOB *blob; TBOX inset_box; //bounding box PBLOB_IT blob_it; //blob iterator char word_string[MAX_PATH]; //converted string BOOL8 force_total_reject; char unrecognised = STRING (unrecognised_char)[0]; blob_it.set_to_list (word->outword->blob_list ()); ASSERT_HOST (word->reject_map.length () == word->best_choice->string ().length ()); /* tprintf( "\"%s\" -> length: %d; blobcount: %d (%d)\n", word->best_choice->string().string(), word->best_choice->string().length(), blob_it.length(), blob_count( word->outword ) ); */ if (word->best_choice->string ().length () == 0) force_total_reject = TRUE; else { force_total_reject = FALSE; ASSERT_HOST (blob_it.length () == word->best_choice->string ().length ()); } if (!blob_it.empty ()) { for (index = 0; index < word->word->space (); index++) word_string[index] = ' '; //leading blanks } /* Why does this generate leading blanks regardless of whether the word_choice string is empty, when write_cooked_text ony generates leading blanks when the string is NOT empty???. */ if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) { strcpy (word_string + index, "|^~R"); index += 4; strcpy(word_string + index, unicharset.id_to_unichar(get_rep_char (word))); index += strlen(unicharset.id_to_unichar(get_rep_char (word))); } else { if (!blob_it.empty ()) prevright = blob_it.data ()->bounding_box ().left (); //actually first left for (blobindex = 0, blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blobindex++, blob_it.forward ()) { blob = blob_it.data (); if (word->reject_map[blobindex].accepted ()) { if (word->best_choice->string ()[blobindex] == ' ') //but not rejected!! word_string[index++] = unrecognised; else word_string[index++] = word->best_choice->string ()[blobindex]; } else { // start reject inset_box = blob->bounding_box (); /* Extend reject box to include rejected neighbours */ while (!blob_it.at_last () && (force_total_reject || (word->reject_map[blobindex + 1].rejected ()))) { blobindex++; blob = blob_it.forward (); //get total box inset_box += blob->bounding_box (); } if (blob_it.at_last ()) nextleft = inset_box.right (); else nextleft = blob_it.data_relative (1)->bounding_box ().left (); // tprintf("Making reject from (%d,%d)->(%d,%d)\n", // inset_box.left(),inset_box.bottom(), // inset_box.right(),inset_box.top()); index += make_reject (&inset_box, prevright, nextleft, &word->denorm, &word_string[index]); } prevright = blob->bounding_box ().right (); } } if (newline_type) //end line word_string[index++] = newline_type; word_string[index] = '\0'; //terminate string if (strlen (word_string) != index) { tprintf ("ASSERT ABOUT TO FAIL: %s, index %d len %d\n", word_string, index, strlen (word_string)); } //don't pass any zeros ASSERT_HOST (strlen (word_string) == index); return new WERD_CHOICE (word_string, 0, 0, NO_PERM); } #endif /********************************************************************** * make_reject * * Add the escape code to the string for the reject. **********************************************************************/ inT16 make_reject ( //make reject code TBOX * inset_box, //bounding box inT16 prevright, //previous char inT16 nextleft, //next char DENORM * denorm, //de-normalizer char word_string[] //output string ) { inT16 index; //to string inT16 xpos; //start of inset inT16 ypos; inT16 width; //size of inset inT16 height; inT16 left_offset; //shift form prev char inT16 right_offset; //shift to next char inT16 baseline_offset; //shift from baseline inT16 inset_index = 0; //number of inset inT16 min_chars; //min width estimate inT16 max_chars; //max width estimate float x_centre; //centre of box index = 0; x_centre = (inset_box->left () + inset_box->right ()) / 2.0; left_offset = (inT16) (denorm->x (inset_box->left ()) - denorm->x (prevright)); right_offset = (inT16) (denorm->x (nextleft) - denorm->x (inset_box->right ())); xpos = (inT16) floor (denorm->x (inset_box->left ())); width = (inT16) ceil (denorm->x (inset_box->right ())) - xpos; ypos = (inT16) floor (denorm->y (inset_box->bottom (), x_centre)); height = (inT16) ceil (denorm->y (inset_box->top (), x_centre)) - ypos; baseline_offset = ypos - (inT16) denorm->y (bln_baseline_offset, x_centre); //escape code word_string[index++] = CTRL_INSET; min_chars = (inT16) ceil (0.27 * width / denorm->row ()->x_height ()); max_chars = (inT16) floor (1.8 * width / denorm->row ()->x_height ()); /* Ensure min_chars and max_chars are in the range 0..254. This ensures that we can add 1 to them to avoid putting \0 in a string, and still not exceed the max value in a byte. */ if (min_chars < 0) min_chars = 0; if (min_chars > 254) min_chars = 254; if (max_chars < min_chars) max_chars = min_chars; if (max_chars > 254) max_chars = 254; //min chars word_string[index++] = min_chars + 1; //max chars word_string[index++] = max_chars + 1; word_string[index++] = 2; //type? //store index word_string[index++] = inset_index / 255 + 1; word_string[index++] = inset_index % 255 + 1; return index; //size of string } /********************************************************************** * determine_newline_type * * Find whether we have a wrapping or hard newline. * Return FALSE if not at end of line. **********************************************************************/ char determine_newline_type( //test line ends WERD *word, //word to do BLOCK *block, //current block WERD *next_word, //next word BLOCK *next_block //block of next word ) { inT16 end_gap; //to right edge inT16 width; //of next word TBOX word_box; //bounding TBOX next_box; //next word TBOX block_box; //block bounding if (!word->flag (W_EOL)) return FALSE; //not end of line if (next_word == NULL || next_block == NULL || block != next_block) return CTRL_NEWLINE; if (next_word->space () > 0) return CTRL_HARDLINE; //it is tabbed word_box = word->bounding_box (); next_box = next_word->bounding_box (); block_box = block->bounding_box (); //gap to eol end_gap = block_box.right () - word_box.right (); end_gap -= (inT32) block->space (); width = next_box.right () - next_box.left (); // tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n", // block_box.right(),word_box.right(),end_gap, // next_box.right(),next_box.left(),width, // end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE); return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE; } /********************************************************************** * write_shm_text * * Write the cooked text to the shared memory for the api. **********************************************************************/ void write_shm_text( //write output WERD_RES *word, //word to do BLOCK *block, //block it is from ROW_RES *row, //row it is from const STRING &text, //text to write const STRING &text_lengths ) { inT32 index; //char counter inT32 index2; //char counter inT32 length; //chars in word inT32 ptsize; //font size inT8 blanks; //blanks in word uinT8 enhancement; //bold etc uinT8 font; //font index char unrecognised = STRING (unrecognised_char)[0]; PBLOB *blob; TBOX blob_box; //bounding box PBLOB_IT blob_it; //blob iterator WERD copy_outword; // copy to denorm uinT32 rating; //of char BOOL8 lineend; //end of line int offset; int offset2; //point size ptsize = pixels_to_pts ((inT32) (row->row->x_height () + row->row->ascenders () - row->row->descenders ()), 300); if (word->word->flag (W_BOL) && ocr_char_space () < 128 && ocr_send_text (TRUE) != OKAY) return; //release failed copy_outword = *(word->outword); copy_outword.baseline_denormalise (&word->denorm); blob_it.set_to_list (copy_outword.blob_list ()); length = text_lengths.length (); if (length > 0) { blanks = word->word->space (); if (blanks == 0 && tessedit_word_for_word && !word->word->flag (W_BOL)) blanks = 1; for (index = 0, offset = 0; index < length; offset += text_lengths[index++], blob_it.forward ()) { blob = blob_it.data (); blob_box = blob->bounding_box (); enhancement = 0; if (word->italic > 0 || (word->italic == 0 && row->italic > 0)) enhancement |= EUC_ITALIC; if (word->bold > 0 || (word->bold == 0 && row->bold > 0)) enhancement |= EUC_BOLD; if (tessedit_write_ratings) rating = (uinT32) (-word->best_choice->certainty () / 0.035); else if (tessedit_zero_rejection) rating = text[offset] == ' ' ? 100 : 0; else rating = word->reject_map[index].accepted ()? 0 : 100; if (rating > 255) rating = 255; if (word->font1_count > 2) font = word->font1; else if (row->font1_count > 8) font = row->font1; else //font index font = word->word->flag (W_DONT_CHOP) ? 0 : 1; lineend = word->word->flag (W_EOL) && index == length - 1; if (word->word->flag (W_EOL) && tessedit_zero_rejection && index < length - 1 && text[index + text_lengths[index]] == ' ') { for (index2 = index + 1, offset2 = offset + text_lengths[index]; index2 < length && text[offset2] == ' '; offset2 += text_lengths[index2++]); if (index2 == length) lineend = TRUE; } if (!tessedit_zero_rejection || text[offset] != ' ' || tessedit_word_for_word) { //confidence if (text[offset] == ' ') { ocr_append_char (unrecognised, blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font, (uinT8) rating, ptsize, //point size blanks, enhancement, //enhancement OCR_CDIR_LEFT_RIGHT, OCR_LDIR_DOWN_RIGHT, lineend ? OCR_NL_NEWLINE : OCR_NL_NONE); } else { for (int suboffset = 0; suboffset < text_lengths[index]; ++suboffset) ocr_append_char (static_cast(text[offset+suboffset]), blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font, (uinT8) rating, ptsize, //point size blanks, enhancement, //enhancement OCR_CDIR_LEFT_RIGHT, OCR_LDIR_DOWN_RIGHT, lineend ? OCR_NL_NEWLINE : OCR_NL_NONE); } blanks = 0; } } } else if (tessedit_word_for_word) { blanks = word->word->space (); if (blanks == 0 && !word->word->flag (W_BOL)) blanks = 1; blob_box = word->word->bounding_box (); enhancement = 0; if (word->italic > 0) enhancement |= EUC_ITALIC; if (word->bold > 0) enhancement |= EUC_BOLD; rating = 100; if (word->font1_count > 2) font = word->font1; else if (row->font1_count > 8) font = row->font1; else //font index font = word->word->flag (W_DONT_CHOP) ? 0 : 1; lineend = word->word->flag (W_EOL); //font index ocr_append_char (unrecognised, blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font, rating, //confidence ptsize, //point size blanks, enhancement, //enhancement OCR_CDIR_LEFT_RIGHT, OCR_LDIR_DOWN_RIGHT, lineend ? OCR_NL_NEWLINE : OCR_NL_NONE); } } /********************************************************************** * write_map * * Write a map file of 0's and 1'a which associates characters from the .txt * file with those in the .etx file. 0 = .txt char was deleted. 1 = .txt char * is kept. Note that there may be reject regions in the .etx file WITHOUT * .txt chars being rejected. The map file should be the same length, and * the same number of lines as the .txt file * * The paramaterised input is because I thought I might be able to generate * multiple map files in a single run. However, it didn't work because * newdiff needs etx files! **********************************************************************/ #if 0 void write_map( //output a map file FILE *mapfile, //mapfile to write to WERD_RES *word) { inT16 index; int status; STRING mapstr = ""; if (word->best_choice->string ().length () > 0) { for (index = 0; index < word->word->space (); index++) { if (word->reject_spaces && (suspect_level >= suspect_space_level) && !tessedit_minimal_rejection && !tessedit_zero_rejection) /* Write rejected spaces to .map file ONLY. Newdiff converts these back to accepted spaces AFTER generating basic space stats but BEFORE using .etx */ status = fprintf (mapfile, "0"); else status = fprintf (mapfile, "1"); if (status < 0) WRITEFAILED.error ("write_map", EXIT, "Space Errno: %d", errno); } if ((word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) { for (index = 0; index < 5; index++) mapstr += '1'; } else { ASSERT_HOST (word->reject_map.length () == word->best_choice->string ().length ()); for (index = 0; index < word->reject_map.length (); index++) { if (word->reject_map[index].accepted ()) mapstr += '1'; else mapstr += '0'; } } status = fprintf (mapfile, "%s", mapstr.string ()); if (status < 0) WRITEFAILED.error ("write_map", EXIT, "Map str Errno: %d", errno); } if (word->word->flag (W_EOL)) { status = fprintf (mapfile, "\n"); if (status < 0) WRITEFAILED.error ("write_map", EXIT, "Newline Errno: %d", errno); } status = fflush (mapfile); if (status != 0) WRITEFAILED.error ("write_map", EXIT, "fflush Errno: %d", errno); } #endif /************************************************************************* * open_file() *************************************************************************/ namespace tesseract { FILE *Tesseract::open_outfile( //open .map & .unlv file const char *extension) { STRING file_name; FILE *outfile; file_name = imagebasename + extension; if (!(outfile = fopen (file_name.string (), "w"))) { CANTOPENFILE.error ("open_outfile", EXIT, "%s %d", file_name.string (), errno); } return outfile; } } // namespace tesseract #if 0 void write_unlv_text(WERD_RES *word) { const char *wordstr; char buff[512]; //string to output int i = 0; int j = 0; char unrecognised = STRING (unrecognised_char)[0]; int status; char space_str[3]; wordstr = word->best_choice->string ().string (); /* DONT need to do anything special for repeated char words - at this stage the repetition char has been identified and any other chars have been rejected. */ for (; wordstr[i] != '\0'; i++) { if ((wordstr[i] == ' ') || (wordstr[i] == '~') || (wordstr[i] == '^') || (wordstr[i] == '|')) buff[j++] = unrecognised; else { if (word->reject_map[i].rejected ()) buff[j++] = '^'; //Add suspect marker buff[j++] = wordstr[i]; } } buff[j] = '\0'; if (strlen (wordstr) > 0) { if (word->reject_spaces && (suspect_level >= suspect_space_level) && !tessedit_minimal_rejection && !tessedit_zero_rejection) strcpy (space_str, "^ "); //Suspect space else strcpy (space_str, " "); //Certain space for (i = 0; i < word->word->space (); i++) { status = fprintf (unlv_file, "%s", space_str); if (status < 0) WRITEFAILED.error ("write_unlv_text", EXIT, "Space Errno: %d", errno); } status = fprintf (unlv_file, "%s", buff); if (status < 0) WRITEFAILED.error ("write_unlv_text", EXIT, "Word Errno: %d", errno); } if (word->word->flag (W_EOL)) { status = fprintf (unlv_file, "\n"); if (status < 0) WRITEFAILED.error ("write_unlv_text", EXIT, "Newline Errno: %d", errno); } status = fflush (unlv_file); if (status != 0) WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno); } #endif /************************************************************************* * get_rep_char() * Return the first accepted character from the repetition string. This is the * character which is repeated - as determined earlier by fix_rep_char() *************************************************************************/ namespace tesseract { UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated? int i; for (i = 0; ((i < word->reject_map.length()) && (word->reject_map[i].rejected())); ++i); if (i < word->reject_map.length()) { return word->best_choice->unichar_id(i); } else { return unicharset.unichar_to_id(unrecognised_char.string()); } } } // namespace tesseract void ensure_rep_chars_are_consistent(WERD_RES *word) { #if 0 char rep_char = get_rep_char (word); char *ptr; ptr = (char *) word->best_choice->string ().string (); for (; *ptr != '\0'; ptr++) { if (*ptr != rep_char) *ptr = rep_char; } #endif #if 0 UNICHAR_ID rep_char = get_rep_char (word); //TODO(tkielbus) Reactivate int i; char *ptr; STRING consistent_string; STRING consistent_string_lengths; ptr = (char *) word->best_choice->string ().string (); for (i = 0; *ptr != '\0'; ptr += word->best_choice->lengths()[i++]) { consistent_string += unicharset.id_to_unichar(rep_char); consistent_string_lengths += strlen(unicharset.id_to_unichar(rep_char)); } word->best_choice->string() = consistent_string; word->best_choice->lengths() = consistent_string_lengths; #endif } /************************************************************************* * SUSPECT LEVELS * * 0 - dont reject ANYTHING * 1,2 - partial rejection * 3 - BEST * * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and * tessedit_minimal_rejection. *************************************************************************/ namespace tesseract { void Tesseract::set_unlv_suspects(WERD_RES *word_res) { int len = word_res->reject_map.length(); const WERD_CHOICE &word = *(word_res->best_choice); int i; float rating_per_ch; if (suspect_level == 0) { for (i = 0; i < len; i++) { if (word_res->reject_map[i].rejected()) word_res->reject_map[i].setrej_minimal_rej_accept(); } return; } if (suspect_level >= 3) return; //Use defaults /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/ if (safe_dict_word(word) && (count_alphas(word) > suspect_short_words)) { /* Unreject alphas in dictionary words */ for (i = 0; i < len; ++i) { if (word_res->reject_map[i].rejected() && unicharset.get_isalpha(word.unichar_id(i))) word_res->reject_map[i].setrej_minimal_rej_accept(); } } rating_per_ch = word.rating() / word_res->reject_map.length(); if (rating_per_ch >= suspect_rating_per_ch) return; //Dont touch bad ratings if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) { /* Unreject any Tess Acceptable word - but NOT tess reject chs*/ for (i = 0; i < len; ++i) { if (word_res->reject_map[i].rejected() && (!unicharset.eq(word.unichar_id(i), " "))) word_res->reject_map[i].setrej_minimal_rej_accept(); } } for (i = 0; i < len; i++) { if (word_res->reject_map[i].rejected()) { if (word_res->reject_map[i].flag(R_DOC_REJ)) word_res->reject_map[i].setrej_minimal_rej_accept(); if (word_res->reject_map[i].flag(R_BLOCK_REJ)) word_res->reject_map[i].setrej_minimal_rej_accept(); if (word_res->reject_map[i].flag(R_ROW_REJ)) word_res->reject_map[i].setrej_minimal_rej_accept(); } } if (suspect_level == 2) return; if (!suspect_constrain_1Il || (word_res->reject_map.length() <= suspect_short_words)) { for (i = 0; i < len; i++) { if (word_res->reject_map[i].rejected()) { if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) || word_res->reject_map[i].flag(R_POSTNN_1IL))) word_res->reject_map[i].setrej_minimal_rej_accept(); if (!suspect_constrain_1Il && word_res->reject_map[i].flag(R_MM_REJECT)) word_res->reject_map[i].setrej_minimal_rej_accept(); } } } if ((acceptable_word_string(word.unichar_string().string(), word.unichar_lengths().string()) != AC_UNACCEPTABLE) || acceptable_number_string(word.unichar_string().string(), word.unichar_lengths().string())) { if (word_res->reject_map.length() > suspect_short_words) { for (i = 0; i < len; i++) { if (word_res->reject_map[i].rejected() && (!word_res->reject_map[i].perm_rejected() || word_res->reject_map[i].flag (R_1IL_CONFLICT) || word_res->reject_map[i].flag (R_POSTNN_1IL) || word_res->reject_map[i].flag (R_MM_REJECT))) { word_res->reject_map[i].setrej_minimal_rej_accept(); } } } } } inT16 Tesseract::count_alphas(const WERD_CHOICE &word) { int count = 0; for (int i = 0; i < word.length(); ++i) { if (unicharset.get_isalpha(word.unichar_id(i))) count++; } return count; } inT16 Tesseract::count_alphanums(const WERD_CHOICE &word) { int count = 0; for (int i = 0; i < word.length(); ++i) { if (unicharset.get_isalpha(word.unichar_id(i)) || unicharset.get_isdigit(word.unichar_id(i))) count++; } return count; } BOOL8 Tesseract::acceptable_number_string(const char *s, const char *lengths) { BOOL8 prev_digit = FALSE; if (*lengths == 1 && *s == '(') s++; if (*lengths == 1 && ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) s++; for (; *s != '\0'; s += *(lengths++)) { if (unicharset.get_isdigit (s, *lengths)) prev_digit = TRUE; else if (prev_digit && (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-')))) prev_digit = FALSE; else if (prev_digit && *lengths == 1 && (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')'))) return TRUE; else if (prev_digit && *lengths == 1 && (*s == '%') && (*(lengths + 1) == 1 && *(s + *lengths) == ')') && (*(s + *lengths + *(lengths + 1)) == '\0')) return TRUE; else return FALSE; } return TRUE; } } // namespace tesseract