API/output changes to produce unlv-style latin-1 output and test scripts

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@86 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
theraysmith 2007-07-18 01:11:18 +00:00
parent eeaca1beba
commit 627368df42
27 changed files with 1424 additions and 442 deletions

View File

@ -24,20 +24,22 @@ what measures we are interested in.
/* #define SECURE_NAMES done in secnames.h when necessary*/ /* #define SECURE_NAMES done in secnames.h when necessary*/
#include "mfcpch.h" #include "mfcpch.h"
#include "applybox.h" #include "applybox.h"
#include <ctype.h> #include <ctype.h>
#include <string.h> #include <string.h>
#ifdef __UNIX__ #ifdef __UNIX__
#include <assert.h> #include <assert.h>
#include <errno.h> #include <errno.h>
#endif #endif
#include "mainblk.h" #include "mainblk.h"
#include "genblob.h" #include "genblob.h"
#include "fixxht.h" #include "fixxht.h"
#include "control.h" #include "control.h"
#include "tessbox.h" #include "tessbox.h"
#include "globals.h" #include "globals.h"
#include "secname.h" #include "secname.h"
#include "unichar.h"
#include "matchdefs.h"
#define SECURE_NAMES #define SECURE_NAMES
#ifndef SECURE_NAMES #ifndef SECURE_NAMES
@ -47,10 +49,13 @@ what measures we are interested in.
#define EXTERN #define EXTERN
EXTERN BOOL_VAR (applybox_rebalance, TRUE, "Drop dead"); EXTERN BOOL_VAR (applybox_rebalance, TRUE, "Drop dead");
EXTERN INT_VAR (applybox_debug, 0, "Debug level"); EXTERN INT_VAR (applybox_debug, 0, "Debug level");
EXTERN STRING_VAR (applybox_test_exclusions, "|", EXTERN STRING_VAR (applybox_test_exclusions, "",
"Chars ignored for testing"); "Chars ignored for testing");
EXTERN double_VAR (applybox_error_band, 0.15, "Err band as fract of xht"); EXTERN double_VAR (applybox_error_band, 0.15, "Err band as fract of xht");
// The unicharset used during box training
static UNICHARSET unicharset_boxes;
/************************************************************************* /*************************************************************************
* The code re-assigns outlines to form words each with ONE labelled blob. * The code re-assigns outlines to form words each with ONE labelled blob.
* Noise is left in UNLABELLED words. The chars on the page are checked crudely * Noise is left in UNLABELLED words. The chars on the page are checked crudely
@ -89,7 +94,7 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks
INT16 boxfile_lineno = 0; INT16 boxfile_lineno = 0;
INT16 boxfile_charno = 0; INT16 boxfile_charno = 0;
BOX box; //boxfile box BOX box; //boxfile box
char ch[2]; //correct ch from boxfile UNICHAR_ID uch_id; //correct ch from boxfile
ROW *row; ROW *row;
ROW *prev_row = NULL; ROW *prev_row = NULL;
INT16 prev_box_right = MAX_INT16; INT16 prev_box_right = MAX_INT16;
@ -100,15 +105,20 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks
INT16 labels_ok; INT16 labels_ok;
INT16 rows_ok; INT16 rows_ok;
INT16 bad_blobs; INT16 bad_blobs;
INT16 tgt_char_counts[128]; //No. of box samples INT16 tgt_char_counts[MAX_NUM_CLASSES]; //No. of box samples
// INT16 labelled_char_counts[128]; //No. of unique labelled samples // INT16 labelled_char_counts[128]; //No. of unique labelled samples
INT16 i; INT16 i;
INT16 rebalance_count = 0; INT16 rebalance_count = 0;
char min_char; UNICHAR_ID min_uch_id;
INT16 min_samples; INT16 min_samples;
INT16 final_labelled_blob_count; INT16 final_labelled_blob_count;
for (i = 0; i < 128; i++) // Clean the unichar set
unicharset_boxes.clear();
// Space character needed to represent NIL classification
unicharset_boxes.unichar_insert(" ");
for (i = 0; i < MAX_NUM_CLASSES; i++)
tgt_char_counts[i] = 0; tgt_char_counts[i] = 0;
FILE* box_file; FILE* box_file;
@ -120,11 +130,10 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks
filename.string(), errno); filename.string(), errno);
} }
ch[1] = '\0';
clear_any_old_text(block_list); clear_any_old_text(block_list);
while (read_next_box (box_file, &box, &ch[0])) { while (read_next_box (box_file, &box, &uch_id)) {
box_count++; box_count++;
tgt_char_counts[ch[0]]++; tgt_char_counts[uch_id]++;
row = find_row_of_box (block_list, box, block_id, row_id); row = find_row_of_box (block_list, box, block_id, row_id);
if (box.left () < prev_box_right) { if (box.left () < prev_box_right) {
boxfile_lineno++; boxfile_lineno++;
@ -135,14 +144,16 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks
if (row == NULL) { if (row == NULL) {
box_failures++; box_failures++;
report_failed_box (boxfile_lineno, boxfile_charno, box, ch, report_failed_box (boxfile_lineno, boxfile_charno, box,
unicharset_boxes.id_to_unichar(uch_id),
"FAILURE! box overlaps no blobs or blobs in multiple rows"); "FAILURE! box overlaps no blobs or blobs in multiple rows");
} }
else { else {
if ((box.left () >= prev_box_right) && (row != prev_row)) if ((box.left () >= prev_box_right) && (row != prev_row))
report_failed_box (boxfile_lineno, boxfile_charno, box, ch, report_failed_box (boxfile_lineno, boxfile_charno, box,
unicharset_boxes.id_to_unichar(uch_id),
"WARNING! false row break"); "WARNING! false row break");
box_failures += resegment_box (row, box, ch, block_id, row_id, box_failures += resegment_box (row, box, uch_id, block_id, row_id,
boxfile_lineno, boxfile_charno); boxfile_lineno, boxfile_charno);
prev_row = row; prev_row = row;
} }
@ -154,7 +165,7 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks
bad_blobs, bad_blobs,
tgt_char_counts, tgt_char_counts,
rebalance_count, rebalance_count,
min_char, &min_uch_id,
min_samples, min_samples,
final_labelled_blob_count); final_labelled_blob_count);
tprintf ("APPLY_BOXES:\n"); tprintf ("APPLY_BOXES:\n");
@ -163,7 +174,8 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks
labels_ok, rows_ok); labels_ok, rows_ok);
tprintf (" Box failures detected: %6d\n", box_failures); tprintf (" Box failures detected: %6d\n", box_failures);
tprintf (" Duped blobs for rebalance:%6d\n", rebalance_count); tprintf (" Duped blobs for rebalance:%6d\n", rebalance_count);
tprintf (" \"%c\" has fewest samples:%6d\n", min_char, min_samples); tprintf (" \"%s\" has fewest samples:%6d\n",
unicharset_boxes.id_to_unichar(min_uch_id), min_samples);
tprintf (" Total unlabelled words: %6d\n", tprintf (" Total unlabelled words: %6d\n",
bad_blobs); bad_blobs);
tprintf (" Final labelled words: %6d\n", tprintf (" Final labelled words: %6d\n",
@ -194,7 +206,7 @@ void clear_any_old_text( //remove correct text
BOOL8 read_next_box(FILE* box_file, // BOOL8 read_next_box(FILE* box_file, //
BOX *box, BOX *box,
char *ch) { UNICHAR_ID *uch_id) {
char buff[256]; //boxfile read buffer char buff[256]; //boxfile read buffer
char *buffptr = buff; char *buffptr = buff;
STRING box_filename; STRING box_filename;
@ -204,23 +216,38 @@ BOOL8 read_next_box(FILE* box_file, //
INT32 x_max; INT32 x_max;
INT32 y_max; INT32 y_max;
INT32 count = 0; INT32 count = 0;
char uch[256];
while (!feof (box_file)) { while (!feof (box_file)) {
fgets (buff, sizeof (buff) - 1, box_file); fgets (buff, sizeof (buff) - 1, box_file);
line++; line++;
buffptr = buff;
const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
buffptr += 3; // Skip unicode file designation.
/* Check for blank lines in box file */ /* Check for blank lines in box file */
for (buffptr = buff; isspace (*buffptr); buffptr++) while (isspace (*buffptr))
; buffptr++;
if (*buffptr != '\0') { if (*buffptr != '\0') {
count = count =
sscanf (buff, sscanf (buffptr,
"%c " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " " "%s " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " "
INT32FORMAT, ch, &x_min, &y_min, &x_max, &y_max); INT32FORMAT, uch, &x_min, &y_min, &x_max, &y_max);
if (count != 5) { if (count != 5) {
tprintf ("Box file format error on line %i ignored\n", line); tprintf ("Box file format error on line %i ignored\n", line);
} }
else { else {
if (!unicharset_boxes.contains_unichar(uch))
{
unicharset_boxes.unichar_insert(uch);
if (unicharset_boxes.size() > MAX_NUM_CLASSES) {
tprintf("Error: Size of unicharset of boxes is \
greater than MAX_NUM_CLASSES\n");
exit(1);
}
}
*uch_id = unicharset_boxes.unichar_to_id(uch);
*box = BOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max)); *box = BOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max));
return TRUE; //read a box ok return TRUE; //read a box ok
} }
@ -314,7 +341,7 @@ ROW *find_row_of_box( //
INT16 resegment_box( // INT16 resegment_box( //
ROW *row, ROW *row,
BOX box, BOX box,
char *ch, UNICHAR_ID uch_id,
INT16 block_id, INT16 block_id,
INT16 row_id, INT16 row_id,
INT16 boxfile_lineno, INT16 boxfile_lineno,
@ -358,7 +385,7 @@ INT16 resegment_box( //
if (applybox_debug > 4) if (applybox_debug > 4)
report_failed_box (boxfile_lineno, report_failed_box (boxfile_lineno,
boxfile_charno, boxfile_charno,
box, ch, box, unicharset_boxes.id_to_unichar(uch_id),
"FAILURE! box overlaps blob in labelled word"); "FAILURE! box overlaps blob in labelled word");
} }
if (applybox_debug > 4) if (applybox_debug > 4)
@ -375,7 +402,7 @@ INT16 resegment_box( //
if (new_word == NULL) { if (new_word == NULL) {
/* Make a new word with a single blob */ /* Make a new word with a single blob */
new_word = word->shallow_copy (); new_word = word->shallow_copy ();
new_word->set_text (ch); new_word->set_text (unicharset_boxes.id_to_unichar(uch_id));
if (polyg) if (polyg)
new_blob = new PBLOB; new_blob = new PBLOB;
else else
@ -414,63 +441,75 @@ INT16 resegment_box( //
word_x_centre = (new_word_box.left () + new_word_box.right ()) / 2.0f; word_x_centre = (new_word_box.left () + new_word_box.right ()) / 2.0f;
baseline = row->base_line (word_x_centre); baseline = row->base_line (word_x_centre);
if (STRING (chs_caps_ht).contains (ch[0]) && #if 0
(new_word_box.top () < if (strlen(unicharset_boxes.id_to_unichar(uch_id)) == 1) {
baseline + (1 + applybox_error_band) * row->x_height ())) { if (STRING (chs_caps_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
report_failed_box (boxfile_lineno, boxfile_charno, box, ch, (new_word_box.top () <
"FAILURE! caps-ht char didn't ascend"); baseline + (1 + applybox_error_band) * row->x_height ())) {
new_word->set_text (""); report_failed_box (boxfile_lineno, boxfile_charno, box,
return 1; unicharset_boxes.id_to_unichar(uch_id),
} "FAILURE! caps-ht char didn't ascend");
if (STRING (chs_odd_top).contains (ch[0]) && new_word->set_text ("");
(new_word_box.top () < return 1;
baseline + (1 - applybox_error_band) * row->x_height ())) { }
report_failed_box (boxfile_lineno, boxfile_charno, box, ch, if (STRING (chs_odd_top).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
"FAILURE! Odd top char below xht"); (new_word_box.top () <
new_word->set_text (""); baseline + (1 - applybox_error_band) * row->x_height ())) {
return 1; report_failed_box (boxfile_lineno, boxfile_charno, box,
} unicharset_boxes.id_to_unichar(uch_id),
if (STRING (chs_x_ht).contains (ch[0]) && "FAILURE! Odd top char below xht");
((new_word_box.top () > new_word->set_text ("");
baseline + (1 + applybox_error_band) * row->x_height ()) || return 1;
(new_word_box.top () < }
baseline + (1 - applybox_error_band) * row->x_height ()))) { if (STRING (chs_x_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
report_failed_box (boxfile_lineno, boxfile_charno, box, ch, ((new_word_box.top () >
"FAILURE! x-ht char didn't have top near xht"); baseline + (1 + applybox_error_band) * row->x_height ()) ||
new_word->set_text (""); (new_word_box.top () <
return 1; baseline + (1 - applybox_error_band) * row->x_height ()))) {
} report_failed_box (boxfile_lineno, boxfile_charno, box,
if (STRING (chs_non_ambig_bl).contains (ch[0]) && unicharset_boxes.id_to_unichar(uch_id),
((new_word_box.bottom () < "FAILURE! x-ht char didn't have top near xht");
baseline - applybox_error_band * row->x_height ()) || new_word->set_text ("");
(new_word_box.bottom () > return 1;
baseline + applybox_error_band * row->x_height ()))) { }
report_failed_box (boxfile_lineno, boxfile_charno, box, ch, if (STRING (chs_non_ambig_bl).contains
"FAILURE! non ambig BL char didnt have bottom near baseline"); (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
new_word->set_text (""); ((new_word_box.bottom () <
return 1; baseline - applybox_error_band * row->x_height ()) ||
} (new_word_box.bottom () >
if (STRING (chs_odd_bot).contains (ch[0]) && baseline + applybox_error_band * row->x_height ()))) {
(new_word_box.bottom () > report_failed_box (boxfile_lineno, boxfile_charno, box,
baseline + applybox_error_band * row->x_height ())) { unicharset_boxes.id_to_unichar(uch_id),
report_failed_box (boxfile_lineno, boxfile_charno, box, ch, "FAILURE! non ambig BL char didnt have bottom near baseline");
"FAILURE! Odd bottom char above baseline"); new_word->set_text ("");
new_word->set_text (""); return 1;
return 1; }
} if (STRING (chs_odd_bot).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
if (STRING (chs_desc).contains (ch[0]) && (new_word_box.bottom () >
(new_word_box.bottom () > baseline + applybox_error_band * row->x_height ())) {
baseline - applybox_error_band * row->x_height ())) { report_failed_box (boxfile_lineno, boxfile_charno, box,
report_failed_box (boxfile_lineno, boxfile_charno, box, ch, unicharset_boxes.id_to_unichar(uch_id),
"FAILURE! Odd bottom char above baseline");
new_word->set_text ("");
return 1;
}
if (STRING (chs_desc).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
(new_word_box.bottom () >
baseline - applybox_error_band * row->x_height ())) {
report_failed_box (boxfile_lineno, boxfile_charno, box,
unicharset_boxes.id_to_unichar(uch_id),
"FAILURE! Descender doesn't descend"); "FAILURE! Descender doesn't descend");
new_word->set_text (""); new_word->set_text ("");
return 1; return 1;
}
} }
#endif
return 0; return 0;
} }
else { else {
report_failed_box (boxfile_lineno, boxfile_charno, box, ch, report_failed_box (boxfile_lineno, boxfile_charno, box,
"FAILURE! Couldn't find any blobs"); unicharset_boxes.id_to_unichar(uch_id),
"FAILURE! Couldn't find any blobs");
return 1; return 1;
} }
} }
@ -492,7 +531,7 @@ void tidy_up( //
INT16 &unlabelled_words, INT16 &unlabelled_words,
INT16 *tgt_char_counts, INT16 *tgt_char_counts,
INT16 &rebalance_count, INT16 &rebalance_count,
char &min_char, UNICHAR_ID *min_uch_id,
INT16 &min_samples, INT16 &min_samples,
INT16 &final_labelled_blob_count) { INT16 &final_labelled_blob_count) {
BLOCK_IT block_it(block_list); BLOCK_IT block_it(block_list);
@ -507,16 +546,16 @@ void tidy_up( //
BOOL8 row_ok; BOOL8 row_ok;
BOOL8 rebalance_needed = FALSE; BOOL8 rebalance_needed = FALSE;
//No. of unique labelled samples //No. of unique labelled samples
INT16 labelled_char_counts[128]; INT16 labelled_char_counts[MAX_NUM_CLASSES];
INT16 i; INT16 i;
char ch; UNICHAR_ID uch_id;
char prev_ch = '\0'; UNICHAR_ID prev_uch_id = -1;
BOOL8 at_dupe_of_prev_word; BOOL8 at_dupe_of_prev_word;
ROW *prev_row = NULL; ROW *prev_row = NULL;
INT16 left; INT16 left;
INT16 prev_left = -1; INT16 prev_left = -1;
for (i = 0; i < 128; i++) for (i = 0; i < MAX_NUM_CLASSES; i++)
labelled_char_counts[i] = 0; labelled_char_counts[i] = 0;
ok_char_count = 0; ok_char_count = 0;
@ -556,7 +595,7 @@ void tidy_up( //
block_idx, row_idx, all_row_idx); block_idx, row_idx, all_row_idx);
ok_char_count++; ok_char_count++;
labelled_char_counts[*word->text ()]++; labelled_char_counts[unicharset_boxes.unichar_to_id(word->text ())]++;
row_ok = TRUE; row_ok = TRUE;
} }
} }
@ -571,24 +610,24 @@ void tidy_up( //
} }
min_samples = 9999; min_samples = 9999;
for (i = 0; i < 128; i++) { for (i = 0; i < unicharset_boxes.size(); i++) {
if (tgt_char_counts[i] > labelled_char_counts[i]) { if (tgt_char_counts[i] > labelled_char_counts[i]) {
if (labelled_char_counts[i] <= 1) { if (labelled_char_counts[i] <= 1) {
tprintf tprintf
("APPLY_BOXES: FATALITY - %d labelled samples of \"%c\" - target is %d\n", ("APPLY_BOXES: FATALITY - %d labelled samples of \"%s\" - target is %d\n",
labelled_char_counts[i], (char) i, tgt_char_counts[i]); labelled_char_counts[i], unicharset_boxes.id_to_unichar(i), tgt_char_counts[i]);
} }
else { else {
rebalance_needed = TRUE; rebalance_needed = TRUE;
if (applybox_debug > 0) if (applybox_debug > 0)
tprintf tprintf
("APPLY_BOXES: REBALANCE REQD \"%c\" - target of %d from %d labelled samples\n", ("APPLY_BOXES: REBALANCE REQD \"%s\" - target of %d from %d labelled samples\n",
(char) i, tgt_char_counts[i], labelled_char_counts[i]); unicharset_boxes.id_to_unichar(i), tgt_char_counts[i], labelled_char_counts[i]);
} }
} }
if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) { if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) {
min_samples = labelled_char_counts[i]; min_samples = labelled_char_counts[i];
min_char = (char) i; *min_uch_id = i;
} }
} }
@ -605,33 +644,36 @@ void tidy_up( //
!word_it.cycled_list (); word_it.forward ()) { !word_it.cycled_list (); word_it.forward ()) {
word = word_it.data (); word = word_it.data ();
left = word->bounding_box ().left (); left = word->bounding_box ().left ();
ch = *word->text (); if (*word->text () != '\0')
uch_id = unicharset_boxes.unichar_to_id(word->text ());
else
uch_id = -1;
at_dupe_of_prev_word = ((row == prev_row) && at_dupe_of_prev_word = ((row == prev_row) &&
(left = prev_left) && (left = prev_left) &&
(ch == prev_ch)); (uch_id == prev_uch_id));
if ((ch != '\0') && if ((uch_id != -1) &&
(labelled_char_counts[ch] > 1) && (labelled_char_counts[uch_id] > 1) &&
(tgt_char_counts[ch] > labelled_char_counts[ch]) && (tgt_char_counts[uch_id] > labelled_char_counts[uch_id]) &&
(!at_dupe_of_prev_word)) { (!at_dupe_of_prev_word)) {
/* Duplicate the word to rebalance the labelled samples */ /* Duplicate the word to rebalance the labelled samples */
if (applybox_debug > 9) { if (applybox_debug > 9) {
tprintf ("Duping \"%c\" from ", ch); tprintf ("Duping \"%s\" from ", unicharset_boxes.id_to_unichar(uch_id));
word->bounding_box ().print (); word->bounding_box ().print ();
} }
duplicate_word = new WERD; duplicate_word = new WERD;
*duplicate_word = *word; *duplicate_word = *word;
word_it.add_after_then_move (duplicate_word); word_it.add_after_then_move (duplicate_word);
rebalance_count++; rebalance_count++;
labelled_char_counts[ch]++; labelled_char_counts[uch_id]++;
} }
prev_row = row; prev_row = row;
prev_left = left; prev_left = left;
prev_ch = ch; prev_uch_id = uch_id;
} }
} }
} }
rebalance_needed = FALSE; rebalance_needed = FALSE;
for (i = 0; i < 128; i++) { for (i = 0; i < unicharset_boxes.size(); i++) {
if ((tgt_char_counts[i] > labelled_char_counts[i]) && if ((tgt_char_counts[i] > labelled_char_counts[i]) &&
(labelled_char_counts[i] > 1)) { (labelled_char_counts[i] > 1)) {
rebalance_needed = TRUE; rebalance_needed = TRUE;
@ -653,7 +695,7 @@ void tidy_up( //
for (word_it.mark_cycle_pt (); for (word_it.mark_cycle_pt ();
!word_it.cycled_list (); word_it.forward ()) { !word_it.cycled_list (); word_it.forward ()) {
word = word_it.data (); word = word_it.data ();
if ((strlen (word->text ()) == 1) && if ((strlen (word->text ()) > 0) &&
(word->gblob_list ()->length () == 1)) (word->gblob_list ()->length () == 1))
final_labelled_blob_count++; final_labelled_blob_count++;
} }
@ -665,7 +707,7 @@ void tidy_up( //
void report_failed_box(INT16 boxfile_lineno, void report_failed_box(INT16 boxfile_lineno,
INT16 boxfile_charno, INT16 boxfile_charno,
BOX box, BOX box,
char *box_ch, const char *box_ch,
const char *err_msg) { const char *err_msg) {
if (applybox_debug > 4) if (applybox_debug > 4)
tprintf ("APPLY_BOXES: boxfile %1d/%1d/%s ((%1d,%1d),(%1d,%1d)): %s\n", tprintf ("APPLY_BOXES: boxfile %1d/%1d/%s ((%1d,%1d),(%1d,%1d)): %s\n",
@ -687,10 +729,9 @@ void apply_box_training(BLOCK_LIST *block_list) {
PBLOB_IT blob_it; PBLOB_IT blob_it;
DENORM denorm; DENORM denorm;
INT16 count = 0; INT16 count = 0;
char ch[2]; char unichar[UNICHAR_LEN + 1];
ch[1] = '\0';
unichar[UNICHAR_LEN] = '\0';
tprintf ("Generating training data\n"); tprintf ("Generating training data\n");
for (block_it.mark_cycle_pt (); for (block_it.mark_cycle_pt ();
!block_it.cycled_list (); block_it.forward ()) { !block_it.cycled_list (); block_it.forward ()) {
@ -701,23 +742,22 @@ void apply_box_training(BLOCK_LIST *block_list) {
for (word_it.mark_cycle_pt (); for (word_it.mark_cycle_pt ();
!word_it.cycled_list (); word_it.forward ()) { !word_it.cycled_list (); word_it.forward ()) {
word = word_it.data (); word = word_it.data ();
if ((strlen (word->text ()) == 1) && if ((strlen (word->text ()) > 0) &&
(word->gblob_list ()->length () == 1)) { (word->gblob_list ()->length () == 1)) {
/* Here is a word with a single char label and a single blob so train on it */ /* Here is a word with a single unichar label and a single blob so train on it */
bln_word = bln_word =
make_bln_copy (word, row, row->x_height (), &denorm); make_bln_copy (word, row, row->x_height (), &denorm);
blob_it.set_to_list (bln_word->blob_list ()); blob_it.set_to_list (bln_word->blob_list ());
ch[0] = *word->text (); strncpy(unichar, word->text (), UNICHAR_LEN);
tess_training_tester (blob_it.data (), tess_training_tester (blob_it.data (),
//single blob //single blob
&denorm, TRUE, //correct &denorm, TRUE, //correct
ch, //correct ASCII char unichar, //correct character
1, //ASCII length strlen(unichar), //character length
NULL); NULL);
copy_outword = *(bln_word); copy_outword = *(bln_word);
copy_outword.baseline_denormalise (&denorm); copy_outword.baseline_denormalise (&denorm);
blob_it.set_to_list (copy_outword.blob_list ()); blob_it.set_to_list (copy_outword.blob_list ());
ch[0] = *word->text ();
delete bln_word; delete bln_word;
count++; count++;
} }
@ -793,7 +833,7 @@ void apply_box_testing(BLOCK_LIST *block_list) {
choice list, outword blob lists and best_choice string are the same choice list, outword blob lists and best_choice string are the same
length. A TESS screw up is indicated by a blank filled or 0 length string. length. A TESS screw up is indicated by a blank filled or 0 length string.
*/ */
if ((best_choice->string ().length () == 0) || if ((best_choice->lengths ().length () == 0) ||
(strspn (best_choice->string ().string (), " ") == (strspn (best_choice->string ().string (), " ") ==
best_choice->string ().length ())) { best_choice->string ().length ())) {
rej_count++; rej_count++;
@ -804,22 +844,22 @@ void apply_box_testing(BLOCK_LIST *block_list) {
#endif #endif
} }
else { else {
if ((best_choice->string ().length () != if ((best_choice->lengths ().length () !=
outword->blob_list ()->length ()) || outword->blob_list ()->length ()) ||
(best_choice->string ().length () != (best_choice->lengths ().length () !=
blob_choices.length ())) { blob_choices.length ())) {
tprintf tprintf
("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n", ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
best_choice->string ().string (), best_choice->string ().string (),
best_choice->string ().length (), best_choice->lengths ().length (),
outword->blob_list ()->length (), outword->blob_list ()->length (),
blob_choices.length ()); blob_choices.length ());
} }
ASSERT_HOST (best_choice->string ().length () == ASSERT_HOST (best_choice->lengths ().length () ==
outword->blob_list ()->length ()); outword->blob_list ()->length ());
ASSERT_HOST (best_choice->string ().length () == ASSERT_HOST (best_choice->lengths ().length () ==
blob_choices.length ()); blob_choices.length ());
fix_quotes ((char *) best_choice->string ().string (), fix_quotes (best_choice,
//turn to double //turn to double
outword, &blob_choices); outword, &blob_choices);
if (strcmp (best_choice->string ().string (), ch) != 0) { if (strcmp (best_choice->string ().string (), ch) != 0) {

View File

@ -27,6 +27,7 @@
#include "applybox.h" #include "applybox.h"
#include "pgedit.h" #include "pgedit.h"
#include "varabled.h" #include "varabled.h"
#include "output.h"
#include "adaptmatch.h" #include "adaptmatch.h"
BOOL_VAR(tessedit_resegment_from_boxes, FALSE, BOOL_VAR(tessedit_resegment_from_boxes, FALSE,
@ -37,6 +38,8 @@ BOOL_VAR(tessedit_train_from_boxes, FALSE,
// Minimum sensible image size to be worth running tesseract. // Minimum sensible image size to be worth running tesseract.
const int kMinRectSize = 10; const int kMinRectSize = 10;
static STRING input_file = "noname.tif";
// Start tesseract. // Start tesseract.
// The datapath must be the name of the data directory or some other file // The datapath must be the name of the data directory or some other file
// in which the data directory resides (for instance argv[0].) // in which the data directory resides (for instance argv[0].)
@ -70,6 +73,12 @@ int TessBaseAPI::InitWithLanguage(const char* datapath, const char* outputbase,
return result; return result;
} }
// Set the name of the input file. Needed only for training and
// loading a UNLV zone file.
// The name is stored in the file-scope input_file variable (default
// "noname.tif"), which FindLines hands to pgeditor_read_file.
void TessBaseAPI::SetInputName(const char* name) {
input_file = name;
}
// Recognize a rectangle from an image and return the result as a string. // Recognize a rectangle from an image and return the result as a string.
// May be called many times for a single Init. // May be called many times for a single Init.
// Currently has no error checking. // Currently has no error checking.
@ -96,6 +105,52 @@ char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
return RecognizeToString(); return RecognizeToString();
} }
// As TesseractRect but produces a box file as output.
char* TessBaseAPI::TesseractRectBoxes(const unsigned char* imagedata,
int bytes_per_pixel,
int bytes_per_line,
int left, int top,
int width, int height,
int imageheight) {
if (width < kMinRectSize || height < kMinRectSize)
return NULL; // Nothing worth doing.
// Copy/Threshold the image to the tesseract global page_image.
CopyImageToTesseract(imagedata, bytes_per_pixel, bytes_per_line,
left, top, width, height);
BLOCK_LIST block_list;
FindLines(&block_list);
// Now run the main recognition.
PAGE_RES* page_res = Recognize(&block_list, NULL);
return TesseractToBoxText(page_res, left, imageheight - (top + height));
}
char* TessBaseAPI::TesseractRectUNLV(const unsigned char* imagedata,
int bytes_per_pixel,
int bytes_per_line,
int left, int top,
int width, int height) {
if (width < kMinRectSize || height < kMinRectSize)
return NULL; // Nothing worth doing.
// Copy/Threshold the image to the tesseract global page_image.
CopyImageToTesseract(imagedata, bytes_per_pixel, bytes_per_line,
left, top, width, height);
BLOCK_LIST block_list;
FindLines(&block_list);
// Now run the main recognition.
PAGE_RES* page_res = Recognize(&block_list, NULL);
return TesseractToUNLV(page_res);
}
// Call between pages or documents etc to free up memory and forget // Call between pages or documents etc to free up memory and forget
// adaptive data. // adaptive data.
void TessBaseAPI::ClearAdaptiveClassifier() { void TessBaseAPI::ClearAdaptiveClassifier() {
@ -326,7 +381,7 @@ void TessBaseAPI::CopyBinaryRect(const unsigned char* imagedata,
image.capture(const_cast<unsigned char*>(imagedata), image.capture(const_cast<unsigned char*>(imagedata),
bytes_per_line*8, top + height, 1); bytes_per_line*8, top + height, 1);
page_image.create(width, height, 1); page_image.create(width, height, 1);
copy_sub_image(&image, left, top, width, height, &page_image, 0, 0, false); copy_sub_image(&image, left, 0, width, height, &page_image, 0, 0, false);
} }
// Low-level function to recognize the current global image to a string. // Low-level function to recognize the current global image to a string.
@ -343,7 +398,6 @@ char* TessBaseAPI::RecognizeToString() {
// Find lines from the image making the BLOCK_LIST. // Find lines from the image making the BLOCK_LIST.
void TessBaseAPI::FindLines(BLOCK_LIST* block_list) { void TessBaseAPI::FindLines(BLOCK_LIST* block_list) {
STRING input_file = "noname.tif";
// The following call creates a full-page block and then runs connected // The following call creates a full-page block and then runs connected
// component analysis and text line creation. // component analysis and text line creation.
pgeditor_read_file(input_file, block_list); pgeditor_read_file(input_file, block_list);
@ -369,21 +423,32 @@ PAGE_RES* TessBaseAPI::Recognize(BLOCK_LIST* block_list, ETEXT_DESC* monitor) {
return page_res; return page_res;
} }
// Compute an upper bound on the number of chars the output text for
// page_res might occupy. Starting from 2, every word that has a
// best_choice contributes the length of its string plus one, and one more
// for each rejected entry in its reject_map. Words without a best_choice
// contribute nothing.
int TessBaseAPI::TextLength(PAGE_RES* page_res) {
  PAGE_RES_IT it(page_res);
  int length_bound = 2;
  // Walk every word on the page.
  for (it.restart_page(); it.word() != NULL; it.forward()) {
    WERD_RES* word_res = it.word();
    WERD_CHOICE* best = word_res->best_choice;
    if (best == NULL) {
      continue;
    }
    length_bound += best->string().length() + 1;
    for (int pos = 0; pos < word_res->reject_map.length(); ++pos) {
      if (word_res->reject_map[pos].rejected()) {
        length_bound += 1;
      }
    }
  }
  return length_bound;
}
// Make a text string from the internal data structures. // Make a text string from the internal data structures.
// The input page_res is deleted. // The input page_res is deleted.
char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) { char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) {
if (page_res != NULL) { if (page_res != NULL) {
int total_length = 2; int total_length = TextLength(page_res);
PAGE_RES_IT page_res_it(page_res); PAGE_RES_IT page_res_it(page_res);
// Iterate over the data structures to extract the recognition result.
for (page_res_it.restart_page(); page_res_it.word () != NULL;
page_res_it.forward()) {
WERD_RES *word = page_res_it.word();
WERD_CHOICE* choice = word->best_choice;
if (choice != NULL) {
total_length += choice->string().length() + 1;
}
}
char* result = new char[total_length]; char* result = new char[total_length];
char* ptr = result; char* ptr = result;
for (page_res_it.restart_page(); page_res_it.word () != NULL; for (page_res_it.restart_page(); page_res_it.word () != NULL;
@ -406,3 +471,207 @@ char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) {
} }
return NULL; return NULL;
} }
// Writes one box-file line per blob of the word into word_str and returns
// the number of chars written (no terminating NUL is counted; the caller
// terminates the overall buffer).
//
// Each line has the form "<unichar> <left> <bottom> <right> <top>\n", where
// the coordinates are the blob's denormalized bounding box shifted by the
// given left/bottom offsets.
//
// word      recognized word; its outword, denorm and best_choice are read.
// row       currently unused.
// left      added to the x coordinates of every box.
// bottom    added to the y coordinates of every box.
// word_str  destination buffer; its size is not known here, the caller
//           must have reserved enough room.
//
// Returns 0 if the word has no best_choice: TextLength reserves no space
// for such words, and there is no text to pair with the blobs.
static int ConvertWordToBoxText(WERD_RES *word,
                                ROW_RES* row,
                                int left,
                                int bottom,
                                char* word_str) {
  if (word->best_choice == NULL)
    return 0;

  // Copy the output word and denormalize it back to image coords.
  WERD copy_outword;
  copy_outword = *(word->outword);
  copy_outword.baseline_denormalise(&word->denorm);
  PBLOB_IT blob_it;
  blob_it.set_to_list(copy_outword.blob_list());
  int length = copy_outword.blob_list()->length();

  // Every blob consumes one entry of best_choice->lengths(). Never walk
  // past the end of that table, even if the word has more blobs than
  // classification units.
  const int num_units = word->best_choice->lengths().length();
  if (length > num_units)
    length = num_units;

  int output_size = 0;
  int offset = 0;  // Start of the current unit within best_choice->string().
  for (int index = 0; index < length; ++index, blob_it.forward()) {
    const int unit_len = word->best_choice->lengths()[index];
    PBLOB* blob = blob_it.data();
    BOX blob_box = blob->bounding_box();
    if (word->tess_failed ||
        blob_box.left() < 0 ||
        blob_box.right() > page_image.get_xsize() ||
        blob_box.bottom() < 0 ||
        blob_box.top() > page_image.get_ysize()) {
      // Bounding boxes can be illegal when tess fails on a word.
      blob_box = word->word->bounding_box();  // Use original word as backup.
      tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n",
              blob_box.left(), blob_box.bottom(),
              blob_box.right(), blob_box.top());
    }

    // A single classification unit can be composed of several UTF-8
    // characters. Append each of them to the result.
    for (int sub = 0; sub < unit_len; ++sub) {
      char ch = word->best_choice->string()[offset + sub];
      // Tesseract uses space for recognition failure. Fix to a reject
      // character, '~' so we don't create illegal box files.
      if (ch == ' ')
        ch = '~';
      word_str[output_size++] = ch;
    }
    sprintf(word_str + output_size, " %d %d %d %d\n",
            blob_box.left() + left, blob_box.bottom() + bottom,
            blob_box.right() + left, blob_box.top() + bottom);
    output_size += strlen(word_str + output_size);
    offset += unit_len;
  }
  return output_size;
}
// Multiplier for textlength assumes 4 numbers @ 5 digits and a space
// plus the newline and the original character = 4*(5+1)+2
const int kMaxCharsPerChar = 26;
// Render the recognition results as box-file text (the format needed for
// training): the output of ConvertWordToBoxText for every word on the page,
// concatenated and NUL-terminated.
// left and bottom are forwarded unchanged to ConvertWordToBoxText.
// Returns NULL if page_res is NULL. Otherwise page_res is deleted and the
// returned buffer was allocated with new[].
char* TessBaseAPI::TesseractToBoxText(PAGE_RES* page_res,
                                      int left, int bottom) {
  if (page_res == NULL) {
    return NULL;
  }

  // Budget kMaxCharsPerChar output chars for every char TextLength counts.
  const int buffer_size = TextLength(page_res) * kMaxCharsPerChar;
  PAGE_RES_IT it(page_res);
  char* box_text = new char[buffer_size];
  char* write_pos = box_text;

  it.restart_page();
  while (it.word() != NULL) {
    WERD_RES* current = it.word();
    write_pos += ConvertWordToBoxText(current, it.row(), left, bottom,
                                      write_pos);
    it.forward();
  }
  *write_pos = '\0';

  delete page_res;
  return box_text;
}
// Constants used when making a text string from the internal data
// structures in UNLV-format: Latin-1 with specific reject and suspect codes.

// Character written to the output in place of anything unrecognized.
const char kUnrecognized = '~';
// Conversion table for non-latin characters.
// Maps characters out of the latin set into the latin set.
// kUniChs and kLatinChs are parallel arrays: kUniChs[j] is replaced by
// kLatinChs[j]. Both are terminated by a 0 sentinel.
// TODO(rays) incorporate this translation into unicharset.
// In order: U+20AC euro sign, U+201C/U+201D left/right double quotes,
// U+2018/U+2019 left/right single quotes, U+2022 bullet, U+2014 em dash.
const int kUniChs[] = {
0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0
};
// Latin chars corresponding to the unicode chars above.
// In order: cent sign, double quote (twice), apostrophe (twice),
// middle dot, hyphen-minus.
const int kLatinChs[] = {
0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0
};
// Removes the first char of the NUL-terminated string str in place.
// Uses memmove because source and destination overlap; strcpy has
// undefined behaviour for overlapping buffers.
static void DropFirstChar(char* str) {
  if (*str != '\0')
    memmove(str, str + 1, strlen(str));  // Tail plus the terminating NUL.
}

// Makes a text string from the internal data structures in UNLV-format.
// The input page_res is deleted.
// Output rules, as implemented below:
//  - a run of crunched words (unlv_crunch_mode != CR_NONE) collapses to at
//    most one kUnrecognized char, optionally preceded by a space;
//  - in normal words, ' ', '~' and '|' become kUnrecognized;
//  - a char whose reject_map entry is still rejected after
//    set_unlv_suspects() is prefixed with '^';
//  - chars are mapped through kUniChs/kLatinChs; anything still above 0xff
//    becomes kUnrecognized;
//  - words are separated by a single space, W_EOL words end the line with
//    '\n', and a final '\n' is always appended.
// Returns a NUL-terminated buffer allocated with new[] (sized by
// TextLength(page_res)), or NULL when page_res is NULL.
char* TessBaseAPI::TesseractToUNLV(PAGE_RES* page_res) {
  bool tilde_crunch_written = false;
  bool last_char_was_newline = true;
  bool last_char_was_tilde = false;

  if (page_res != NULL) {
    int total_length = TextLength(page_res);
    PAGE_RES_IT page_res_it(page_res);
    char* result = new char[total_length];
    char* ptr = result;
    for (page_res_it.restart_page(); page_res_it.word () != NULL;
         page_res_it.forward()) {
      WERD_RES *word = page_res_it.word();
      // Process the current word.
      if (word->unlv_crunch_mode != CR_NONE) {
        // Crunched word: emit at most one reject char per run of crunched
        // words, unless a definite space forces a new one.
        if (word->unlv_crunch_mode != CR_DELETE &&
            (!tilde_crunch_written ||
             (word->unlv_crunch_mode == CR_KEEP_SPACE &&
              word->word->space () > 0 &&
              !word->word->flag (W_FUZZY_NON) &&
              !word->word->flag (W_FUZZY_SP)))) {
          if (!word->word->flag (W_BOL) &&
              word->word->space () > 0 &&
              !word->word->flag (W_FUZZY_NON) &&
              !word->word->flag (W_FUZZY_SP)) {
            /* Write a space to separate from preceding good text */
            *ptr++ = ' ';
            last_char_was_tilde = false;
          }
          if (!last_char_was_tilde) {
            // Write a reject char.
            last_char_was_tilde = true;
            *ptr++ = kUnrecognized;
            tilde_crunch_written = true;
            last_char_was_newline = false;
          }
        }
      } else {
        // NORMAL PROCESSING of non tilde crunched words.
        tilde_crunch_written = false;
        if (last_char_was_tilde &&
            word->word->space () == 0 &&
            (word->best_choice->string ()[0] == ' ')) {
          /* Prevent adjacent tilde across words - we know that adjacent
             tildes within words have been removed */
          // Drop the leading reject from the text, its length entry, its
          // reject_map entry and its blob so that all four stay in step.
          DropFirstChar((char *) word->best_choice->string().string ());
          DropFirstChar((char *) word->best_choice->lengths().string ());
          word->reject_map.remove_pos (0);
          PBLOB_IT blob_it = word->outword->blob_list ();
          delete blob_it.extract ();   //get rid of reject blob
        }
        if (word->word->flag(W_REP_CHAR) && tessedit_consistent_reps)
          ensure_rep_chars_are_consistent(word);

        set_unlv_suspects(word);
        const char* wordstr = word->best_choice->string().string();
        if (wordstr[0] != 0) {
          // Separate words on the same line with a single space.
          if (!last_char_was_newline)
            *ptr++ = ' ';
          else
            last_char_was_newline = false;
          // lengths[i] is the number of bytes of wordstr used by unit i;
          // offset is the byte position where unit i starts.
          int offset = 0;
          const STRING& lengths = word->best_choice->lengths();
          int length = lengths.length();
          for (int i = 0; i < length; offset += lengths[i++]) {
            if (wordstr[offset] == ' ' ||
                wordstr[offset] == '~' ||
                wordstr[offset] == '|') {
              *ptr++ = kUnrecognized;
              last_char_was_tilde = true;
            } else {
              if (word->reject_map[i].rejected())
                *ptr++ = '^';
              UNICHAR ch(wordstr + offset, lengths[i]);
              int uni_ch = ch.first_uni();
              // Fold known non-latin chars into their latin equivalents.
              for (int j = 0; kUniChs[j] != 0; ++j) {
                if (kUniChs[j] == uni_ch) {
                  uni_ch = kLatinChs[j];
                  break;
                }
              }
              if (uni_ch <= 0xff) {
                *ptr++ = static_cast<char>(uni_ch);
                last_char_was_tilde = false;
              } else {
                // Not representable in a single Latin-1 byte.
                *ptr++ = kUnrecognized;
                last_char_was_tilde = true;
              }
            }
          }
        }
      }
      if (word->word->flag(W_EOL) && !last_char_was_newline) {
        /* Add a new line output */
        *ptr++ = '\n';
        tilde_crunch_written = false;
        last_char_was_newline = true;
        last_char_was_tilde = false;
      }
    }
    *ptr++ = '\n';
    *ptr = '\0';
    delete page_res;
    return result;
  }
  return NULL;
}

View File

@ -20,8 +20,6 @@
#ifndef THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__ #ifndef THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__
#define THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__ #define THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__
#include <string>
class PAGE_RES; class PAGE_RES;
class BLOCK_LIST; class BLOCK_LIST;
@ -56,6 +54,10 @@ class TessBaseAPI {
const char* language, const char* configfile, const char* language, const char* configfile,
bool numeric_mode, int argc, char* argv[]); bool numeric_mode, int argc, char* argv[]);
// Set the name of the input file. Needed only for training and
// reading a UNLV zone file.
static void SetInputName(const char* name);
// Recognize a rectangle from an image and return the result as a string. // Recognize a rectangle from an image and return the result as a string.
// May be called many times for a single Init. // May be called many times for a single Init.
// Currently has no error checking. // Currently has no error checking.
@ -71,6 +73,19 @@ class TessBaseAPI {
int bytes_per_pixel, int bytes_per_pixel,
int bytes_per_line, int bytes_per_line,
int left, int top, int width, int height); int left, int top, int width, int height);
// As TesseractRect but produces a box file as output.
// Image height is needed as well as rect height, since output y-coords
// will be relative to the bottom of the image.
static char* TesseractRectBoxes(const unsigned char* imagedata,
int bytes_per_pixel,
int bytes_per_line,
int left, int top, int width, int height,
int imageheight);
// As TesseractRect but produces UNLV-style output.
static char* TesseractRectUNLV(const unsigned char* imagedata,
int bytes_per_pixel,
int bytes_per_line,
int left, int top, int width, int height);
// Call between pages or documents etc to free up memory and forget // Call between pages or documents etc to free up memory and forget
// adaptive data. // adaptive data.
@ -153,8 +168,18 @@ class TessBaseAPI {
static PAGE_RES* Recognize(BLOCK_LIST* block_list, static PAGE_RES* Recognize(BLOCK_LIST* block_list,
struct ETEXT_STRUCT* monitor); struct ETEXT_STRUCT* monitor);
// Return the maximum length that the output text string might occupy.
static int TextLength(PAGE_RES* page_res);
// Convert (and free) the internal data structures into a text string. // Convert (and free) the internal data structures into a text string.
static char* TesseractToText(PAGE_RES* page_res); static char* TesseractToText(PAGE_RES* page_res);
// Make a text string from the internal data structures.
// The input page_res is deleted.
// The text string takes the form of a box file as needed for training.
static char* TesseractToBoxText(PAGE_RES* page_res, int left, int bottom);
// Make a text string from the internal data structures.
// The input page_res is deleted. The text string is converted
// to UNLV-format: Latin-1 with specific reject and suspect codes.
static char* TesseractToUNLV(PAGE_RES* page_res);
}; };
#endif // THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__ #endif // THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__

View File

@ -35,6 +35,7 @@
#include "docqual.h" #include "docqual.h"
#include "output.h" #include "output.h"
#include "bestfirst.h" #include "bestfirst.h"
#include "globals.h"
#define EXTERN #define EXTERN
@ -55,12 +56,12 @@ EXTERN BOOL_EVAR (tessedit_write_block_separators, FALSE,
"Write block separators in output"); "Write block separators in output");
EXTERN BOOL_VAR (tessedit_write_raw_output, FALSE, EXTERN BOOL_VAR (tessedit_write_raw_output, FALSE,
"Write raw stuff to name.raw"); "Write raw stuff to name.raw");
EXTERN BOOL_EVAR (tessedit_write_output, TRUE, "Write text to name.txt"); EXTERN BOOL_EVAR (tessedit_write_output, FALSE, "Write text to name.txt");
EXTERN BOOL_EVAR (tessedit_write_ratings, FALSE, EXTERN BOOL_EVAR (tessedit_write_ratings, FALSE,
"Return ratings in IPEOCRAPI data"); "Return ratings in IPEOCRAPI data");
EXTERN BOOL_EVAR (tessedit_write_txt_map, TRUE, EXTERN BOOL_EVAR (tessedit_write_txt_map, FALSE,
"Write .txt to .etx map file"); "Write .txt to .etx map file");
EXTERN BOOL_EVAR (tessedit_write_rep_codes, TRUE, EXTERN BOOL_EVAR (tessedit_write_rep_codes, FALSE,
"Write repetition char code"); "Write repetition char code");
EXTERN BOOL_EVAR (tessedit_write_unlv, FALSE, "Write .unlv output file"); EXTERN BOOL_EVAR (tessedit_write_unlv, FALSE, "Write .unlv output file");
EXTERN STRING_EVAR (unrecognised_char, "|", EXTERN STRING_EVAR (unrecognised_char, "|",
@ -106,7 +107,6 @@ INT32 pixels_to_pts( //convert coords
return (INT32) (pts + 0.5); //round it return (INT32) (pts + 0.5); //round it
} }
void output_pass( //Tess output pass //send to api void output_pass( //Tess output pass //send to api
PAGE_RES_IT &page_res_it, PAGE_RES_IT &page_res_it,
BOOL8 write_to_shm, BOOL8 write_to_shm,
@ -119,8 +119,7 @@ void output_pass( //Tess output pass //send to api
if (tessedit_write_txt_map) if (tessedit_write_txt_map)
txt_mapfile = open_outfile (".map"); txt_mapfile = open_outfile (".map");
if (tessedit_write_unlv)
unlv_file = open_outfile (".unlv");
page_res_it.restart_page (); page_res_it.restart_page ();
block_of_last_word = NULL; block_of_last_word = NULL;
while (page_res_it.word () != NULL) { while (page_res_it.word () != NULL) {
@ -189,7 +188,6 @@ void output_pass( //Tess output pass //send to api
} }
} }
/************************************************************************* /*************************************************************************
* write_results() * write_results()
* *
@ -211,9 +209,10 @@ void write_results( //output a word
) { ) {
//word to do //word to do
WERD_RES *word = page_res_it.word (); WERD_RES *word = page_res_it.word ();
WERD_CHOICE *ep_choice; //ep format // WERD_CHOICE *ep_choice; //ep format
STRING repetition_code; STRING repetition_code;
const STRING *wordstr; const STRING *wordstr;
STRING wordstr_lengths;
const char *text; const char *text;
int i; int i;
char unrecognised = STRING (unrecognised_char)[0]; char unrecognised = STRING (unrecognised_char)[0];
@ -312,15 +311,12 @@ void write_results( //output a word
if (tessedit_write_output && !NO_BLOCK) if (tessedit_write_output && !NO_BLOCK)
fprintf (textfile, "%s", txt_chs); fprintf (textfile, "%s", txt_chs);
if (tessedit_write_unlv)
fprintf (unlv_file, "%s", txt_chs);
if (tessedit_write_txt_map) if (tessedit_write_txt_map)
fprintf (txt_mapfile, "%s", map_chs); fprintf (txt_mapfile, "%s", map_chs);
//terminate string //terminate string
ep_chars[ep_chars_index] = '\0'; ep_chars[ep_chars_index] = '\0';
word->ep_choice = new WERD_CHOICE (ep_chars, 0, 0, NO_PERM); word->ep_choice = new WERD_CHOICE (ep_chars, NULL, 0, 0, NO_PERM);
if (force_eol) if (force_eol)
empty_block = TRUE; empty_block = TRUE;
@ -345,6 +341,8 @@ void write_results( //output a word
words have been removed */ words have been removed */
ptr = (char *) word->best_choice->string ().string (); ptr = (char *) word->best_choice->string ().string ();
strcpy (ptr, ptr + 1); //shuffle up strcpy (ptr, ptr + 1); //shuffle up
ptr = (char *) word->best_choice->lengths ().string ();
strcpy (ptr, ptr + 1); //shuffle up
word->reject_map.remove_pos (0); word->reject_map.remove_pos (0);
blob_it = word->outword->blob_list (); blob_it = word->outword->blob_list ();
delete blob_it.extract (); //get rid of reject blob delete blob_it.extract (); //get rid of reject blob
@ -354,8 +352,10 @@ void write_results( //output a word
last_char_was_tilde = FALSE; last_char_was_tilde = FALSE;
else { else {
if (word->reject_map.length () > 0) { if (word->reject_map.length () > 0) {
if (word->best_choice->string ()[word->reject_map.length () - 1] == for (i = 0, ptr = (char *) word->best_choice->string().string();
' ') i < word->reject_map.length () - 1; ++i)
ptr += word->best_choice->lengths()[i];
if (*ptr == ' ')
last_char_was_tilde = TRUE; last_char_was_tilde = TRUE;
else else
last_char_was_tilde = FALSE; last_char_was_tilde = FALSE;
@ -365,7 +365,7 @@ void write_results( //output a word
/* else it is unchanged as there are no output chars */ /* else it is unchanged as there are no output chars */
} }
ptr = (char *) word->best_choice->string ().string (); ptr = (char *) word->best_choice->lengths ().string ();
ASSERT_HOST (strlen (ptr) == word->reject_map.length ()); ASSERT_HOST (strlen (ptr) == word->reject_map.length ());
if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps) if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps)
@ -379,21 +379,26 @@ void write_results( //output a word
dict_word (word->best_choice->string ().string ())); dict_word (word->best_choice->string ().string ()));
} }
#if 0
if (tessedit_write_unlv) { if (tessedit_write_unlv) {
write_unlv_text(word); write_unlv_text(word);
} }
#endif
if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) { if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
repetition_code = "|^~R"; repetition_code = "|^~R";
repetition_code += get_rep_char (word); wordstr_lengths = "\001\001\001\001";
repetition_code += unicharset.id_to_unichar(get_rep_char (word));
wordstr_lengths += strlen(unicharset.id_to_unichar(get_rep_char (word)));
wordstr = &repetition_code; wordstr = &repetition_code;
} }
else { else {
wordstr = &(word->best_choice->string ()); wordstr = &(word->best_choice->string ());
wordstr_lengths = word->best_choice->lengths ();
if (tessedit_zero_rejection) { if (tessedit_zero_rejection) {
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
text = wordstr->string (); text = wordstr->string ();
for (i = 0; text[i] != '\0'; i++) { for (i = 0; *text != '\0'; text += word->best_choice->lengths()[i++]) {
if (word->reject_map[i].rejected ()) if (word->reject_map[i].rejected ())
word->reject_map[i].setrej_minimal_rej_accept (); word->reject_map[i].setrej_minimal_rej_accept ();
} }
@ -401,8 +406,8 @@ void write_results( //output a word
if (tessedit_minimal_rejection) { if (tessedit_minimal_rejection) {
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
text = wordstr->string (); text = wordstr->string ();
for (i = 0; text[i] != '\0'; i++) { for (i = 0; *text != '\0'; text += word->best_choice->lengths()[i++]) {
if ((text[i] != ' ') && word->reject_map[i].rejected ()) if ((*text != ' ') && word->reject_map[i].rejected ())
word->reject_map[i].setrej_minimal_rej_accept (); word->reject_map[i].setrej_minimal_rej_accept ();
} }
} }
@ -410,8 +415,9 @@ void write_results( //output a word
if (write_to_shm) if (write_to_shm)
write_shm_text (word, page_res_it.block ()->block, write_shm_text (word, page_res_it.block ()->block,
page_res_it.row (), *wordstr); page_res_it.row (), *wordstr, wordstr_lengths);
#if 0
if (tessedit_write_output) if (tessedit_write_output)
write_cooked_text (word->word, *wordstr, TRUE, FALSE, textfile); write_cooked_text (word->word, *wordstr, TRUE, FALSE, textfile);
@ -424,12 +430,12 @@ void write_results( //output a word
ep_choice = make_epaper_choice (word, newline_type); ep_choice = make_epaper_choice (word, newline_type);
word->ep_choice = ep_choice; word->ep_choice = ep_choice;
#endif
character_count += word->best_choice->string ().length (); character_count += word->best_choice->lengths ().length ();
word_count++; word_count++;
} }
/********************************************************************** /**********************************************************************
* make_epaper_choice * make_epaper_choice
* *
@ -437,6 +443,7 @@ void write_results( //output a word
* determine whether each blob should be rejected. * determine whether each blob should be rejected.
**********************************************************************/ **********************************************************************/
#if 0
WERD_CHOICE *make_epaper_choice( //convert one word WERD_CHOICE *make_epaper_choice( //convert one word
WERD_RES *word, //word to do WERD_RES *word, //word to do
char newline_type //type of newline char newline_type //type of newline
@ -482,7 +489,8 @@ WERD_CHOICE *make_epaper_choice( //convert one word
if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) { if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
strcpy (word_string + index, "|^~R"); strcpy (word_string + index, "|^~R");
index += 4; index += 4;
word_string[index++] = get_rep_char (word); strcpy(word_string + index, unicharset.id_to_unichar(get_rep_char (word)));
index += strlen(unicharset.id_to_unichar(get_rep_char (word)));
} }
else { else {
if (!blob_it.empty ()) if (!blob_it.empty ())
@ -537,7 +545,7 @@ WERD_CHOICE *make_epaper_choice( //convert one word
ASSERT_HOST (strlen (word_string) == index); ASSERT_HOST (strlen (word_string) == index);
return new WERD_CHOICE (word_string, 0, 0, NO_PERM); return new WERD_CHOICE (word_string, 0, 0, NO_PERM);
} }
#endif
/********************************************************************** /**********************************************************************
* make_reject * make_reject
@ -653,6 +661,7 @@ char determine_newline_type( //test line ends
* to the given file. * to the given file.
**********************************************************************/ **********************************************************************/
#if 0
void write_cooked_text( //write output void write_cooked_text( //write output
WERD *word, //word to do WERD *word, //word to do
const STRING &text, //text to write const STRING &text, //text to write
@ -749,6 +758,7 @@ void write_cooked_text( //write output
if (status != 0) if (status != 0)
WRITEFAILED.error ("write_cooked_text", EXIT, "Fflush Errno: %d", errno); WRITEFAILED.error ("write_cooked_text", EXIT, "Fflush Errno: %d", errno);
} }
#endif
/********************************************************************** /**********************************************************************
@ -761,7 +771,8 @@ void write_shm_text( //write output
WERD_RES *word, //word to do WERD_RES *word, //word to do
BLOCK *block, //block it is from BLOCK *block, //block it is from
ROW_RES *row, //row it is from ROW_RES *row, //row it is from
const STRING &text //text to write const STRING &text, //text to write
const STRING &text_lengths
) { ) {
INT32 index; //char counter INT32 index; //char counter
INT32 index2; //char counter INT32 index2; //char counter
@ -777,6 +788,8 @@ void write_shm_text( //write output
WERD copy_outword; // copy to denorm WERD copy_outword; // copy to denorm
UINT32 rating; //of char UINT32 rating; //of char
BOOL8 lineend; //end of line BOOL8 lineend; //end of line
int offset;
int offset2;
//point size //point size
ptsize = pixels_to_pts ((INT32) (row->row->x_height () + row->row->ascenders () - row->row->descenders ()), 300); ptsize = pixels_to_pts ((INT32) (row->row->x_height () + row->row->ascenders () - row->row->descenders ()), 300);
@ -786,13 +799,14 @@ void write_shm_text( //write output
copy_outword = *(word->outword); copy_outword = *(word->outword);
copy_outword.baseline_denormalise (&word->denorm); copy_outword.baseline_denormalise (&word->denorm);
blob_it.set_to_list (copy_outword.blob_list ()); blob_it.set_to_list (copy_outword.blob_list ());
length = text.length (); length = text_lengths.length ();
if (length > 0) { if (length > 0) {
blanks = word->word->space (); blanks = word->word->space ();
if (blanks == 0 && tessedit_word_for_word && !word->word->flag (W_BOL)) if (blanks == 0 && tessedit_word_for_word && !word->word->flag (W_BOL))
blanks = 1; blanks = 1;
for (index = 0; index < length; index++, blob_it.forward ()) { for (index = 0, offset = 0; index < length;
offset += text_lengths[index++], blob_it.forward ()) {
blob = blob_it.data (); blob = blob_it.data ();
blob_box = blob->bounding_box (); blob_box = blob->bounding_box ();
@ -804,7 +818,7 @@ void write_shm_text( //write output
if (tessedit_write_ratings) if (tessedit_write_ratings)
rating = (UINT32) (-word->best_choice->certainty () / 0.035); rating = (UINT32) (-word->best_choice->certainty () / 0.035);
else if (tessedit_zero_rejection) else if (tessedit_zero_rejection)
rating = text[index] == ' ' ? 100 : 0; rating = text[offset] == ' ' ? 100 : 0;
else else
rating = word->reject_map[index].accepted ()? 0 : 100; rating = word->reject_map[index].accepted ()? 0 : 100;
if (rating > 255) if (rating > 255)
@ -819,22 +833,41 @@ void write_shm_text( //write output
lineend = word->word->flag (W_EOL) && index == length - 1; lineend = word->word->flag (W_EOL) && index == length - 1;
if (word->word->flag (W_EOL) && tessedit_zero_rejection if (word->word->flag (W_EOL) && tessedit_zero_rejection
&& index < length - 1 && text[index + 1] == ' ') { && index < length - 1 && text[index + text_lengths[index]] == ' ') {
for (index2 = index + 1; index2 < length && text[index2] == ' '; for (index2 = index + 1, offset2 = offset + text_lengths[index];
index2++); index2 < length && text[offset2] == ' ';
offset2 += text_lengths[index2++]);
if (index2 == length) if (index2 == length)
lineend = TRUE; lineend = TRUE;
} }
if (!tessedit_zero_rejection || text[index] != ' ' if (!tessedit_zero_rejection || text[offset] != ' '
|| tessedit_word_for_word) { || tessedit_word_for_word) {
//confidence //confidence
ocr_append_char (text[index] == ' ' ? unrecognised : text[index], blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font, (UINT8) rating, if (text[offset] == ' ') {
ptsize, //point size ocr_append_char (unrecognised,
blanks, enhancement, //enhancement blob_box.left (), blob_box.right (),
OCR_CDIR_LEFT_RIGHT, page_image.get_ysize () - 1 - blob_box.top (),
OCR_LDIR_DOWN_RIGHT, page_image.get_ysize () - 1 - blob_box.bottom (),
lineend ? OCR_NL_NEWLINE : OCR_NL_NONE); font, (UINT8) rating,
ptsize, //point size
blanks, enhancement, //enhancement
OCR_CDIR_LEFT_RIGHT,
OCR_LDIR_DOWN_RIGHT,
lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
} else {
for (int suboffset = 0; suboffset < text_lengths[index]; ++suboffset)
ocr_append_char (text[offset + suboffset],
blob_box.left (), blob_box.right (),
page_image.get_ysize () - 1 - blob_box.top (),
page_image.get_ysize () - 1 - blob_box.bottom (),
font, (UINT8) rating,
ptsize, //point size
blanks, enhancement, //enhancement
OCR_CDIR_LEFT_RIGHT,
OCR_LDIR_DOWN_RIGHT,
lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
}
blanks = 0; blanks = 0;
} }
@ -863,13 +896,17 @@ void write_shm_text( //write output
lineend = word->word->flag (W_EOL); lineend = word->word->flag (W_EOL);
//font index //font index
ocr_append_char (unrecognised, blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font, ocr_append_char (unrecognised,
rating, //confidence blob_box.left (), blob_box.right (),
ptsize, //point size page_image.get_ysize () - 1 - blob_box.top (),
blanks, enhancement, //enhancement page_image.get_ysize () - 1 - blob_box.bottom (),
OCR_CDIR_LEFT_RIGHT, font,
OCR_LDIR_DOWN_RIGHT, rating, //confidence
lineend ? OCR_NL_NEWLINE : OCR_NL_NONE); ptsize, //point size
blanks, enhancement, //enhancement
OCR_CDIR_LEFT_RIGHT,
OCR_LDIR_DOWN_RIGHT,
lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
} }
} }
@ -888,6 +925,7 @@ void write_shm_text( //write output
* newdiff needs etx files! * newdiff needs etx files!
**********************************************************************/ **********************************************************************/
#if 0
void write_map( //output a map file void write_map( //output a map file
FILE *mapfile, //mapfile to write to FILE *mapfile, //mapfile to write to
WERD_RES *word) { WERD_RES *word) {
@ -937,6 +975,7 @@ void write_map( //output a map file
if (status != 0) if (status != 0)
WRITEFAILED.error ("write_map", EXIT, "fflush Errno: %d", errno); WRITEFAILED.error ("write_map", EXIT, "fflush Errno: %d", errno);
} }
#endif
/************************************************************************* /*************************************************************************
@ -957,6 +996,7 @@ FILE *open_outfile( //open .map & .unlv file
} }
#if 0
void write_unlv_text(WERD_RES *word) { void write_unlv_text(WERD_RES *word) {
const char *wordstr; const char *wordstr;
@ -1015,6 +1055,7 @@ void write_unlv_text(WERD_RES *word) {
if (status != 0) if (status != 0)
WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno); WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno);
} }
#endif
/************************************************************************* /*************************************************************************
@ -1022,21 +1063,24 @@ void write_unlv_text(WERD_RES *word) {
* Return the first accepted character from the repetition string. This is the * Return the first accepted character from the repetition string. This is the
* character which is repeated - as determined earlier by fix_rep_char() * character which is repeated - as determined earlier by fix_rep_char()
*************************************************************************/ *************************************************************************/
char get_rep_char( // what char is repeated? UNICHAR_ID get_rep_char(WERD_RES *word) { // what char is repeated?
WERD_RES *word) {
int i; int i;
int offset;
for (i = 0; for (i = 0, offset = 0;
((i < word->reject_map.length ()) && ((i < word->reject_map.length ()) &&
(word->reject_map[i].rejected ())); i++); (word->reject_map[i].rejected ()));
offset += word->best_choice->lengths()[i++]);
if (i < word->reject_map.length ()) if (i < word->reject_map.length ())
return word->best_choice->string ()[i]; return unicharset.unichar_to_id(word->best_choice->string().string()
+ offset,
word->best_choice->lengths()[i]);
else else
return STRING (unrecognised_char)[0]; return unicharset.unichar_to_id(unrecognised_char.string());
} }
void ensure_rep_chars_are_consistent(WERD_RES *word) { void ensure_rep_chars_are_consistent(WERD_RES *word) {
#if 0
char rep_char = get_rep_char (word); char rep_char = get_rep_char (word);
char *ptr; char *ptr;
@ -1045,8 +1089,24 @@ void ensure_rep_chars_are_consistent(WERD_RES *word) {
if (*ptr != rep_char) if (*ptr != rep_char)
*ptr = rep_char; *ptr = rep_char;
} }
} #endif
#if 0
UNICHAR_ID rep_char = get_rep_char (word); //TODO(tkielbus) Reactivate
int i;
char *ptr;
STRING consistent_string;
STRING consistent_string_lengths;
ptr = (char *) word->best_choice->string ().string ();
for (i = 0; *ptr != '\0'; ptr += word->best_choice->lengths()[i++]) {
consistent_string += unicharset.id_to_unichar(rep_char);
consistent_string_lengths += strlen(unicharset.id_to_unichar(rep_char));
}
word->best_choice->string() = consistent_string;
word->best_choice->lengths() = consistent_string_lengths;
#endif
}
/************************************************************************* /*************************************************************************
* SUSPECT LEVELS * SUSPECT LEVELS
@ -1062,7 +1122,9 @@ void ensure_rep_chars_are_consistent(WERD_RES *word) {
void set_unlv_suspects(WERD_RES *word) { void set_unlv_suspects(WERD_RES *word) {
int len = word->reject_map.length (); int len = word->reject_map.length ();
int i; int i;
int offset;
const char *ptr; const char *ptr;
const char *lengths = word->best_choice->lengths ().string ();
float rating_per_ch; float rating_per_ch;
ptr = word->best_choice->string ().string (); ptr = word->best_choice->string ().string ();
@ -1080,10 +1142,12 @@ void set_unlv_suspects(WERD_RES *word) {
/* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/ /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
if (safe_dict_word (ptr) && (count_alphas (ptr) > suspect_short_words)) { if (safe_dict_word (ptr) && (count_alphas (ptr, lengths) >
suspect_short_words)) {
/* Unreject alphas in dictionary words */ /* Unreject alphas in dictionary words */
for (i = 0; i < len; i++) { for (i = 0, offset = 0; i < len; offset += lengths[i++]) {
if (word->reject_map[i].rejected () && isalpha (ptr[i])) if (word->reject_map[i].rejected () &&
unicharset.get_isalpha (ptr + offset, lengths[i]))
word->reject_map[i].setrej_minimal_rej_accept (); word->reject_map[i].setrej_minimal_rej_accept ();
} }
} }
@ -1095,8 +1159,8 @@ void set_unlv_suspects(WERD_RES *word) {
if ((word->tess_accepted) || (rating_per_ch < suspect_accept_rating)) { if ((word->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
/* Unreject any Tess Acceptable word - but NOT tess reject chs*/ /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
for (i = 0; i < len; i++) { for (i = 0, offset = 0; i < len; offset += lengths[i++]) {
if (word->reject_map[i].rejected () && (ptr[i] != ' ')) if (word->reject_map[i].rejected () && (ptr[offset] != ' '))
word->reject_map[i].setrej_minimal_rej_accept (); word->reject_map[i].setrej_minimal_rej_accept ();
} }
} }
@ -1130,9 +1194,11 @@ void set_unlv_suspects(WERD_RES *word) {
} }
} }
if ((acceptable_word_string (word->best_choice->string ().string ()) if ((acceptable_word_string (word->best_choice->string ().string (),
word->best_choice->lengths ().string ())
!= AC_UNACCEPTABLE) || != AC_UNACCEPTABLE) ||
acceptable_number_string (word->best_choice->string ().string ())) { acceptable_number_string (word->best_choice->string ().string (),
word->best_choice->lengths ().string ())) {
if (word->reject_map.length () > suspect_short_words) { if (word->reject_map.length () > suspect_short_words) {
for (i = 0; i < len; i++) { for (i = 0; i < len; i++) {
if (word->reject_map[i].rejected () && if (word->reject_map[i].rejected () &&
@ -1149,11 +1215,12 @@ void set_unlv_suspects(WERD_RES *word) {
INT16 count_alphas( //how many alphas INT16 count_alphas( //how many alphas
const char *s) { const char *s,
const char *lengths) {
int count = 0; int count = 0;
for (; *s != '\0'; s++) { for (; *s != '\0'; s += *(lengths++)) {
if (isalpha (*s)) if (unicharset.get_isalpha(s, *lengths))
count++; count++;
} }
return count; return count;
@ -1161,36 +1228,43 @@ INT16 count_alphas( //how many alphas
INT16 count_alphanums( //how many alphanums INT16 count_alphanums( //how many alphanums
const char *s) { const char *s,
const char *lengths) {
int count = 0; int count = 0;
for (; *s != '\0'; s++) { for (; *s != '\0'; s += *(lengths++)) {
if (isalnum (*s)) if (unicharset.get_isalpha(s, *lengths) ||
unicharset.get_isdigit(s, *lengths))
count++; count++;
} }
return count; return count;
} }
BOOL8 acceptable_number_string(const char *s) { BOOL8 acceptable_number_string(const char *s,
const char *lengths) {
BOOL8 prev_digit = FALSE; BOOL8 prev_digit = FALSE;
if (*s == '(') if (*lengths == 1 && *s == '(')
s++; s++;
if ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')) if (*lengths == 1 &&
((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
s++; s++;
for (; *s != '\0'; s++) { for (; *s != '\0'; s += *(lengths++)) {
if (isdigit (*s)) if (unicharset.get_isdigit (s, *lengths))
prev_digit = TRUE; prev_digit = TRUE;
else if (prev_digit && ((*s == '.') || (*s == ',') || (*s == '-')))
prev_digit = FALSE;
else if (prev_digit && else if (prev_digit &&
(*(s + 1) == '\0') && ((*s == '%') || (*s == ')'))) (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
prev_digit = FALSE;
else if (prev_digit && *lengths == 1 &&
(*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
return TRUE; return TRUE;
else if (prev_digit && else if (prev_digit &&
(*s == '%') && (*(s + 1) == ')') && (*(s + 2) == '\0')) *lengths == 1 && (*s == '%') &&
(*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
(*(s + *lengths + *(lengths + 1)) == '\0'))
return TRUE; return TRUE;
else else
return FALSE; return FALSE;

View File

@ -31,7 +31,9 @@
#include "stderr.h" #include "stderr.h"
#include "notdll.h" #include "notdll.h"
#include "mainblk.h" #include "mainblk.h"
#include "output.h"
#include "globals.h" #include "globals.h"
#include "blread.h"
#include "tfacep.h" #include "tfacep.h"
#include "callnet.h" #include "callnet.h"
@ -40,7 +42,10 @@
#define API_CONFIG "configs/api_config" #define API_CONFIG "configs/api_config"
#define EXTERN #define EXTERN
EXTERN BOOL_VAR (tessedit_create_boxfile, FALSE, "Output text with boxes");
EXTERN BOOL_VAR (tessedit_read_image, TRUE, "Ensure the image is read"); EXTERN BOOL_VAR (tessedit_read_image, TRUE, "Ensure the image is read");
EXTERN INT_VAR (tessedit_serial_unlv, 0,
"0->Whole page, 1->serial no adapt, 2->serial with adapt");
EXTERN BOOL_VAR (tessedit_write_images, FALSE, EXTERN BOOL_VAR (tessedit_write_images, FALSE,
"Capture the image from the IPE"); "Capture the image from the IPE");
EXTERN BOOL_VAR (tessedit_debug_to_screen, FALSE, "Dont use debug file"); EXTERN BOOL_VAR (tessedit_debug_to_screen, FALSE, "Dont use debug file");
@ -63,15 +68,30 @@ int main(int argc, char **argv) {
if (argc < 3) { if (argc < 3) {
USAGE.error (argv[0], EXIT, USAGE.error (argv[0], EXIT,
"%s imagename outputbase [configfile [[+|-]varfile]...]\n", argv[0]); "%s imagename outputbase [-l lang] [configfile [[+|-]varfile]...]\n",
argv[0]);
}
// Find the required language.
const char* lang = "eng";
int arg = 3;
if (argc >= 5 && strcmp(argv[3], "-l") == 0) {
lang = argv[4];
arg = 5;
}
// Find the basename of the input file.
STRING infile(argv[1]);
const char* lastdot = strrchr(argv[1], '.');
if (lastdot != NULL) {
infile[lastdot - argv[1]] = '\0';
} }
if (argc == 3) if (argc == arg)
TessBaseAPI::InitWithLanguage(argv[0], argv[1], NULL, TessBaseAPI::InitWithLanguage(argv[0], infile.string(), lang,
NULL, false, 0, argv + 2); NULL, false, 0, argv + arg);
else else
TessBaseAPI::InitWithLanguage(argv[0], argv[1], NULL, TessBaseAPI::InitWithLanguage(argv[0], infile.string(), lang,
argv[3], false, argc - 4, argv + 4); argv[arg], false,
argc - arg - 1, argv + arg + 1);
tprintf ("Tesseract Open Source OCR Engine\n"); tprintf ("Tesseract Open Source OCR Engine\n");
@ -92,20 +112,70 @@ int main(int argc, char **argv) {
argv[1]); argv[1]);
} }
#endif #endif
STRING text_out;
int bytes_per_line = check_legal_image_size(image.get_xsize(), int bytes_per_line = check_legal_image_size(image.get_xsize(),
image.get_ysize(), image.get_ysize(),
image.get_bpp()); image.get_bpp());
char* text = TessBaseAPI::TesseractRect(image.get_buffer(), image.get_bpp()/8, if (tessedit_serial_unlv == 0) {
bytes_per_line, 0, 0, TessBaseAPI::SetInputName(argv[1]);
image.get_xsize(), image.get_ysize()); char* text;
if (tessedit_create_boxfile)
text = TessBaseAPI::TesseractRectBoxes(image.get_buffer(),
image.get_bpp()/8,
bytes_per_line, 0, 0,
image.get_xsize(),
image.get_ysize(),
image.get_ysize());
else if (tessedit_write_unlv)
text = TessBaseAPI::TesseractRectUNLV(image.get_buffer(),
image.get_bpp()/8,
bytes_per_line, 0, 0,
image.get_xsize(),
image.get_ysize());
else
text = TessBaseAPI::TesseractRect(image.get_buffer(), image.get_bpp()/8,
bytes_per_line, 0, 0,
image.get_xsize(), image.get_ysize());
text_out = text;
delete [] text;
} else {
BLOCK_LIST blocks;
STRING filename = argv[1];
int len = filename.length();
if (len > 4 && filename[len - 4] == '.') {
filename[len - 4] = '\0';
}
if (!read_unlv_file(filename, image.get_xsize(), image.get_ysize(),
&blocks)) {
fprintf(stderr, "Error: Must have a unlv zone file %s to read!\n",
filename.string());
return 1;
}
BLOCK_IT b_it = &blocks;
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
BLOCK* block = b_it.data();
BOX box = block->bounding_box();
char* text = TessBaseAPI::TesseractRectUNLV(image.get_buffer(),
image.get_bpp()/8,
bytes_per_line,
box.left(),
image.get_ysize() - box.top(),
box.width(),
box.height());
text_out += text;
delete [] text;
if (tessedit_serial_unlv == 1)
TessBaseAPI::ClearAdaptiveClassifier();
}
}
outfile = argv[2]; outfile = argv[2];
outfile += ".txt"; outfile += ".txt";
FILE* fp = fopen(outfile.string(), "w"); FILE* fp = fopen(outfile.string(), "w");
if (fp != NULL) { if (fp != NULL) {
fwrite(text, 1, strlen(text), fp); fwrite(text_out.string(), 1, text_out.length(), fp);
fclose(fp); fclose(fp);
} }
delete [] text;
TessBaseAPI::End(); TessBaseAPI::End();
return 0; //Normal exit return 0; //Normal exit

View File

@ -527,7 +527,9 @@ BOOL8 read_unlv_file( //print list of sides
else { else {
while (fscanf (pdfp, "%d %d %d %d %*s", &x, &y, &width, &height) >= 4) { while (fscanf (pdfp, "%d %d %d %d %*s", &x, &y, &width, &height) >= 4) {
//make rect block //make rect block
block = new BLOCK (name.string (), TRUE, 0, 0, (INT16) x, (INT16) (ysize - 1 - y - height), (INT16) (x + width), (INT16) (ysize - 1 - y)); block = new BLOCK (name.string (), TRUE, 0, 0,
(INT16) x, (INT16) (ysize - y - height),
(INT16) (x + width), (INT16) (ysize - y));
//on end of list //on end of list
block_it.add_to_end (block); block_it.add_to_end (block);
} }

View File

@ -63,7 +63,7 @@ make_toggle_var (debug_8, 0, make_debug_8, 6, 8, toggle_debug_8, "Debug #8");
make_toggle_var (display_ratings, 0, make_display_ratings, make_toggle_var (display_ratings, 0, make_display_ratings,
6, 9, toggle_ratings, "Ratings display"); 6, 9, toggle_ratings, "Ratings display");
make_toggle_var (display_text, 1, make_display_text, make_toggle_var (display_text, 0, make_display_text,
6, 10, toggle_text, "Display Text"); 6, 10, toggle_text, "Display Text");
make_toggle_var (show_bold, 1, make_show_bold, make_toggle_var (show_bold, 1, make_show_bold,

1
tessdata/configs/makebox Normal file
View File

@ -0,0 +1 @@
tessedit_create_boxfile 1

3
tessdata/configs/unlv Normal file
View File

@ -0,0 +1,3 @@
tessedit_write_unlv 1
tessedit_write_output 0
tessedit_write_txt_map 0

View File

@ -1,78 +1,2 @@
################################################# # No content needed as all defaults are correct.
# Adaptive Matcher Using PreAdapted Templates
#################################################
acts_fx 0x800
acts_ocr 0x20
RatingScale 30.0
CertaintyScale 20.0
#EnableMatcher 0
#CurrentFx 2
MinSlope 0.414213562
MaxSlope 2.414213562
#ExtremityMode 1
NormMethod 1
EnableAdaptiveMatcher 1
NormAdjMidpoint 32.0
NormAdjCurl 2.0
MinNormScaleX 0.0
MaxNormScaleX 0.325
MinNormScaleY 0.0
MaxNormScaleY 0.325
BuiltInTemplatesFile tessdata/inttemp
BuiltInCutoffsFile tessdata/pffmtable
EnableLearning 0
SaveAdaptedTemplates 0
UsePreAdaptedTemplates 0
ReliableConfigThreshold 2
MinNumPermClasses 3
#EnableStopper 1
GoodAdaptiveMatch 0.125
GreatAdaptiveMatch 0.0
EnableIntFX 1
EnableNewAdaptRules 1
################################################################################
#
# File: marks/configs/knobs
# Description: Control variables for 'marks' code
# Author: Mark Seaman, OCR Technology
# Created: Wed Feb 27 11:27:27 1991
# Modified: Tue Jul 30 16:25:37 1991 (Mark Seaman) marks@hpgrlt
# Language: Text
# Package: N/A
# Status: Experimental (Do Not Distribute)
#
# (c) Copyright 1991, Hewlett-Packard Company, all rights reserved.
#
################################################################################
#hidden_edges 1
save_doc_words 1
doc_dict_enable 1
ClassPrunerThreshold 229
ClassPrunerMultiplier 15
IntThetaFudge 128
CPCutoffStrength 0.15
EvidenceTableBits 9
IntEvidenceTruncBits 14
SEExponentialMultiplier 0
SimilarityCenter 0.0075
#################################################
# Adaptive Matcher Using 2 Passes
#################################################
EnableLearning 1
SaveAdaptedTemplates 0
UsePreAdaptedTemplates 0
#save_errors 0

View File

@ -0,0 +1,2 @@
chop_enable 0
enable_assoc 0

View File

@ -2,80 +2,6 @@
# Adaptive Matcher Using PreAdapted Templates # Adaptive Matcher Using PreAdapted Templates
################################################# #################################################
acts_fx 0x800
acts_ocr 0x20
RatingScale 30.0
CertaintyScale 20.0
#EnableMatcher 0
#CurrentFx 2
EnableAdaptiveMatcher 1
NormAdjMidpoint 32.0
NormAdjCurl 2.0
MinNormScaleX 0.0
MaxNormScaleX 0.325
MinNormScaleY 0.0
MaxNormScaleY 0.325
BuiltInTemplatesFile tessdata/inttemp
BuiltInCutoffsFile tessdata/pffmtable
EnableLearning 0
SaveAdaptedTemplates 0
UsePreAdaptedTemplates 0
ReliableConfigThreshold 2
MinNumPermClasses 3
#EnableStopper 1
GoodAdaptiveMatch 0.125
GreatAdaptiveMatch 0.0
EnableIntFX 1
EnableNewAdaptRules 1
EnableAdaptiveDebugger 1 EnableAdaptiveDebugger 1
MatchDebugFlags 6 MatchDebugFlags 6
MatcherDebugLevel 1 MatcherDebugLevel 1
################################################################################
#
# File: marks/configs/knobs
# Description: Control variables for 'marks' code
# Author: Mark Seaman, OCR Technology
# Created: Wed Feb 27 11:27:27 1991
# Modified: Tue Jul 30 16:25:37 1991 (Mark Seaman) marks@hpgrlt
# Language: Text
# Package: N/A
# Status: Experimental (Do Not Distribute)
#
# (c) Copyright 1991, Hewlett-Packard Company, all rights reserved.
#
################################################################################
#hidden_edges 1
save_doc_words 1
doc_dict_enable 1
ClassPrunerThreshold 229
ClassPrunerMultiplier 15
IntThetaFudge 128
CPCutoffStrength 0.15
EvidenceTableBits 9
IntEvidenceTruncBits 14
SEExponentialMultiplier 0
SimilarityCenter 0.0075
#################################################
# Adaptive Matcher Using 2 Passes
#################################################
display_splits 0
display_all_words 0
display_all_blobs 0
display_segmentations 0
EnableLearning 1
SaveAdaptedTemplates 0
UsePreAdaptedTemplates 0
#save_errors 0

View File

@ -0,0 +1,13 @@
#################################################
# Adaptive Matcher Using PreAdapted Templates
#################################################
EnableAdaptiveDebugger 1
MatchDebugFlags 6
MatcherDebugLevel 1
display_splits 0
display_all_words 1
display_all_blobs 1
display_segmentations 2
display_ratings 1

View File

@ -0,0 +1,2 @@
display_text 0

View File

@ -2,70 +2,6 @@
# Adaptive Matcher Using PreAdapted Templates # Adaptive Matcher Using PreAdapted Templates
################################################# #################################################
acts_fx 0x800
acts_ocr 0x20
RatingScale 30.0
CertaintyScale 20.0
#EnableMatcher 0
#CurrentFx 2
EnableAdaptiveMatcher 1
NormAdjMidpoint 32.0
NormAdjCurl 2.0
MinNormScaleX 0.0
MaxNormScaleX 0.325
MinNormScaleY 0.0
MaxNormScaleY 0.325
BuiltInTemplatesFile tessdata/inttemp
BuiltInCutoffsFile tessdata/pffmtable
EnableLearning 0
SaveAdaptedTemplates 0
UsePreAdaptedTemplates 0
ReliableConfigThreshold 2
MinNumPermClasses 3
#EnableStopper 1
GoodAdaptiveMatch 0.125
GreatAdaptiveMatch 0.0
EnableIntFX 1
EnableNewAdaptRules 1
################################################################################
#
# File: marks/configs/knobs
# Description: Control variables for 'marks' code
# Author: Mark Seaman, OCR Technology
# Created: Wed Feb 27 11:27:27 1991
# Modified: Tue Jul 30 16:25:37 1991 (Mark Seaman) marks@hpgrlt
# Language: Text
# Package: N/A
# Status: Experimental (Do Not Distribute)
#
# (c) Copyright 1991, Hewlett-Packard Company, all rights reserved.
#
################################################################################
#hidden_edges 1
save_doc_words 1
doc_dict_enable 1
ClassPrunerThreshold 229
ClassPrunerMultiplier 15
IntThetaFudge 128
CPCutoffStrength 0.15
EvidenceTableBits 9
IntEvidenceTruncBits 14
SEExponentialMultiplier 0
SimilarityCenter 0.0075
#################################################
# Adaptive Matcher Using 2 Passes
#################################################
display_splits 0 display_splits 0
display_all_words 1 display_all_words 1
display_all_blobs 1 display_all_blobs 1

185
testing/Makefile Normal file
View File

@ -0,0 +1,185 @@
# Makefile.in generated automatically by automake 1.4-p6 from Makefile.am
# Copyright (C) 1994, 1995-8, 1999, 2001 Free Software Foundation, Inc.
# This Makefile.in is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE.
SHELL = /bin/sh
srcdir = .
top_srcdir = ..
prefix = /usr/local
exec_prefix = ${prefix}
bindir = ${exec_prefix}/bin
sbindir = ${exec_prefix}/sbin
libexecdir = ${exec_prefix}/libexec
datadir = ${prefix}/share
sysconfdir = ${prefix}/etc
sharedstatedir = ${prefix}/com
localstatedir = ${prefix}/var
libdir = ${exec_prefix}/lib
infodir = ${prefix}/info
mandir = ${prefix}/man
includedir = ${prefix}/include/tesseract
oldincludedir = /usr/include
DESTDIR =
pkgdatadir = $(datadir)/
pkglibdir = $(libdir)/
pkgincludedir = $(includedir)/
top_builddir = ..
ACLOCAL = aclocal-1.4
AUTOCONF = autoconf
AUTOMAKE = automake-1.4
AUTOHEADER = autoheader
INSTALL = /usr/bin/install -c
INSTALL_PROGRAM = ${INSTALL} $(AM_INSTALL_PROGRAM_FLAGS)
INSTALL_DATA = ${INSTALL} -m 644
INSTALL_SCRIPT = ${INSTALL}
transform = s,x,x,
NORMAL_INSTALL = :
PRE_INSTALL = :
POST_INSTALL = :
NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
host_alias =
host_triplet = x86_64-unknown-linux-gnu
CC = gcc
CXX = g++
HAVE_LIB = @HAVE_LIB@
LIB = @LIB@
LTLIB = @LTLIB@
MAINT = #
MAKEINFO = /home/rays/src/opensrc/tesseract-ocr/config/missing makeinfo
PACKAGE =
PACKAGE_DATE = 07/2007
PACKAGE_NAME = tesseract
PACKAGE_VERSION = 2.00
PACKAGE_YEAR = 2007
RANLIB = ranlib
VERSION =
EXTRA_DIST = README counttestset.sh reorgdata.sh runalltests.sh runtestset.sh reports/1995.bus.3B.sum reports/1995.doe3.3B.sum reports/1995.mag.3B.sum reports/1995.news.3B.sum
mkinstalldirs = $(SHELL) $(top_srcdir)/config/mkinstalldirs
CONFIG_HEADER = ../config_auto.h
CONFIG_CLEAN_FILES =
DIST_COMMON = README Makefile.am Makefile.in
DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST)
TAR = tar
GZIP_ENV = --best
all: all-redirect
.SUFFIXES:
$(srcdir)/Makefile.in: # Makefile.am $(top_srcdir)/configure.ac $(ACLOCAL_M4)
cd $(top_srcdir) && $(AUTOMAKE) --gnu testing/Makefile
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status $(BUILT_SOURCES)
cd $(top_builddir) \
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
tags: TAGS
TAGS:
distdir = $(top_builddir)/$(PACKAGE_NAME)-$(PACKAGE_VERSION)/$(subdir)
subdir = testing
distdir: $(DISTFILES)
here=`cd $(top_builddir) && pwd`; \
top_distdir=`cd $(top_distdir) && pwd`; \
distdir=`cd $(distdir) && pwd`; \
cd $(top_srcdir) \
&& $(AUTOMAKE) --include-deps --build-dir=$$here --srcdir-name=$(top_srcdir) --output-dir=$$top_distdir --gnu testing/Makefile
$(mkinstalldirs) $(distdir)/reports
@for file in $(DISTFILES); do \
d=$(srcdir); \
if test -d $$d/$$file; then \
cp -pr $$d/$$file $(distdir)/$$file; \
else \
test -f $(distdir)/$$file \
|| ln $$d/$$file $(distdir)/$$file 2> /dev/null \
|| cp -p $$d/$$file $(distdir)/$$file || :; \
fi; \
done
info-am:
info: info-am
dvi-am:
dvi: dvi-am
check-am: all-am
check: check-am
installcheck-am:
installcheck: installcheck-am
install-exec-am:
install-exec: install-exec-am
install-data-am:
install-data: install-data-am
install-am: all-am
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
install: install-am
uninstall-am:
uninstall: uninstall-am
all-am: Makefile
all-redirect: all-am
install-strip:
$(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install
installdirs:
mostlyclean-generic:
clean-generic:
distclean-generic:
-rm -f Makefile $(CONFIG_CLEAN_FILES)
-rm -f config.cache config.log stamp-h stamp-h[0-9]*
maintainer-clean-generic:
mostlyclean-am: mostlyclean-generic
mostlyclean: mostlyclean-am
clean-am: clean-generic mostlyclean-am
clean: clean-am
distclean-am: distclean-generic clean-am
distclean: distclean-am
maintainer-clean-am: maintainer-clean-generic distclean-am
@echo "This command is intended for maintainers to use;"
@echo "it deletes files that may require special tools to rebuild."
maintainer-clean: maintainer-clean-am
.PHONY: tags distdir info-am info dvi-am dvi check check-am \
installcheck-am installcheck install-exec-am install-exec \
install-data-am install-data install-am install uninstall-am uninstall \
all-redirect all-am all installdirs mostlyclean-generic \
distclean-generic clean-generic maintainer-clean-generic clean \
mostlyclean distclean maintainer-clean
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
.NOEXPORT:

2
testing/Makefile.am Normal file
View File

@ -0,0 +1,2 @@
EXTRA_DIST = README counttestset.sh reorgdata.sh runalltests.sh runtestset.sh reports/1995.bus.3B.sum reports/1995.doe3.3B.sum reports/1995.mag.3B.sum reports/1995.news.3B.sum

185
testing/Makefile.in Normal file
View File

@ -0,0 +1,185 @@
# Makefile.in generated automatically by automake 1.4-p6 from Makefile.am
# Copyright (C) 1994, 1995-8, 1999, 2001 Free Software Foundation, Inc.
# This Makefile.in is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE.
SHELL = @SHELL@
srcdir = @srcdir@
top_srcdir = @top_srcdir@
VPATH = @srcdir@
prefix = @prefix@
exec_prefix = @exec_prefix@
bindir = @bindir@
sbindir = @sbindir@
libexecdir = @libexecdir@
datadir = @datadir@
sysconfdir = @sysconfdir@
sharedstatedir = @sharedstatedir@
localstatedir = @localstatedir@
libdir = @libdir@
infodir = @infodir@
mandir = @mandir@
includedir = @includedir@
oldincludedir = /usr/include
DESTDIR =
pkgdatadir = $(datadir)/@PACKAGE@
pkglibdir = $(libdir)/@PACKAGE@
pkgincludedir = $(includedir)/@PACKAGE@
top_builddir = ..
ACLOCAL = @ACLOCAL@
AUTOCONF = @AUTOCONF@
AUTOMAKE = @AUTOMAKE@
AUTOHEADER = @AUTOHEADER@
INSTALL = @INSTALL@
INSTALL_PROGRAM = @INSTALL_PROGRAM@ $(AM_INSTALL_PROGRAM_FLAGS)
INSTALL_DATA = @INSTALL_DATA@
INSTALL_SCRIPT = @INSTALL_SCRIPT@
transform = @program_transform_name@
NORMAL_INSTALL = :
PRE_INSTALL = :
POST_INSTALL = :
NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
host_alias = @host_alias@
host_triplet = @host@
CC = @CC@
CXX = @CXX@
HAVE_LIB = @HAVE_LIB@
LIB = @LIB@
LTLIB = @LTLIB@
MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
PACKAGE = @PACKAGE@
PACKAGE_DATE = @PACKAGE_DATE@
PACKAGE_NAME = @PACKAGE_NAME@
PACKAGE_VERSION = @PACKAGE_VERSION@
PACKAGE_YEAR = @PACKAGE_YEAR@
RANLIB = @RANLIB@
VERSION = @VERSION@
EXTRA_DIST = README counttestset.sh reorgdata.sh runalltests.sh runtestset.sh reports/1995.bus.3B.sum reports/1995.doe3.3B.sum reports/1995.mag.3B.sum reports/1995.news.3B.sum
mkinstalldirs = $(SHELL) $(top_srcdir)/config/mkinstalldirs
CONFIG_HEADER = ../config_auto.h
CONFIG_CLEAN_FILES =
DIST_COMMON = README Makefile.am Makefile.in
DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST)
TAR = tar
GZIP_ENV = --best
all: all-redirect
.SUFFIXES:
$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/configure.ac $(ACLOCAL_M4)
cd $(top_srcdir) && $(AUTOMAKE) --gnu testing/Makefile
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status $(BUILT_SOURCES)
cd $(top_builddir) \
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
tags: TAGS
TAGS:
distdir = $(top_builddir)/$(PACKAGE_NAME)-$(PACKAGE_VERSION)/$(subdir)
subdir = testing
distdir: $(DISTFILES)
here=`cd $(top_builddir) && pwd`; \
top_distdir=`cd $(top_distdir) && pwd`; \
distdir=`cd $(distdir) && pwd`; \
cd $(top_srcdir) \
&& $(AUTOMAKE) --include-deps --build-dir=$$here --srcdir-name=$(top_srcdir) --output-dir=$$top_distdir --gnu testing/Makefile
$(mkinstalldirs) $(distdir)/reports
@for file in $(DISTFILES); do \
d=$(srcdir); \
if test -d $$d/$$file; then \
cp -pr $$d/$$file $(distdir)/$$file; \
else \
test -f $(distdir)/$$file \
|| ln $$d/$$file $(distdir)/$$file 2> /dev/null \
|| cp -p $$d/$$file $(distdir)/$$file || :; \
fi; \
done
info-am:
info: info-am
dvi-am:
dvi: dvi-am
check-am: all-am
check: check-am
installcheck-am:
installcheck: installcheck-am
install-exec-am:
install-exec: install-exec-am
install-data-am:
install-data: install-data-am
install-am: all-am
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
install: install-am
uninstall-am:
uninstall: uninstall-am
all-am: Makefile
all-redirect: all-am
install-strip:
$(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install
installdirs:
mostlyclean-generic:
clean-generic:
distclean-generic:
-rm -f Makefile $(CONFIG_CLEAN_FILES)
-rm -f config.cache config.log stamp-h stamp-h[0-9]*
maintainer-clean-generic:
mostlyclean-am: mostlyclean-generic
mostlyclean: mostlyclean-am
clean-am: clean-generic mostlyclean-am
clean: clean-am
distclean-am: distclean-generic clean-am
distclean: distclean-am
maintainer-clean-am: maintainer-clean-generic distclean-am
@echo "This command is intended for maintainers to use;"
@echo "it deletes files that may require special tools to rebuild."
maintainer-clean: maintainer-clean-am
.PHONY: tags distdir info-am info dvi-am dvi check check-am \
installcheck-am installcheck install-exec-am install-exec \
install-data-am install-data install-am install uninstall-am uninstall \
all-redirect all-am all installdirs mostlyclean-generic \
distclean-generic clean-generic maintainer-clean-generic clean \
mostlyclean distclean maintainer-clean
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
.NOEXPORT:

43
testing/README Normal file
View File

@ -0,0 +1,43 @@
How to run UNLV tests.
The scripts in this directory make it possible to duplicate the tests
published in the Fourth Annual Test of OCR Accuracy
(see http://www.isri.unlv.edu/downloads/AT-1995.pdf),
but first you have to get the tools and data from UNLV:
Step 1: To download the images, go to
http://www.isri.unlv.edu/ISRI/OCRtk
and get 3b.tgz, Bb.tgz, Mb.tgz and Nb.tgz.
Step 2: extract the files. It doesn't really matter where
in your filesystem you put them, but they must go under a common
root so you have directories 3, B, M and N in, for example,
/users/me/ISRI-OCRtk.
Step 3: Reorganize the files.
The lack of tif extensions on the images is inconvenient, so there
is a script to reorganize the data to match the rest of the test
scripts.
cd to /users/me/ISRI-OCRtk or wherever 3, B, M and N ended up and run
/blah/blah/tesseract-ocr/testing/reorgdata.sh 3B
This makes directories doe3.3B, bus.3B, mag.3B and news.3B.
You can now get rid of 3, B, M, and N unless you want to get some of the
other scanning resolutions out of them.
Step 4: Download the ISRI toolkit from:
http://www.isri.unlv.edu/downloads/ftk-1.0.tgz
Step 5: If the prebuilt binaries in the toolkit's bin directory work for
you, copy them to tesseract-ocr/testing/unlv;
otherwise build the tools for yourself and put them there.
Step 6: cd back to your main tesseract-ocr dir and build tesseract.
Step 7: run testing/runalltests.sh with the root data dir and testname:
testing/runalltests.sh /users/me/ISRI-OCRtk tess2.0
and go to the gym, have lunch etc.
Step 8: There should be a file
testing/reports/tess2.0.summary that contains the final summarized accuracy
report and comparison with the 1995 results.

61
testing/counttestset.sh Executable file
View File

@ -0,0 +1,61 @@
#!/bin/bash
# File: counttestset.sh
# Description: Script to count the errors on a single UNLV set.
# Author: Ray Smith
# Created: Wed Jun 13 11:58:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Validate the invocation: exactly one argument, the pages file of a test set.
if [ $# -ne 1 ]
then
    echo "Usage:$0 pagesfile"
    exit 1
fi
# All paths below are relative to the tesseract-ocr source root.
if [ ! -d ccmain ]
then
    echo "Run $0 from the tesseract-ocr root directory!"
    exit 1
fi
# Accept either a plain or a .exe build of the UNLV accuracy tool.
if [ ! -r testing/unlv/accuracy ] && [ ! -r testing/unlv/accuracy.exe ]
then
    echo "Please download the UNLV accuracy tools (and build) to testing/unlv"
    exit 1
fi

pages=$1
# The image directory is the pages-file path with the trailing /pages removed;
# its last component names the test set.
imdir=${pages%/pages}
setname=${imdir##*/}
resdir=testing/results/$setname
mkdir -p testing/reports
echo "Counting on set $setname in directory $imdir to $resdir"

# Per-page report files, kept in arrays so paths containing whitespace
# survive intact when handed to the summarizers.
accfiles=()
wafiles=()
# Each line of the pages file is "page [subdir]".
while read -r page dir
do
    # Skip blank lines rather than feeding ".txt" paths to the tools.
    [ -n "$page" ] || continue
    if [ -n "$dir" ]
    then
        srcdir="$imdir/$dir"
    else
        srcdir="$imdir"
    fi
    # Count character errors.
    testing/unlv/accuracy "$srcdir/$page.txt" "$resdir/$page.txt" \
        "$resdir/$page.acc"
    accfiles+=("$resdir/$page.acc")
    # Count word errors.
    testing/unlv/wordacc "$srcdir/$page.txt" "$resdir/$page.txt" \
        "$resdir/$page.wa"
    wafiles+=("$resdir/$page.wa")
done <"$pages"

# Combine the per-page reports into one character and one word report.
testing/unlv/accsum "${accfiles[@]}" >"testing/reports/$setname.characc"
testing/unlv/wordaccsum "${wafiles[@]}" >"testing/reports/$setname.wordacc"

44
testing/reorgdata.sh Executable file
View File

@ -0,0 +1,44 @@
#!/bin/bash
# Reorganizes downloaded UNLV test sets found in the current directory into
# <newname>.<scantype> directories with a pages list plus .tif, .uzn and .txt
# files per page.
if [ $# -ne 1 ]
then
    echo "Usage:$0 scantype"
    echo "UNLV data comes in several scan types:"
    echo "3B=300 dpi binary"
    echo "3A=adaptive thresholded 300 dpi"
    echo "3G=300 dpi grey"
    echo "4B=400dpi binary"
    echo "2B=200dpi binary"
    echo "For now we only use 3B"
    exit 1
fi
ext=$1

# There are several test sets without meaningful names, so rename
# them with something a bit more meaningful.
# Each s is oldname/newname.
for s in 3/doe3 B/bus M/mag N/news L/legal R/rep S/spn Z/zset
do
    old=${s%/*}
    # If this set was downloaded then process it.
    if [ -r "$old/PAGES" ]
    then
        new=${s#*/}.$ext
        mkdir -p "$new"
        echo "Set $old -> $new"
        # The pages file had - instead of _ so fix it and add the extension.
        for page in $(cat "$old/PAGES")
        do
            echo "${page%-*}_${page#*-}.$ext"
        done >"$new/pages"
        # NOTE(review): the image/zone directory suffix is fixed to _B
        # whatever scan type was requested - confirm before using anything
        # other than a binary scan type.
        for f in $(cat "$new/pages")
        do
            # Put a tif extension on the tif files.
            cp "$old/${old}_B/$f" "$new/$f.tif"
            # Put a uzn extension on the zone files.
            cp "$old/${old}_B/${f}Z" "$new/$f.uzn"
            # Cat all the truth files together and put into a single txt file.
            # The .Z* glob is deliberately left outside the quotes.
            cat "$old/${old}_GT/${f%."$ext"}".Z* >"$new/$f.txt"
        done
    fi
done

View File

@ -0,0 +1 @@
1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00%

View File

@ -0,0 +1 @@
1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00%

View File

@ -0,0 +1 @@
1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00%

View File

@ -0,0 +1 @@
1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00%

110
testing/runalltests.sh Executable file
View File

@ -0,0 +1,110 @@
#!/bin/bash
# File: runalltests.sh
# Description: Script to run a set of UNLV test sets.
# Author: Ray Smith
# Created: Thu Jun 14 08:21:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Precondition checks: each one prints a hint and exits with status 1.

# Exactly two arguments are needed: the data directory and a version id.
if (( $# != 2 ))
then
    echo "Usage:$0 unlv-data-dir version-id"
    exit 1
fi

# The relative paths used below only resolve from the source root.
if ! [[ -d ccmain ]]
then
    echo "Run $0 from the tesseract-ocr root directory!"
    exit 1
fi

# A readable tesseract binary must exist in one of the two known places.
if ! [[ -r ccmain/tesseract || -r tesseract.exe ]]
then
    echo "Please build tesseract before running $0"
    exit 1
fi

# Likewise the UNLV accuracy tool, with or without a .exe suffix.
if ! [[ -r testing/unlv/accuracy || -r testing/unlv/accuracy.exe ]]
then
    echo "Please download the UNLV accuracy tools (and build) to testing/unlv"
    exit 1
fi
# deltapc new old
# Prints the percentage change from old to new with two decimal places and
# no trailing newline.
# The values are handed to awk as variables rather than spliced into the
# program text, so an empty argument counts as 0 instead of producing an awk
# syntax error.
# A zero baseline cannot be divided by: "0.00" is printed when new is zero as
# well, "inf" otherwise.
deltapc() {
    awk -v new="$1" -v old="$2" ' BEGIN {
        if (old + 0 == 0) {
            printf("%s", (new + 0 == 0) ? "0.00" : "inf");
        } else {
            printf("%.2f", 100.0 * (new - old) / old);
        }
    }'
}
# Main driver: for every known test set present under the data directory,
# run tesseract, count the errors, write a per-set summary line and
# accumulate grand totals; finally write the totals line and the combined
# summary file.
# imdir is the UNLV data root (first argument), vid the version id used to
# name the report files (second argument).
imdir="$1"
vid="$2"
# Directory part of the path this script was invoked by; when $0 contains no
# slash the strip leaves it unchanged, in which case fall back to "./".
bindir=${0%/*}
if [ "$bindir" = "$0" ]
then
bindir="./"
fi
rdir=testing/reports
testsets="bus.3B doe3.3B mag.3B news.3B"
# Running totals of this run's errors and of the 1995 reference errors.
totalerrs=0
totalwerrs=0
totalnswerrs=0
totalolderrs=0
totaloldwerrs=0
totaloldnswerrs=0
for set in $testsets
do
# Sets without a readable pages file are silently skipped.
if [ -r $imdir/$set/pages ]
then
# Run tesseract on all the pages.
$bindir/runtestset.sh $imdir/$set/pages
# Count the errors on all the pages.
$bindir/counttestset.sh $imdir/$set/pages
# Get the old character word and nonstop word errors.
# These are fields 3, 6 and 9 of the 1995 summary line for this set.
olderrs=`cat testing/reports/1995.$set.sum | cut -f3`
oldwerrs=`cat testing/reports/1995.$set.sum | cut -f6`
oldnswerrs=`cat testing/reports/1995.$set.sum | cut -f9`
# Get the new character word and nonstop word errors and accuracy.
# The values are taken from fixed positions in the reports: columns 1-9 of
# lines 4 and 5, and columns 10-17 / 19-26 of the second line containing
# "Total" in the word report, with blanks removed.
# NOTE(review): these offsets presumably match the UNLV tools' report
# layout - confirm against an actual report.
cherrs=`head -4 testing/reports/$set.characc |tail -1 |cut -c1-9 |
tr -d '[:blank:]'`
chacc=`head -5 testing/reports/$set.characc |tail -1 |cut -c1-9 |
tr -d '[:blank:]'`
wderrs=`head -4 testing/reports/$set.wordacc |tail -1 |cut -c1-9 |
tr -d '[:blank:]'`
wdacc=`head -5 testing/reports/$set.wordacc |tail -1 |cut -c1-9 |
tr -d '[:blank:]'`
nswderrs=`grep Total testing/reports/$set.wordacc |head -2 |tail -1 |
cut -c10-17 |tr -d '[:blank:]'`
nswdacc=`grep Total testing/reports/$set.wordacc |head -2 |tail -1 |
cut -c19-26 |tr -d '[:blank:]'`
# Compute the percent change.
chdelta=`deltapc $cherrs $olderrs`
wdelta=`deltapc $wderrs $oldwerrs`
nswdelta=`deltapc $nswderrs $oldnswerrs`
# Write this set's one-line summary to <rdir>/<vid>.<set>.sum.
sumfile=$rdir/$vid.$set.sum
echo "$vid $set $cherrs $chacc $chdelta% $wderrs $wdacc\
$wdelta% $nswderrs $nswdacc $nswdelta%" >$sumfile
# Sum totals over all the testsets.
let totalerrs=totalerrs+cherrs
let totalwerrs=totalwerrs+wderrs
let totalnswerrs=totalnswerrs+nswderrs
let totalolderrs=totalolderrs+olderrs
let totaloldwerrs=totaloldwerrs+oldwerrs
let totaloldnswerrs=totaloldnswerrs+oldnswerrs
fi
done
# Compute grand total percent change.
chdelta=`deltapc $totalerrs $totalolderrs`
wdelta=`deltapc $totalwerrs $totaloldwerrs`
nswdelta=`deltapc $totalnswerrs $totaloldnswerrs `
# The totals line has "-" in place of the accuracy figures.
tfile=$rdir/$vid.total.sum
echo "$vid Total $totalerrs - $chdelta% $totalwerrs\
- $wdelta% $totalnswerrs - $nswdelta%" >$tfile
# Final report: the 1995 reference lines followed by every line written for
# this version id (per-set lines and the totals line).
cat $rdir/1995.*.sum $rdir/$vid.*.sum >$rdir/$vid.summary

61
testing/runtestset.sh Executable file
View File

@ -0,0 +1,61 @@
#!/bin/bash
# File: runtestset.sh
# Description: Script to run tesseract on a single UNLV set.
# Author: Ray Smith
# Created: Wed Jun 13 10:13:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Validate the invocation: exactly one argument, the pages file of a test set.
if [ $# -ne 1 ]
then
    echo "Usage:$0 pagesfile"
    exit 1
fi
# All paths below are relative to the tesseract-ocr source root.
if [ ! -d ccmain ]
then
    echo "Run $0 from the tesseract-ocr root directory!"
    exit 1
fi
# Pick the tesseract binary: the in-tree build is preferred and gets
# TESSDATA_PREFIX set to the current directory; otherwise use tesseract.exe
# from the current directory.
if [ -r ccmain/tesseract ]
then
    tess="ccmain/tesseract"
    export TESSDATA_PREFIX="$PWD/"
elif [ -r tesseract.exe ]
then
    tess="./tesseract.exe"
else
    echo "Please build tesseract before running $0"
    exit 1
fi

pages=$1
# The image directory is the pages-file path with the trailing /pages removed;
# its last component names the test set.
imdir=${pages%/pages}
setname=${imdir##*/}
resdir=testing/results/$setname
echo "Testing on set $setname in directory $imdir to $resdir"
mkdir -p "$resdir"

while read -r page dir
do
    # Skip blank lines rather than running tesseract on ".tif".
    [ -n "$page" ] || continue
    # A pages file may be a list of files with subdirs or maybe just
    # a plain list of files so accommodate both.
    if [ -n "$dir" ]
    then
        srcdir="$imdir/$dir"
    else
        srcdir="$imdir"
    fi
    # Quoted so that data directories containing whitespace still work.
    "$tess" "$srcdir/$page.tif" "$resdir/$page" nobatch unlv
done <"$pages"
done <$pages