API/output changes to produce unlv-style latin-1 output and test scripts

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@86 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
theraysmith 2007-07-18 01:11:18 +00:00
parent eeaca1beba
commit 627368df42
27 changed files with 1424 additions and 442 deletions

View File

@ -24,20 +24,22 @@ what measures we are interested in.
/* #define SECURE_NAMES done in secnames.h when necessary*/
#include "mfcpch.h"
#include "applybox.h"
#include <ctype.h>
#include <string.h>
#include "applybox.h"
#include <ctype.h>
#include <string.h>
#ifdef __UNIX__
#include <assert.h>
#include <errno.h>
#include <assert.h>
#include <errno.h>
#endif
#include "mainblk.h"
#include "genblob.h"
#include "fixxht.h"
#include "control.h"
#include "tessbox.h"
#include "globals.h"
#include "secname.h"
#include "mainblk.h"
#include "genblob.h"
#include "fixxht.h"
#include "control.h"
#include "tessbox.h"
#include "globals.h"
#include "secname.h"
#include "unichar.h"
#include "matchdefs.h"
#define SECURE_NAMES
#ifndef SECURE_NAMES
@ -47,10 +49,13 @@ what measures we are interested in.
#define EXTERN
EXTERN BOOL_VAR (applybox_rebalance, TRUE, "Drop dead");
EXTERN INT_VAR (applybox_debug, 0, "Debug level");
EXTERN STRING_VAR (applybox_test_exclusions, "|",
EXTERN STRING_VAR (applybox_test_exclusions, "",
"Chars ignored for testing");
EXTERN double_VAR (applybox_error_band, 0.15, "Err band as fract of xht");
// The unicharset used during box training
static UNICHARSET unicharset_boxes;
/*************************************************************************
* The code re-assigns outlines to form words each with ONE labelled blob.
* Noise is left in UNLABELLED words. The chars on the page are checked crudely
@ -89,7 +94,7 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks
INT16 boxfile_lineno = 0;
INT16 boxfile_charno = 0;
BOX box; //boxfile box
char ch[2]; //correct ch from boxfile
UNICHAR_ID uch_id; //correct ch from boxfile
ROW *row;
ROW *prev_row = NULL;
INT16 prev_box_right = MAX_INT16;
@ -100,15 +105,20 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks
INT16 labels_ok;
INT16 rows_ok;
INT16 bad_blobs;
INT16 tgt_char_counts[128]; //No. of box samples
INT16 tgt_char_counts[MAX_NUM_CLASSES]; //No. of box samples
// INT16 labelled_char_counts[128]; //No. of unique labelled samples
INT16 i;
INT16 rebalance_count = 0;
char min_char;
UNICHAR_ID min_uch_id;
INT16 min_samples;
INT16 final_labelled_blob_count;
for (i = 0; i < 128; i++)
// Clean the unichar set
unicharset_boxes.clear();
// Space character needed to represent NIL classification
unicharset_boxes.unichar_insert(" ");
for (i = 0; i < MAX_NUM_CLASSES; i++)
tgt_char_counts[i] = 0;
FILE* box_file;
@ -120,11 +130,10 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks
filename.string(), errno);
}
ch[1] = '\0';
clear_any_old_text(block_list);
while (read_next_box (box_file, &box, &ch[0])) {
while (read_next_box (box_file, &box, &uch_id)) {
box_count++;
tgt_char_counts[ch[0]]++;
tgt_char_counts[uch_id]++;
row = find_row_of_box (block_list, box, block_id, row_id);
if (box.left () < prev_box_right) {
boxfile_lineno++;
@ -135,14 +144,16 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks
if (row == NULL) {
box_failures++;
report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
report_failed_box (boxfile_lineno, boxfile_charno, box,
unicharset_boxes.id_to_unichar(uch_id),
"FAILURE! box overlaps no blobs or blobs in multiple rows");
}
else {
if ((box.left () >= prev_box_right) && (row != prev_row))
report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
report_failed_box (boxfile_lineno, boxfile_charno, box,
unicharset_boxes.id_to_unichar(uch_id),
"WARNING! false row break");
box_failures += resegment_box (row, box, ch, block_id, row_id,
box_failures += resegment_box (row, box, uch_id, block_id, row_id,
boxfile_lineno, boxfile_charno);
prev_row = row;
}
@ -154,7 +165,7 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks
bad_blobs,
tgt_char_counts,
rebalance_count,
min_char,
&min_uch_id,
min_samples,
final_labelled_blob_count);
tprintf ("APPLY_BOXES:\n");
@ -163,7 +174,8 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks
labels_ok, rows_ok);
tprintf (" Box failures detected: %6d\n", box_failures);
tprintf (" Duped blobs for rebalance:%6d\n", rebalance_count);
tprintf (" \"%c\" has fewest samples:%6d\n", min_char, min_samples);
tprintf (" \"%s\" has fewest samples:%6d\n",
unicharset_boxes.id_to_unichar(min_uch_id), min_samples);
tprintf (" Total unlabelled words: %6d\n",
bad_blobs);
tprintf (" Final labelled words: %6d\n",
@ -194,7 +206,7 @@ void clear_any_old_text( //remove correct text
BOOL8 read_next_box(FILE* box_file, //
BOX *box,
char *ch) {
UNICHAR_ID *uch_id) {
char buff[256]; //boxfile read buffer
char *buffptr = buff;
STRING box_filename;
@ -204,23 +216,38 @@ BOOL8 read_next_box(FILE* box_file, //
INT32 x_max;
INT32 y_max;
INT32 count = 0;
char uch[256];
while (!feof (box_file)) {
fgets (buff, sizeof (buff) - 1, box_file);
line++;
buffptr = buff;
const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
buffptr += 3; // Skip unicode file designation.
/* Check for blank lines in box file */
for (buffptr = buff; isspace (*buffptr); buffptr++)
;
while (isspace (*buffptr))
buffptr++;
if (*buffptr != '\0') {
count =
sscanf (buff,
"%c " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " "
INT32FORMAT, ch, &x_min, &y_min, &x_max, &y_max);
sscanf (buffptr,
"%s " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " "
INT32FORMAT, uch, &x_min, &y_min, &x_max, &y_max);
if (count != 5) {
tprintf ("Box file format error on line %i ignored\n", line);
}
else {
if (!unicharset_boxes.contains_unichar(uch))
{
unicharset_boxes.unichar_insert(uch);
if (unicharset_boxes.size() > MAX_NUM_CLASSES) {
tprintf("Error: Size of unicharset of boxes is \
greater than MAX_NUM_CLASSES\n");
exit(1);
}
}
*uch_id = unicharset_boxes.unichar_to_id(uch);
*box = BOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max));
return TRUE; //read a box ok
}
@ -314,7 +341,7 @@ ROW *find_row_of_box( //
INT16 resegment_box( //
ROW *row,
BOX box,
char *ch,
UNICHAR_ID uch_id,
INT16 block_id,
INT16 row_id,
INT16 boxfile_lineno,
@ -358,7 +385,7 @@ INT16 resegment_box( //
if (applybox_debug > 4)
report_failed_box (boxfile_lineno,
boxfile_charno,
box, ch,
box, unicharset_boxes.id_to_unichar(uch_id),
"FAILURE! box overlaps blob in labelled word");
}
if (applybox_debug > 4)
@ -375,7 +402,7 @@ INT16 resegment_box( //
if (new_word == NULL) {
/* Make a new word with a single blob */
new_word = word->shallow_copy ();
new_word->set_text (ch);
new_word->set_text (unicharset_boxes.id_to_unichar(uch_id));
if (polyg)
new_blob = new PBLOB;
else
@ -414,63 +441,75 @@ INT16 resegment_box( //
word_x_centre = (new_word_box.left () + new_word_box.right ()) / 2.0f;
baseline = row->base_line (word_x_centre);
if (STRING (chs_caps_ht).contains (ch[0]) &&
(new_word_box.top () <
baseline + (1 + applybox_error_band) * row->x_height ())) {
report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
"FAILURE! caps-ht char didn't ascend");
new_word->set_text ("");
return 1;
}
if (STRING (chs_odd_top).contains (ch[0]) &&
(new_word_box.top () <
baseline + (1 - applybox_error_band) * row->x_height ())) {
report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
"FAILURE! Odd top char below xht");
new_word->set_text ("");
return 1;
}
if (STRING (chs_x_ht).contains (ch[0]) &&
((new_word_box.top () >
baseline + (1 + applybox_error_band) * row->x_height ()) ||
(new_word_box.top () <
baseline + (1 - applybox_error_band) * row->x_height ()))) {
report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
"FAILURE! x-ht char didn't have top near xht");
new_word->set_text ("");
return 1;
}
if (STRING (chs_non_ambig_bl).contains (ch[0]) &&
((new_word_box.bottom () <
baseline - applybox_error_band * row->x_height ()) ||
(new_word_box.bottom () >
baseline + applybox_error_band * row->x_height ()))) {
report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
"FAILURE! non ambig BL char didnt have bottom near baseline");
new_word->set_text ("");
return 1;
}
if (STRING (chs_odd_bot).contains (ch[0]) &&
(new_word_box.bottom () >
baseline + applybox_error_band * row->x_height ())) {
report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
"FAILURE! Odd bottom char above baseline");
new_word->set_text ("");
return 1;
}
if (STRING (chs_desc).contains (ch[0]) &&
(new_word_box.bottom () >
baseline - applybox_error_band * row->x_height ())) {
report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
#if 0
if (strlen(unicharset_boxes.id_to_unichar(uch_id)) == 1) {
if (STRING (chs_caps_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
(new_word_box.top () <
baseline + (1 + applybox_error_band) * row->x_height ())) {
report_failed_box (boxfile_lineno, boxfile_charno, box,
unicharset_boxes.id_to_unichar(uch_id),
"FAILURE! caps-ht char didn't ascend");
new_word->set_text ("");
return 1;
}
if (STRING (chs_odd_top).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
(new_word_box.top () <
baseline + (1 - applybox_error_band) * row->x_height ())) {
report_failed_box (boxfile_lineno, boxfile_charno, box,
unicharset_boxes.id_to_unichar(uch_id),
"FAILURE! Odd top char below xht");
new_word->set_text ("");
return 1;
}
if (STRING (chs_x_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
((new_word_box.top () >
baseline + (1 + applybox_error_band) * row->x_height ()) ||
(new_word_box.top () <
baseline + (1 - applybox_error_band) * row->x_height ()))) {
report_failed_box (boxfile_lineno, boxfile_charno, box,
unicharset_boxes.id_to_unichar(uch_id),
"FAILURE! x-ht char didn't have top near xht");
new_word->set_text ("");
return 1;
}
if (STRING (chs_non_ambig_bl).contains
(unicharset_boxes.id_to_unichar(uch_id)[0]) &&
((new_word_box.bottom () <
baseline - applybox_error_band * row->x_height ()) ||
(new_word_box.bottom () >
baseline + applybox_error_band * row->x_height ()))) {
report_failed_box (boxfile_lineno, boxfile_charno, box,
unicharset_boxes.id_to_unichar(uch_id),
"FAILURE! non ambig BL char didnt have bottom near baseline");
new_word->set_text ("");
return 1;
}
if (STRING (chs_odd_bot).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
(new_word_box.bottom () >
baseline + applybox_error_band * row->x_height ())) {
report_failed_box (boxfile_lineno, boxfile_charno, box,
unicharset_boxes.id_to_unichar(uch_id),
"FAILURE! Odd bottom char above baseline");
new_word->set_text ("");
return 1;
}
if (STRING (chs_desc).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
(new_word_box.bottom () >
baseline - applybox_error_band * row->x_height ())) {
report_failed_box (boxfile_lineno, boxfile_charno, box,
unicharset_boxes.id_to_unichar(uch_id),
"FAILURE! Descender doesn't descend");
new_word->set_text ("");
return 1;
new_word->set_text ("");
return 1;
}
}
#endif
return 0;
}
else {
report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
"FAILURE! Couldn't find any blobs");
report_failed_box (boxfile_lineno, boxfile_charno, box,
unicharset_boxes.id_to_unichar(uch_id),
"FAILURE! Couldn't find any blobs");
return 1;
}
}
@ -492,7 +531,7 @@ void tidy_up( //
INT16 &unlabelled_words,
INT16 *tgt_char_counts,
INT16 &rebalance_count,
char &min_char,
UNICHAR_ID *min_uch_id,
INT16 &min_samples,
INT16 &final_labelled_blob_count) {
BLOCK_IT block_it(block_list);
@ -507,16 +546,16 @@ void tidy_up( //
BOOL8 row_ok;
BOOL8 rebalance_needed = FALSE;
//No. of unique labelled samples
INT16 labelled_char_counts[128];
INT16 labelled_char_counts[MAX_NUM_CLASSES];
INT16 i;
char ch;
char prev_ch = '\0';
UNICHAR_ID uch_id;
UNICHAR_ID prev_uch_id = -1;
BOOL8 at_dupe_of_prev_word;
ROW *prev_row = NULL;
INT16 left;
INT16 prev_left = -1;
for (i = 0; i < 128; i++)
for (i = 0; i < MAX_NUM_CLASSES; i++)
labelled_char_counts[i] = 0;
ok_char_count = 0;
@ -556,7 +595,7 @@ void tidy_up( //
block_idx, row_idx, all_row_idx);
ok_char_count++;
labelled_char_counts[*word->text ()]++;
labelled_char_counts[unicharset_boxes.unichar_to_id(word->text ())]++;
row_ok = TRUE;
}
}
@ -571,24 +610,24 @@ void tidy_up( //
}
min_samples = 9999;
for (i = 0; i < 128; i++) {
for (i = 0; i < unicharset_boxes.size(); i++) {
if (tgt_char_counts[i] > labelled_char_counts[i]) {
if (labelled_char_counts[i] <= 1) {
tprintf
("APPLY_BOXES: FATALITY - %d labelled samples of \"%c\" - target is %d\n",
labelled_char_counts[i], (char) i, tgt_char_counts[i]);
("APPLY_BOXES: FATALITY - %d labelled samples of \"%s\" - target is %d\n",
labelled_char_counts[i], unicharset_boxes.id_to_unichar(i), tgt_char_counts[i]);
}
else {
rebalance_needed = TRUE;
if (applybox_debug > 0)
tprintf
("APPLY_BOXES: REBALANCE REQD \"%c\" - target of %d from %d labelled samples\n",
(char) i, tgt_char_counts[i], labelled_char_counts[i]);
("APPLY_BOXES: REBALANCE REQD \"%s\" - target of %d from %d labelled samples\n",
unicharset_boxes.id_to_unichar(i), tgt_char_counts[i], labelled_char_counts[i]);
}
}
if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) {
min_samples = labelled_char_counts[i];
min_char = (char) i;
*min_uch_id = i;
}
}
@ -605,33 +644,36 @@ void tidy_up( //
!word_it.cycled_list (); word_it.forward ()) {
word = word_it.data ();
left = word->bounding_box ().left ();
ch = *word->text ();
if (*word->text () != '\0')
uch_id = unicharset_boxes.unichar_to_id(word->text ());
else
uch_id = -1;
at_dupe_of_prev_word = ((row == prev_row) &&
(left = prev_left) &&
(ch == prev_ch));
if ((ch != '\0') &&
(labelled_char_counts[ch] > 1) &&
(tgt_char_counts[ch] > labelled_char_counts[ch]) &&
(uch_id == prev_uch_id));
if ((uch_id != -1) &&
(labelled_char_counts[uch_id] > 1) &&
(tgt_char_counts[uch_id] > labelled_char_counts[uch_id]) &&
(!at_dupe_of_prev_word)) {
/* Duplicate the word to rebalance the labelled samples */
if (applybox_debug > 9) {
tprintf ("Duping \"%c\" from ", ch);
tprintf ("Duping \"%s\" from ", unicharset_boxes.id_to_unichar(uch_id));
word->bounding_box ().print ();
}
duplicate_word = new WERD;
*duplicate_word = *word;
word_it.add_after_then_move (duplicate_word);
rebalance_count++;
labelled_char_counts[ch]++;
labelled_char_counts[uch_id]++;
}
prev_row = row;
prev_left = left;
prev_ch = ch;
prev_uch_id = uch_id;
}
}
}
rebalance_needed = FALSE;
for (i = 0; i < 128; i++) {
for (i = 0; i < unicharset_boxes.size(); i++) {
if ((tgt_char_counts[i] > labelled_char_counts[i]) &&
(labelled_char_counts[i] > 1)) {
rebalance_needed = TRUE;
@ -653,7 +695,7 @@ void tidy_up( //
for (word_it.mark_cycle_pt ();
!word_it.cycled_list (); word_it.forward ()) {
word = word_it.data ();
if ((strlen (word->text ()) == 1) &&
if ((strlen (word->text ()) > 0) &&
(word->gblob_list ()->length () == 1))
final_labelled_blob_count++;
}
@ -665,7 +707,7 @@ void tidy_up( //
void report_failed_box(INT16 boxfile_lineno,
INT16 boxfile_charno,
BOX box,
char *box_ch,
const char *box_ch,
const char *err_msg) {
if (applybox_debug > 4)
tprintf ("APPLY_BOXES: boxfile %1d/%1d/%s ((%1d,%1d),(%1d,%1d)): %s\n",
@ -687,10 +729,9 @@ void apply_box_training(BLOCK_LIST *block_list) {
PBLOB_IT blob_it;
DENORM denorm;
INT16 count = 0;
char ch[2];
ch[1] = '\0';
char unichar[UNICHAR_LEN + 1];
unichar[UNICHAR_LEN] = '\0';
tprintf ("Generating training data\n");
for (block_it.mark_cycle_pt ();
!block_it.cycled_list (); block_it.forward ()) {
@ -701,23 +742,22 @@ void apply_box_training(BLOCK_LIST *block_list) {
for (word_it.mark_cycle_pt ();
!word_it.cycled_list (); word_it.forward ()) {
word = word_it.data ();
if ((strlen (word->text ()) == 1) &&
if ((strlen (word->text ()) > 0) &&
(word->gblob_list ()->length () == 1)) {
/* Here is a word with a single char label and a single blob so train on it */
/* Here is a word with a single unichar label and a single blob so train on it */
bln_word =
make_bln_copy (word, row, row->x_height (), &denorm);
blob_it.set_to_list (bln_word->blob_list ());
ch[0] = *word->text ();
strncpy(unichar, word->text (), UNICHAR_LEN);
tess_training_tester (blob_it.data (),
//single blob
&denorm, TRUE, //correct
ch, //correct ASCII char
1, //ASCII length
unichar, //correct character
strlen(unichar), //character length
NULL);
copy_outword = *(bln_word);
copy_outword.baseline_denormalise (&denorm);
blob_it.set_to_list (copy_outword.blob_list ());
ch[0] = *word->text ();
delete bln_word;
count++;
}
@ -793,7 +833,7 @@ void apply_box_testing(BLOCK_LIST *block_list) {
choice list, outword blob lists and best_choice string are the same
length. A TESS screw up is indicated by a blank filled or 0 length string.
*/
if ((best_choice->string ().length () == 0) ||
if ((best_choice->lengths ().length () == 0) ||
(strspn (best_choice->string ().string (), " ") ==
best_choice->string ().length ())) {
rej_count++;
@ -804,22 +844,22 @@ void apply_box_testing(BLOCK_LIST *block_list) {
#endif
}
else {
if ((best_choice->string ().length () !=
if ((best_choice->lengths ().length () !=
outword->blob_list ()->length ()) ||
(best_choice->string ().length () !=
(best_choice->lengths ().length () !=
blob_choices.length ())) {
tprintf
("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
best_choice->string ().string (),
best_choice->string ().length (),
best_choice->lengths ().length (),
outword->blob_list ()->length (),
blob_choices.length ());
}
ASSERT_HOST (best_choice->string ().length () ==
ASSERT_HOST (best_choice->lengths ().length () ==
outword->blob_list ()->length ());
ASSERT_HOST (best_choice->string ().length () ==
ASSERT_HOST (best_choice->lengths ().length () ==
blob_choices.length ());
fix_quotes ((char *) best_choice->string ().string (),
fix_quotes (best_choice,
//turn to double
outword, &blob_choices);
if (strcmp (best_choice->string ().string (), ch) != 0) {

View File

@ -27,6 +27,7 @@
#include "applybox.h"
#include "pgedit.h"
#include "varabled.h"
#include "output.h"
#include "adaptmatch.h"
BOOL_VAR(tessedit_resegment_from_boxes, FALSE,
@ -37,6 +38,8 @@ BOOL_VAR(tessedit_train_from_boxes, FALSE,
// Minimum sensible image size to be worth running tesseract.
const int kMinRectSize = 10;
static STRING input_file = "noname.tif";
// Start tesseract.
// The datapath must be the name of the data directory or some other file
// in which the data directory resides (for instance argv[0].)
@ -70,6 +73,12 @@ int TessBaseAPI::InitWithLanguage(const char* datapath, const char* outputbase,
return result;
}
// Set the name of the input file. Needed only for training and
// loading a UNLV zone file.
// The name is stored in the file-scope STRING input_file (which starts out
// as "noname.tif"); FindLines later hands that STRING to pgeditor_read_file.
// NOTE(review): presumably name must be non-NULL - confirm against the
// STRING assignment operator.
void TessBaseAPI::SetInputName(const char* name) {
input_file = name;
}
// Recognize a rectangle from an image and return the result as a string.
// May be called many times for a single Init.
// Currently has no error checking.
@ -96,6 +105,52 @@ char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
return RecognizeToString();
}
// As TesseractRect but produces a box file as output.
char* TessBaseAPI::TesseractRectBoxes(const unsigned char* imagedata,
int bytes_per_pixel,
int bytes_per_line,
int left, int top,
int width, int height,
int imageheight) {
if (width < kMinRectSize || height < kMinRectSize)
return NULL; // Nothing worth doing.
// Copy/Threshold the image to the tesseract global page_image.
CopyImageToTesseract(imagedata, bytes_per_pixel, bytes_per_line,
left, top, width, height);
BLOCK_LIST block_list;
FindLines(&block_list);
// Now run the main recognition.
PAGE_RES* page_res = Recognize(&block_list, NULL);
return TesseractToBoxText(page_res, left, imageheight - (top + height));
}
char* TessBaseAPI::TesseractRectUNLV(const unsigned char* imagedata,
int bytes_per_pixel,
int bytes_per_line,
int left, int top,
int width, int height) {
if (width < kMinRectSize || height < kMinRectSize)
return NULL; // Nothing worth doing.
// Copy/Threshold the image to the tesseract global page_image.
CopyImageToTesseract(imagedata, bytes_per_pixel, bytes_per_line,
left, top, width, height);
BLOCK_LIST block_list;
FindLines(&block_list);
// Now run the main recognition.
PAGE_RES* page_res = Recognize(&block_list, NULL);
return TesseractToUNLV(page_res);
}
// Call between pages or documents etc to free up memory and forget
// adaptive data.
void TessBaseAPI::ClearAdaptiveClassifier() {
@ -326,7 +381,7 @@ void TessBaseAPI::CopyBinaryRect(const unsigned char* imagedata,
image.capture(const_cast<unsigned char*>(imagedata),
bytes_per_line*8, top + height, 1);
page_image.create(width, height, 1);
copy_sub_image(&image, left, top, width, height, &page_image, 0, 0, false);
copy_sub_image(&image, left, 0, width, height, &page_image, 0, 0, false);
}
// Low-level function to recognize the current global image to a string.
@ -343,7 +398,6 @@ char* TessBaseAPI::RecognizeToString() {
// Find lines from the image making the BLOCK_LIST.
void TessBaseAPI::FindLines(BLOCK_LIST* block_list) {
STRING input_file = "noname.tif";
// The following call creates a full-page block and then runs connected
// component analysis and text line creation.
pgeditor_read_file(input_file, block_list);
@ -369,21 +423,32 @@ PAGE_RES* TessBaseAPI::Recognize(BLOCK_LIST* block_list, ETEXT_DESC* monitor) {
return page_res;
}
// Compute an upper bound on the number of chars the output text for
// page_res may occupy.
// The count starts at 2; every word that has a best_choice then adds the
// length of its string plus 1, and 1 more for each rejected entry in the
// word's reject_map. Words without a best_choice add nothing.
int TessBaseAPI::TextLength(PAGE_RES* page_res) {
  int max_chars = 2;
  PAGE_RES_IT res_it(page_res);

  // Walk every word on the page.
  res_it.restart_page();
  while (res_it.word() != NULL) {
    WERD_RES* word_res = res_it.word();
    WERD_CHOICE* best = word_res->best_choice;
    if (best != NULL) {
      max_chars += best->string().length() + 1;
      int pos = 0;
      while (pos < word_res->reject_map.length()) {
        if (word_res->reject_map[pos].rejected())
          max_chars += 1;
        ++pos;
      }
    }
    res_it.forward();
  }
  return max_chars;
}
// Make a text string from the internal data structures.
// The input page_res is deleted.
char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) {
if (page_res != NULL) {
int total_length = 2;
int total_length = TextLength(page_res);
PAGE_RES_IT page_res_it(page_res);
// Iterate over the data structures to extract the recognition result.
for (page_res_it.restart_page(); page_res_it.word () != NULL;
page_res_it.forward()) {
WERD_RES *word = page_res_it.word();
WERD_CHOICE* choice = word->best_choice;
if (choice != NULL) {
total_length += choice->string().length() + 1;
}
}
char* result = new char[total_length];
char* ptr = result;
for (page_res_it.restart_page(); page_res_it.word () != NULL;
@ -406,3 +471,207 @@ char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) {
}
return NULL;
}
// Write the box-file lines for one word into word_str and return the number
// of chars written (the terminating NUL left by sprintf is not counted).
// One line is produced per blob of the denormalised copy of word->outword:
// the bytes of the matching classification unit from word->best_choice,
// followed by " left bottom right top\n" with the box shifted by
// (left, bottom).
// row is not used. word_str is written without any bounds check, so the
// caller must supply a buffer that is large enough.
static int ConvertWordToBoxText(WERD_RES *word,
                                ROW_RES* row,
                                int left,
                                int bottom,
                                char* word_str) {
  // Copy the output word and denormalize it back to image coords.
  WERD denormed_word;
  denormed_word = *(word->outword);
  denormed_word.baseline_denormalise(&word->denorm);

  PBLOB_IT blob_it;
  blob_it.set_to_list(denormed_word.blob_list());
  const int blob_count = denormed_word.blob_list()->length();

  int chars_written = 0;
  int blob_index = 0;
  int byte_offset = 0;  // Start of the current unit in best_choice->string().
  while (blob_index < blob_count) {
    PBLOB* blob = blob_it.data();
    BOX blob_box = blob->bounding_box();

    // Bounding boxes can be illegal when tess fails on a word.
    const bool box_unusable =
        word->tess_failed ||
        blob_box.left() < 0 ||
        blob_box.right() > page_image.get_xsize() ||
        blob_box.bottom() < 0 ||
        blob_box.top() > page_image.get_ysize();
    if (box_unusable) {
      blob_box = word->word->bounding_box();  // Use original word as backup.
      tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n",
              blob_box.left(), blob_box.bottom(),
              blob_box.right(), blob_box.top());
    }

    // A single classification unit can be composed of several UTF-8
    // characters. Append each of them to the result.
    const int unit_bytes = word->best_choice->lengths()[blob_index];
    for (int sub = 0; sub < unit_bytes; ++sub) {
      char ch = word->best_choice->string()[byte_offset + sub];
      // Tesseract uses space for recognition failure. Fix to a reject
      // character, '~' so we don't create illegal box files.
      if (ch == ' ')
        ch = '~';
      word_str[chars_written++] = ch;
    }

    // Append the box coordinates and account for what was just written.
    char* tail = word_str + chars_written;
    sprintf(tail, " %d %d %d %d\n",
            blob_box.left() + left, blob_box.bottom() + bottom,
            blob_box.right() + left, blob_box.top() + bottom);
    chars_written += strlen(tail);

    // Advance to the next unit and the next blob.
    byte_offset += unit_bytes;
    ++blob_index;
    blob_it.forward();
  }
  return chars_written;
}
// Multiplier for textlength assumes 4 numbers @ 5 digits and a space
// plus the newline and the original character = 4*(5+1)+2
const int kMaxCharsPerChar = 26;
// Convert the recognition results in page_res into box-file text, as needed
// for training, and delete page_res.
// left and bottom are added to every box coordinate that is written.
// Returns a NUL-terminated buffer allocated with new char[], or NULL when
// page_res is NULL.
char* TessBaseAPI::TesseractToBoxText(PAGE_RES* page_res,
                                      int left, int bottom) {
  if (page_res == NULL) {
    return NULL;
  }

  // Size the buffer from the text-length bound, scaled for the box numbers.
  const int buffer_size = TextLength(page_res) * kMaxCharsPerChar;
  PAGE_RES_IT res_it(page_res);
  char* box_text = new char[buffer_size];
  char* write_pos = box_text;

  // Append the box lines of each word in turn.
  res_it.restart_page();
  while (res_it.word () != NULL) {
    WERD_RES* current_word = res_it.word();
    write_pos += ConvertWordToBoxText(current_word, res_it.row(),
                                      left, bottom, write_pos);
    res_it.forward();
  }
  *write_pos = '\0';

  delete page_res;
  return box_text;
}
// Make a text string from the internal data structures.
// The input page_res is deleted. The text string is converted
// to UNLV-format: Latin-1 with specific reject and suspect codes.
// (The comment above describes TesseractToUNLV; the constants below are the
// tables it uses.)
// Character written to the output in place of anything unrecognized.
const char kUnrecognized = '~';
// Conversion table for non-latin characters.
// Maps characters out of the latin set into the latin set.
// TODO(rays) incorporate this translation into unicharset.
// Entries are Unicode code points, in order: euro sign, left and right
// double quotation marks, left and right single quotation marks, bullet,
// em dash. The final 0 terminates the table.
const int kUniChs[] = {
0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0
};
// Latin chars corresponding to the unicode chars above.
// Entries pair up by index with kUniChs, in order: cent sign, quotation
// mark (twice), apostrophe (twice), middle dot, hyphen-minus, then the 0
// terminator.
const int kLatinChs[] = {
0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0
};
// Make a UNLV-format text string (Latin-1 with specific reject and suspect
// codes) from the internal data structures. The input page_res is deleted.
//
// Output rules, word by word:
//  - A word whose unlv_crunch_mode is not CR_NONE is written as at most one
//    kUnrecognized character, optionally preceded by a space; runs of such
//    words do not produce adjacent reject characters.
//  - In other words, a unit starting with ' ', '~' or '|' is written as
//    kUnrecognized; any other unit is written as a single Latin-1 byte,
//    preceded by '^' if its reject_map entry is rejected. Code points listed
//    in kUniChs are first mapped to the matching kLatinChs entry, and
//    anything still above 0xff becomes kUnrecognized.
//  - A newline follows a word flagged W_EOL unless one was just written.
//
// The buffer is sized with TextLength(page_res). Returns a NUL-terminated
// buffer allocated with new char[], or NULL when page_res is NULL.
char* TessBaseAPI::TesseractToUNLV(PAGE_RES* page_res) {
  bool tilde_crunch_written = false;
  bool last_char_was_newline = true;
  bool last_char_was_tilde = false;

  if (page_res != NULL) {
    int total_length = TextLength(page_res);
    PAGE_RES_IT page_res_it(page_res);
    char* result = new char[total_length];
    char* ptr = result;
    for (page_res_it.restart_page(); page_res_it.word () != NULL;
         page_res_it.forward()) {
      WERD_RES *word = page_res_it.word();
      // Process the current word.
      if (word->unlv_crunch_mode != CR_NONE) {
        if (word->unlv_crunch_mode != CR_DELETE &&
            (!tilde_crunch_written ||
             (word->unlv_crunch_mode == CR_KEEP_SPACE &&
              word->word->space () > 0 &&
              !word->word->flag (W_FUZZY_NON) &&
              !word->word->flag (W_FUZZY_SP)))) {
          if (!word->word->flag (W_BOL) &&
              word->word->space () > 0 &&
              !word->word->flag (W_FUZZY_NON) &&
              !word->word->flag (W_FUZZY_SP)) {
            /* Write a space to separate from preceding good text */
            *ptr++ = ' ';
            last_char_was_tilde = false;
          }
          if (!last_char_was_tilde) {
            // Write a reject char.
            last_char_was_tilde = true;
            *ptr++ = kUnrecognized;
            tilde_crunch_written = true;
            last_char_was_newline = false;
          }
        }
      } else {
        // NORMAL PROCESSING of non tilde crunched words.
        tilde_crunch_written = false;

        if (last_char_was_tilde &&
            word->word->space () == 0 &&
            (word->best_choice->string ()[0] == ' ')) {
          /* Prevent adjacent tilde across words - we know that adjacent tildes within
             words have been removed */
          // Drop the leading reject from the string and from its lengths.
          // Source and destination overlap, so memmove is required here:
          // strcpy on overlapping objects is undefined behaviour.
          char* p = (char *) word->best_choice->string().string ();
          memmove (p, p + 1, strlen (p + 1) + 1);  //shuffle up
          p = (char *) word->best_choice->lengths().string ();
          memmove (p, p + 1, strlen (p + 1) + 1);  //shuffle up
          word->reject_map.remove_pos (0);
          PBLOB_IT blob_it = word->outword->blob_list ();
          delete blob_it.extract ();  //get rid of reject blob
        }

        if (word->word->flag(W_REP_CHAR) && tessedit_consistent_reps)
          ensure_rep_chars_are_consistent(word);

        set_unlv_suspects(word);
        const char* wordstr = word->best_choice->string().string();
        if (wordstr[0] != 0) {
          // Separate from the previous word unless at the start of a line.
          if (!last_char_was_newline)
            *ptr++ = ' ';
          else
            last_char_was_newline = false;

          // Walk the word one classification unit at a time; lengths[i] is
          // the number of bytes of wordstr that unit i occupies.
          int offset = 0;
          const STRING& lengths = word->best_choice->lengths();
          int length = lengths.length();
          for (int i = 0; i < length; offset += lengths[i++]) {
            if (wordstr[offset] == ' ' ||
                wordstr[offset] == '~' ||
                wordstr[offset] == '|') {
              *ptr++ = kUnrecognized;
              last_char_was_tilde = true;
            } else {
              if (word->reject_map[i].rejected())
                *ptr++ = '^';
              UNICHAR ch(wordstr + offset, lengths[i]);
              int uni_ch = ch.first_uni();
              // Map known non-latin code points into the latin set.
              for (int j = 0; kUniChs[j] != 0; ++j) {
                if (kUniChs[j] == uni_ch) {
                  uni_ch = kLatinChs[j];
                  break;
                }
              }
              if (uni_ch <= 0xff) {
                *ptr++ = static_cast<char>(uni_ch);
                last_char_was_tilde = false;
              } else {
                // Not representable as a single Latin-1 byte.
                *ptr++ = kUnrecognized;
                last_char_was_tilde = true;
              }
            }
          }
        }
      }

      if (word->word->flag(W_EOL) && !last_char_was_newline) {
        /* Add a new line output */
        *ptr++ = '\n';
        tilde_crunch_written = false;
        last_char_was_newline = true;
        last_char_was_tilde = false;
      }
    }
    *ptr++ = '\n';
    *ptr = '\0';
    delete page_res;
    return result;
  }
  return NULL;
}

View File

@ -20,8 +20,6 @@
#ifndef THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__
#define THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__
#include <string>
class PAGE_RES;
class BLOCK_LIST;
@ -56,6 +54,10 @@ class TessBaseAPI {
const char* language, const char* configfile,
bool numeric_mode, int argc, char* argv[]);
// Set the name of the input file. Needed only for training and
// reading a UNLV zone file.
static void SetInputName(const char* name);
// Recognize a rectangle from an image and return the result as a string.
// May be called many times for a single Init.
// Currently has no error checking.
@ -71,6 +73,19 @@ class TessBaseAPI {
int bytes_per_pixel,
int bytes_per_line,
int left, int top, int width, int height);
// As TesseractRect but produces a box file as output.
// Image height is needed as well as rect height, since output y-coords
// will be relative to the bottom of the image.
static char* TesseractRectBoxes(const unsigned char* imagedata,
int bytes_per_pixel,
int bytes_per_line,
int left, int top, int width, int height,
int imageheight);
// As TesseractRect but produces UNLV-style output.
static char* TesseractRectUNLV(const unsigned char* imagedata,
int bytes_per_pixel,
int bytes_per_line,
int left, int top, int width, int height);
// Call between pages or documents etc to free up memory and forget
// adaptive data.
@ -153,8 +168,18 @@ class TessBaseAPI {
static PAGE_RES* Recognize(BLOCK_LIST* block_list,
struct ETEXT_STRUCT* monitor);
// Return the maximum length that the output text string might occupy.
static int TextLength(PAGE_RES* page_res);
// Convert (and free) the internal data structures into a text string.
static char* TesseractToText(PAGE_RES* page_res);
// Make a text string from the internal data structures.
// The input page_res is deleted.
// The text string takes the form of a box file as needed for training.
static char* TesseractToBoxText(PAGE_RES* page_res, int left, int bottom);
// Make a text string from the internal data structures.
// The input page_res is deleted. The text string is converted
// to UNLV-format: Latin-1 with specific reject and suspect codes.
static char* TesseractToUNLV(PAGE_RES* page_res);
};
#endif // THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__

View File

@ -35,6 +35,7 @@
#include "docqual.h"
#include "output.h"
#include "bestfirst.h"
#include "globals.h"
#define EXTERN
@ -55,12 +56,12 @@ EXTERN BOOL_EVAR (tessedit_write_block_separators, FALSE,
"Write block separators in output");
EXTERN BOOL_VAR (tessedit_write_raw_output, FALSE,
"Write raw stuff to name.raw");
EXTERN BOOL_EVAR (tessedit_write_output, TRUE, "Write text to name.txt");
EXTERN BOOL_EVAR (tessedit_write_output, FALSE, "Write text to name.txt");
EXTERN BOOL_EVAR (tessedit_write_ratings, FALSE,
"Return ratings in IPEOCRAPI data");
EXTERN BOOL_EVAR (tessedit_write_txt_map, TRUE,
EXTERN BOOL_EVAR (tessedit_write_txt_map, FALSE,
"Write .txt to .etx map file");
EXTERN BOOL_EVAR (tessedit_write_rep_codes, TRUE,
EXTERN BOOL_EVAR (tessedit_write_rep_codes, FALSE,
"Write repetition char code");
EXTERN BOOL_EVAR (tessedit_write_unlv, FALSE, "Write .unlv output file");
EXTERN STRING_EVAR (unrecognised_char, "|",
@ -106,7 +107,6 @@ INT32 pixels_to_pts( //convert coords
return (INT32) (pts + 0.5); //round it
}
void output_pass( //Tess output pass //send to api
PAGE_RES_IT &page_res_it,
BOOL8 write_to_shm,
@ -119,8 +119,7 @@ void output_pass( //Tess output pass //send to api
if (tessedit_write_txt_map)
txt_mapfile = open_outfile (".map");
if (tessedit_write_unlv)
unlv_file = open_outfile (".unlv");
page_res_it.restart_page ();
block_of_last_word = NULL;
while (page_res_it.word () != NULL) {
@ -189,7 +188,6 @@ void output_pass( //Tess output pass //send to api
}
}
/*************************************************************************
* write_results()
*
@ -211,9 +209,10 @@ void write_results( //output a word
) {
//word to do
WERD_RES *word = page_res_it.word ();
WERD_CHOICE *ep_choice; //ep format
// WERD_CHOICE *ep_choice; //ep format
STRING repetition_code;
const STRING *wordstr;
STRING wordstr_lengths;
const char *text;
int i;
char unrecognised = STRING (unrecognised_char)[0];
@ -312,15 +311,12 @@ void write_results( //output a word
if (tessedit_write_output && !NO_BLOCK)
fprintf (textfile, "%s", txt_chs);
if (tessedit_write_unlv)
fprintf (unlv_file, "%s", txt_chs);
if (tessedit_write_txt_map)
fprintf (txt_mapfile, "%s", map_chs);
//terminate string
ep_chars[ep_chars_index] = '\0';
word->ep_choice = new WERD_CHOICE (ep_chars, 0, 0, NO_PERM);
word->ep_choice = new WERD_CHOICE (ep_chars, NULL, 0, 0, NO_PERM);
if (force_eol)
empty_block = TRUE;
@ -345,6 +341,8 @@ void write_results( //output a word
words have been removed */
ptr = (char *) word->best_choice->string ().string ();
strcpy (ptr, ptr + 1); //shuffle up
ptr = (char *) word->best_choice->lengths ().string ();
strcpy (ptr, ptr + 1); //shuffle up
word->reject_map.remove_pos (0);
blob_it = word->outword->blob_list ();
delete blob_it.extract (); //get rid of reject blob
@ -354,8 +352,10 @@ void write_results( //output a word
last_char_was_tilde = FALSE;
else {
if (word->reject_map.length () > 0) {
if (word->best_choice->string ()[word->reject_map.length () - 1] ==
' ')
for (i = 0, ptr = (char *) word->best_choice->string().string();
i < word->reject_map.length () - 1; ++i)
ptr += word->best_choice->lengths()[i];
if (*ptr == ' ')
last_char_was_tilde = TRUE;
else
last_char_was_tilde = FALSE;
@ -365,7 +365,7 @@ void write_results( //output a word
/* else it is unchanged as there are no output chars */
}
ptr = (char *) word->best_choice->string ().string ();
ptr = (char *) word->best_choice->lengths ().string ();
ASSERT_HOST (strlen (ptr) == word->reject_map.length ());
if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps)
@ -379,21 +379,26 @@ void write_results( //output a word
dict_word (word->best_choice->string ().string ()));
}
#if 0
if (tessedit_write_unlv) {
write_unlv_text(word);
}
#endif
if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
repetition_code = "|^~R";
repetition_code += get_rep_char (word);
wordstr_lengths = "\001\001\001\001";
repetition_code += unicharset.id_to_unichar(get_rep_char (word));
wordstr_lengths += strlen(unicharset.id_to_unichar(get_rep_char (word)));
wordstr = &repetition_code;
}
else {
wordstr = &(word->best_choice->string ());
wordstr_lengths = word->best_choice->lengths ();
if (tessedit_zero_rejection) {
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
text = wordstr->string ();
for (i = 0; text[i] != '\0'; i++) {
for (i = 0; *text != '\0'; text += word->best_choice->lengths()[i++]) {
if (word->reject_map[i].rejected ())
word->reject_map[i].setrej_minimal_rej_accept ();
}
@ -401,8 +406,8 @@ void write_results( //output a word
if (tessedit_minimal_rejection) {
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
text = wordstr->string ();
for (i = 0; text[i] != '\0'; i++) {
if ((text[i] != ' ') && word->reject_map[i].rejected ())
for (i = 0; *text != '\0'; text += word->best_choice->lengths()[i++]) {
if ((*text != ' ') && word->reject_map[i].rejected ())
word->reject_map[i].setrej_minimal_rej_accept ();
}
}
@ -410,8 +415,9 @@ void write_results( //output a word
if (write_to_shm)
write_shm_text (word, page_res_it.block ()->block,
page_res_it.row (), *wordstr);
page_res_it.row (), *wordstr, wordstr_lengths);
#if 0
if (tessedit_write_output)
write_cooked_text (word->word, *wordstr, TRUE, FALSE, textfile);
@ -424,12 +430,12 @@ void write_results( //output a word
ep_choice = make_epaper_choice (word, newline_type);
word->ep_choice = ep_choice;
#endif
character_count += word->best_choice->string ().length ();
character_count += word->best_choice->lengths ().length ();
word_count++;
}
/**********************************************************************
* make_epaper_choice
*
@ -437,6 +443,7 @@ void write_results( //output a word
* determine whether each blob should be rejected.
**********************************************************************/
#if 0
WERD_CHOICE *make_epaper_choice( //convert one word
WERD_RES *word, //word to do
char newline_type //type of newline
@ -482,7 +489,8 @@ WERD_CHOICE *make_epaper_choice( //convert one word
if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
strcpy (word_string + index, "|^~R");
index += 4;
word_string[index++] = get_rep_char (word);
strcpy(word_string + index, unicharset.id_to_unichar(get_rep_char (word)));
index += strlen(unicharset.id_to_unichar(get_rep_char (word)));
}
else {
if (!blob_it.empty ())
@ -537,7 +545,7 @@ WERD_CHOICE *make_epaper_choice( //convert one word
ASSERT_HOST (strlen (word_string) == index);
return new WERD_CHOICE (word_string, 0, 0, NO_PERM);
}
#endif
/**********************************************************************
* make_reject
@ -653,6 +661,7 @@ char determine_newline_type( //test line ends
* to the given file.
**********************************************************************/
#if 0
void write_cooked_text( //write output
WERD *word, //word to do
const STRING &text, //text to write
@ -749,6 +758,7 @@ void write_cooked_text( //write output
if (status != 0)
WRITEFAILED.error ("write_cooked_text", EXIT, "Fflush Errno: %d", errno);
}
#endif
/**********************************************************************
@ -761,7 +771,8 @@ void write_shm_text( //write output
WERD_RES *word, //word to do
BLOCK *block, //block it is from
ROW_RES *row, //row it is from
const STRING &text //text to write
const STRING &text, //text to write
const STRING &text_lengths
) {
INT32 index; //char counter
INT32 index2; //char counter
@ -777,6 +788,8 @@ void write_shm_text( //write output
WERD copy_outword; // copy to denorm
UINT32 rating; //of char
BOOL8 lineend; //end of line
int offset;
int offset2;
//point size
ptsize = pixels_to_pts ((INT32) (row->row->x_height () + row->row->ascenders () - row->row->descenders ()), 300);
@ -786,13 +799,14 @@ void write_shm_text( //write output
copy_outword = *(word->outword);
copy_outword.baseline_denormalise (&word->denorm);
blob_it.set_to_list (copy_outword.blob_list ());
length = text.length ();
length = text_lengths.length ();
if (length > 0) {
blanks = word->word->space ();
if (blanks == 0 && tessedit_word_for_word && !word->word->flag (W_BOL))
blanks = 1;
for (index = 0; index < length; index++, blob_it.forward ()) {
for (index = 0, offset = 0; index < length;
offset += text_lengths[index++], blob_it.forward ()) {
blob = blob_it.data ();
blob_box = blob->bounding_box ();
@ -804,7 +818,7 @@ void write_shm_text( //write output
if (tessedit_write_ratings)
rating = (UINT32) (-word->best_choice->certainty () / 0.035);
else if (tessedit_zero_rejection)
rating = text[index] == ' ' ? 100 : 0;
rating = text[offset] == ' ' ? 100 : 0;
else
rating = word->reject_map[index].accepted ()? 0 : 100;
if (rating > 255)
@ -819,22 +833,41 @@ void write_shm_text( //write output
lineend = word->word->flag (W_EOL) && index == length - 1;
if (word->word->flag (W_EOL) && tessedit_zero_rejection
&& index < length - 1 && text[index + 1] == ' ') {
for (index2 = index + 1; index2 < length && text[index2] == ' ';
index2++);
&& index < length - 1 && text[index + text_lengths[index]] == ' ') {
for (index2 = index + 1, offset2 = offset + text_lengths[index];
index2 < length && text[offset2] == ' ';
offset2 += text_lengths[index2++]);
if (index2 == length)
lineend = TRUE;
}
if (!tessedit_zero_rejection || text[index] != ' '
if (!tessedit_zero_rejection || text[offset] != ' '
|| tessedit_word_for_word) {
//confidence
ocr_append_char (text[index] == ' ' ? unrecognised : text[index], blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font, (UINT8) rating,
ptsize, //point size
blanks, enhancement, //enhancement
OCR_CDIR_LEFT_RIGHT,
OCR_LDIR_DOWN_RIGHT,
lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
if (text[offset] == ' ') {
ocr_append_char (unrecognised,
blob_box.left (), blob_box.right (),
page_image.get_ysize () - 1 - blob_box.top (),
page_image.get_ysize () - 1 - blob_box.bottom (),
font, (UINT8) rating,
ptsize, //point size
blanks, enhancement, //enhancement
OCR_CDIR_LEFT_RIGHT,
OCR_LDIR_DOWN_RIGHT,
lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
} else {
for (int suboffset = 0; suboffset < text_lengths[index]; ++suboffset)
ocr_append_char (text[offset + suboffset],
blob_box.left (), blob_box.right (),
page_image.get_ysize () - 1 - blob_box.top (),
page_image.get_ysize () - 1 - blob_box.bottom (),
font, (UINT8) rating,
ptsize, //point size
blanks, enhancement, //enhancement
OCR_CDIR_LEFT_RIGHT,
OCR_LDIR_DOWN_RIGHT,
lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
}
blanks = 0;
}
@ -863,13 +896,17 @@ void write_shm_text( //write output
lineend = word->word->flag (W_EOL);
//font index
ocr_append_char (unrecognised, blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font,
rating, //confidence
ptsize, //point size
blanks, enhancement, //enhancement
OCR_CDIR_LEFT_RIGHT,
OCR_LDIR_DOWN_RIGHT,
lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
ocr_append_char (unrecognised,
blob_box.left (), blob_box.right (),
page_image.get_ysize () - 1 - blob_box.top (),
page_image.get_ysize () - 1 - blob_box.bottom (),
font,
rating, //confidence
ptsize, //point size
blanks, enhancement, //enhancement
OCR_CDIR_LEFT_RIGHT,
OCR_LDIR_DOWN_RIGHT,
lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
}
}
@ -888,6 +925,7 @@ void write_shm_text( //write output
* newdiff needs etx files!
**********************************************************************/
#if 0
void write_map( //output a map file
FILE *mapfile, //mapfile to write to
WERD_RES *word) {
@ -937,6 +975,7 @@ void write_map( //output a map file
if (status != 0)
WRITEFAILED.error ("write_map", EXIT, "fflush Errno: %d", errno);
}
#endif
/*************************************************************************
@ -957,6 +996,7 @@ FILE *open_outfile( //open .map & .unlv file
}
#if 0
void write_unlv_text(WERD_RES *word) {
const char *wordstr;
@ -1015,6 +1055,7 @@ void write_unlv_text(WERD_RES *word) {
if (status != 0)
WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno);
}
#endif
/*************************************************************************
@ -1022,21 +1063,24 @@ void write_unlv_text(WERD_RES *word) {
* Return the first accepted character from the repetition string. This is the
* character which is repeated - as determined earlier by fix_rep_char()
*************************************************************************/
char get_rep_char( // what char is repeated?
WERD_RES *word) {
UNICHAR_ID get_rep_char(WERD_RES *word) { // what char is repeated?
int i;
int offset;
for (i = 0;
for (i = 0, offset = 0;
((i < word->reject_map.length ()) &&
(word->reject_map[i].rejected ())); i++);
(word->reject_map[i].rejected ()));
offset += word->best_choice->lengths()[i++]);
if (i < word->reject_map.length ())
return word->best_choice->string ()[i];
return unicharset.unichar_to_id(word->best_choice->string().string()
+ offset,
word->best_choice->lengths()[i]);
else
return STRING (unrecognised_char)[0];
return unicharset.unichar_to_id(unrecognised_char.string());
}
void ensure_rep_chars_are_consistent(WERD_RES *word) {
#if 0
char rep_char = get_rep_char (word);
char *ptr;
@ -1045,8 +1089,24 @@ void ensure_rep_chars_are_consistent(WERD_RES *word) {
if (*ptr != rep_char)
*ptr = rep_char;
}
}
#endif
#if 0
UNICHAR_ID rep_char = get_rep_char (word); //TODO(tkielbus) Reactivate
int i;
char *ptr;
STRING consistent_string;
STRING consistent_string_lengths;
ptr = (char *) word->best_choice->string ().string ();
for (i = 0; *ptr != '\0'; ptr += word->best_choice->lengths()[i++]) {
consistent_string += unicharset.id_to_unichar(rep_char);
consistent_string_lengths += strlen(unicharset.id_to_unichar(rep_char));
}
word->best_choice->string() = consistent_string;
word->best_choice->lengths() = consistent_string_lengths;
#endif
}
/*************************************************************************
* SUSPECT LEVELS
@ -1062,7 +1122,9 @@ void ensure_rep_chars_are_consistent(WERD_RES *word) {
void set_unlv_suspects(WERD_RES *word) {
int len = word->reject_map.length ();
int i;
int offset;
const char *ptr;
const char *lengths = word->best_choice->lengths ().string ();
float rating_per_ch;
ptr = word->best_choice->string ().string ();
@ -1080,10 +1142,12 @@ void set_unlv_suspects(WERD_RES *word) {
/* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
if (safe_dict_word (ptr) && (count_alphas (ptr) > suspect_short_words)) {
if (safe_dict_word (ptr) && (count_alphas (ptr, lengths) >
suspect_short_words)) {
/* Unreject alphas in dictionary words */
for (i = 0; i < len; i++) {
if (word->reject_map[i].rejected () && isalpha (ptr[i]))
for (i = 0, offset = 0; i < len; offset += lengths[i++]) {
if (word->reject_map[i].rejected () &&
unicharset.get_isalpha (ptr + offset, lengths[i]))
word->reject_map[i].setrej_minimal_rej_accept ();
}
}
@ -1095,8 +1159,8 @@ void set_unlv_suspects(WERD_RES *word) {
if ((word->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
/* Unreject any Tess Acceptable word - but NOT tess reject chs*/
for (i = 0; i < len; i++) {
if (word->reject_map[i].rejected () && (ptr[i] != ' '))
for (i = 0, offset = 0; i < len; offset += lengths[i++]) {
if (word->reject_map[i].rejected () && (ptr[offset] != ' '))
word->reject_map[i].setrej_minimal_rej_accept ();
}
}
@ -1130,9 +1194,11 @@ void set_unlv_suspects(WERD_RES *word) {
}
}
if ((acceptable_word_string (word->best_choice->string ().string ())
if ((acceptable_word_string (word->best_choice->string ().string (),
word->best_choice->lengths ().string ())
!= AC_UNACCEPTABLE) ||
acceptable_number_string (word->best_choice->string ().string ())) {
acceptable_number_string (word->best_choice->string ().string (),
word->best_choice->lengths ().string ())) {
if (word->reject_map.length () > suspect_short_words) {
for (i = 0; i < len; i++) {
if (word->reject_map[i].rejected () &&
@ -1149,11 +1215,12 @@ void set_unlv_suspects(WERD_RES *word) {
INT16 count_alphas( //how many alphas
const char *s) {
const char *s,
const char *lengths) {
int count = 0;
for (; *s != '\0'; s++) {
if (isalpha (*s))
for (; *s != '\0'; s += *(lengths++)) {
if (unicharset.get_isalpha(s, *lengths))
count++;
}
return count;
@ -1161,36 +1228,43 @@ INT16 count_alphas( //how many alphas
INT16 count_alphanums( //how many alphanums
const char *s) {
const char *s,
const char *lengths) {
int count = 0;
for (; *s != '\0'; s++) {
if (isalnum (*s))
for (; *s != '\0'; s += *(lengths++)) {
if (unicharset.get_isalpha(s, *lengths) ||
unicharset.get_isdigit(s, *lengths))
count++;
}
return count;
}
BOOL8 acceptable_number_string(const char *s) {
BOOL8 acceptable_number_string(const char *s,
const char *lengths) {
BOOL8 prev_digit = FALSE;
if (*s == '(')
if (*lengths == 1 && *s == '(')
s++;
if ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))
if (*lengths == 1 &&
((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
s++;
for (; *s != '\0'; s++) {
if (isdigit (*s))
for (; *s != '\0'; s += *(lengths++)) {
if (unicharset.get_isdigit (s, *lengths))
prev_digit = TRUE;
else if (prev_digit && ((*s == '.') || (*s == ',') || (*s == '-')))
prev_digit = FALSE;
else if (prev_digit &&
(*(s + 1) == '\0') && ((*s == '%') || (*s == ')')))
(*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
prev_digit = FALSE;
else if (prev_digit && *lengths == 1 &&
(*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
return TRUE;
else if (prev_digit &&
(*s == '%') && (*(s + 1) == ')') && (*(s + 2) == '\0'))
*lengths == 1 && (*s == '%') &&
(*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
(*(s + *lengths + *(lengths + 1)) == '\0'))
return TRUE;
else
return FALSE;

View File

@ -31,7 +31,9 @@
#include "stderr.h"
#include "notdll.h"
#include "mainblk.h"
#include "output.h"
#include "globals.h"
#include "blread.h"
#include "tfacep.h"
#include "callnet.h"
@ -40,7 +42,10 @@
#define API_CONFIG "configs/api_config"
#define EXTERN
EXTERN BOOL_VAR (tessedit_create_boxfile, FALSE, "Output text with boxes");
EXTERN BOOL_VAR (tessedit_read_image, TRUE, "Ensure the image is read");
EXTERN INT_VAR (tessedit_serial_unlv, 0,
"0->Whole page, 1->serial no adapt, 2->serial with adapt");
EXTERN BOOL_VAR (tessedit_write_images, FALSE,
"Capture the image from the IPE");
EXTERN BOOL_VAR (tessedit_debug_to_screen, FALSE, "Dont use debug file");
@ -63,15 +68,30 @@ int main(int argc, char **argv) {
if (argc < 3) {
USAGE.error (argv[0], EXIT,
"%s imagename outputbase [configfile [[+|-]varfile]...]\n", argv[0]);
"%s imagename outputbase [-l lang] [configfile [[+|-]varfile]...]\n",
argv[0]);
}
// Find the required language.
const char* lang = "eng";
int arg = 3;
if (argc >= 5 && strcmp(argv[3], "-l") == 0) {
lang = argv[4];
arg = 5;
}
// Find the basename of the input file.
STRING infile(argv[1]);
const char* lastdot = strrchr(argv[1], '.');
if (lastdot != NULL) {
infile[lastdot - argv[1]] = '\0';
}
if (argc == 3)
TessBaseAPI::InitWithLanguage(argv[0], argv[1], NULL,
NULL, false, 0, argv + 2);
if (argc == arg)
TessBaseAPI::InitWithLanguage(argv[0], infile.string(), lang,
NULL, false, 0, argv + arg);
else
TessBaseAPI::InitWithLanguage(argv[0], argv[1], NULL,
argv[3], false, argc - 4, argv + 4);
TessBaseAPI::InitWithLanguage(argv[0], infile.string(), lang,
argv[arg], false,
argc - arg - 1, argv + arg + 1);
tprintf ("Tesseract Open Source OCR Engine\n");
@ -92,20 +112,70 @@ int main(int argc, char **argv) {
argv[1]);
}
#endif
STRING text_out;
int bytes_per_line = check_legal_image_size(image.get_xsize(),
image.get_ysize(),
image.get_bpp());
char* text = TessBaseAPI::TesseractRect(image.get_buffer(), image.get_bpp()/8,
bytes_per_line, 0, 0,
image.get_xsize(), image.get_ysize());
if (tessedit_serial_unlv == 0) {
TessBaseAPI::SetInputName(argv[1]);
char* text;
if (tessedit_create_boxfile)
text = TessBaseAPI::TesseractRectBoxes(image.get_buffer(),
image.get_bpp()/8,
bytes_per_line, 0, 0,
image.get_xsize(),
image.get_ysize(),
image.get_ysize());
else if (tessedit_write_unlv)
text = TessBaseAPI::TesseractRectUNLV(image.get_buffer(),
image.get_bpp()/8,
bytes_per_line, 0, 0,
image.get_xsize(),
image.get_ysize());
else
text = TessBaseAPI::TesseractRect(image.get_buffer(), image.get_bpp()/8,
bytes_per_line, 0, 0,
image.get_xsize(), image.get_ysize());
text_out = text;
delete [] text;
} else {
BLOCK_LIST blocks;
STRING filename = argv[1];
int len = filename.length();
if (len > 4 && filename[len - 4] == '.') {
filename[len - 4] = '\0';
}
if (!read_unlv_file(filename, image.get_xsize(), image.get_ysize(),
&blocks)) {
fprintf(stderr, "Error: Must have a unlv zone file %s to read!\n",
filename.string());
return 1;
}
BLOCK_IT b_it = &blocks;
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
BLOCK* block = b_it.data();
BOX box = block->bounding_box();
char* text = TessBaseAPI::TesseractRectUNLV(image.get_buffer(),
image.get_bpp()/8,
bytes_per_line,
box.left(),
image.get_ysize() - box.top(),
box.width(),
box.height());
text_out += text;
delete [] text;
if (tessedit_serial_unlv == 1)
TessBaseAPI::ClearAdaptiveClassifier();
}
}
outfile = argv[2];
outfile += ".txt";
FILE* fp = fopen(outfile.string(), "w");
if (fp != NULL) {
fwrite(text, 1, strlen(text), fp);
fwrite(text_out.string(), 1, text_out.length(), fp);
fclose(fp);
}
delete [] text;
TessBaseAPI::End();
return 0; //Normal exit

View File

@ -527,7 +527,9 @@ BOOL8 read_unlv_file( //print list of sides
else {
while (fscanf (pdfp, "%d %d %d %d %*s", &x, &y, &width, &height) >= 4) {
//make rect block
block = new BLOCK (name.string (), TRUE, 0, 0, (INT16) x, (INT16) (ysize - 1 - y - height), (INT16) (x + width), (INT16) (ysize - 1 - y));
block = new BLOCK (name.string (), TRUE, 0, 0,
(INT16) x, (INT16) (ysize - y - height),
(INT16) (x + width), (INT16) (ysize - y));
//on end of list
block_it.add_to_end (block);
}

View File

@ -63,7 +63,7 @@ make_toggle_var (debug_8, 0, make_debug_8, 6, 8, toggle_debug_8, "Debug #8");
make_toggle_var (display_ratings, 0, make_display_ratings,
6, 9, toggle_ratings, "Ratings display");
make_toggle_var (display_text, 1, make_display_text,
make_toggle_var (display_text, 0, make_display_text,
6, 10, toggle_text, "Display Text");
make_toggle_var (show_bold, 1, make_show_bold,

1
tessdata/configs/makebox Normal file
View File

@ -0,0 +1 @@
# Turn on tessedit_create_boxfile ("Output text with boxes"), which makes
# the main program call TessBaseAPI::TesseractRectBoxes for its output.
tessedit_create_boxfile 1

3
tessdata/configs/unlv Normal file
View File

@ -0,0 +1,3 @@
# Select UNLV-style output; the main program then calls
# TessBaseAPI::TesseractRectUNLV for its output.
# tessedit_write_unlv:    "Write .unlv output file"
tessedit_write_unlv 1
# tessedit_write_output:  "Write text to name.txt"
tessedit_write_output 0
# tessedit_write_txt_map: "Write .txt to .etx map file"
tessedit_write_txt_map 0

View File

@ -1,78 +1,2 @@
#################################################
# Adaptive Matcher Using PreAdapted Templates
#################################################
acts_fx 0x800
acts_ocr 0x20
RatingScale 30.0
CertaintyScale 20.0
#EnableMatcher 0
#CurrentFx 2
MinSlope 0.414213562
MaxSlope 2.414213562
#ExtremityMode 1
NormMethod 1
EnableAdaptiveMatcher 1
NormAdjMidpoint 32.0
NormAdjCurl 2.0
MinNormScaleX 0.0
MaxNormScaleX 0.325
MinNormScaleY 0.0
MaxNormScaleY 0.325
BuiltInTemplatesFile tessdata/inttemp
BuiltInCutoffsFile tessdata/pffmtable
EnableLearning 0
SaveAdaptedTemplates 0
UsePreAdaptedTemplates 0
ReliableConfigThreshold 2
MinNumPermClasses 3
#EnableStopper 1
GoodAdaptiveMatch 0.125
GreatAdaptiveMatch 0.0
EnableIntFX 1
EnableNewAdaptRules 1
################################################################################
#
# File: marks/configs/knobs
# Description: Control variables for 'marks' code
# Author: Mark Seaman, OCR Technology
# Created: Wed Feb 27 11:27:27 1991
# Modified: Tue Jul 30 16:25:37 1991 (Mark Seaman) marks@hpgrlt
# Language: Text
# Package: N/A
# Status: Experimental (Do Not Distribute)
#
# (c) Copyright 1991, Hewlett-Packard Company, all rights reserved.
#
################################################################################
#hidden_edges 1
save_doc_words 1
doc_dict_enable 1
ClassPrunerThreshold 229
ClassPrunerMultiplier 15
IntThetaFudge 128
CPCutoffStrength 0.15
EvidenceTableBits 9
IntEvidenceTruncBits 14
SEExponentialMultiplier 0
SimilarityCenter 0.0075
#################################################
# Adaptive Matcher Using 2 Passes
#################################################
EnableLearning 1
SaveAdaptedTemplates 0
UsePreAdaptedTemplates 0
#save_errors 0
# No content needed as all defaults are correct.

View File

@ -0,0 +1,2 @@
chop_enable 0
enable_assoc 0

View File

@ -2,80 +2,6 @@
# Adaptive Matcher Using PreAdapted Templates
#################################################
acts_fx 0x800
acts_ocr 0x20
RatingScale 30.0
CertaintyScale 20.0
#EnableMatcher 0
#CurrentFx 2
EnableAdaptiveMatcher 1
NormAdjMidpoint 32.0
NormAdjCurl 2.0
MinNormScaleX 0.0
MaxNormScaleX 0.325
MinNormScaleY 0.0
MaxNormScaleY 0.325
BuiltInTemplatesFile tessdata/inttemp
BuiltInCutoffsFile tessdata/pffmtable
EnableLearning 0
SaveAdaptedTemplates 0
UsePreAdaptedTemplates 0
ReliableConfigThreshold 2
MinNumPermClasses 3
#EnableStopper 1
GoodAdaptiveMatch 0.125
GreatAdaptiveMatch 0.0
EnableIntFX 1
EnableNewAdaptRules 1
EnableAdaptiveDebugger 1
MatchDebugFlags 6
MatcherDebugLevel 1
################################################################################
#
# File: marks/configs/knobs
# Description: Control variables for 'marks' code
# Author: Mark Seaman, OCR Technology
# Created: Wed Feb 27 11:27:27 1991
# Modified: Tue Jul 30 16:25:37 1991 (Mark Seaman) marks@hpgrlt
# Language: Text
# Package: N/A
# Status: Experimental (Do Not Distribute)
#
# (c) Copyright 1991, Hewlett-Packard Company, all rights reserved.
#
################################################################################
#hidden_edges 1
save_doc_words 1
doc_dict_enable 1
ClassPrunerThreshold 229
ClassPrunerMultiplier 15
IntThetaFudge 128
CPCutoffStrength 0.15
EvidenceTableBits 9
IntEvidenceTruncBits 14
SEExponentialMultiplier 0
SimilarityCenter 0.0075
#################################################
# Adaptive Matcher Using 2 Passes
#################################################
display_splits 0
display_all_words 0
display_all_blobs 0
display_segmentations 0
EnableLearning 1
SaveAdaptedTemplates 0
UsePreAdaptedTemplates 0
#save_errors 0

View File

@ -0,0 +1,13 @@
#################################################
# Adaptive Matcher Using PreAdapted Templates
#################################################
EnableAdaptiveDebugger 1
MatchDebugFlags 6
MatcherDebugLevel 1
display_splits 0
display_all_words 1
display_all_blobs 1
display_segmentations 2
display_ratings 1

View File

@ -0,0 +1,2 @@
# Switch off the display_text toggle variable ("Display Text").
display_text 0

View File

@ -2,70 +2,6 @@
# Adaptive Matcher Using PreAdapted Templates
#################################################
acts_fx 0x800
acts_ocr 0x20
RatingScale 30.0
CertaintyScale 20.0
#EnableMatcher 0
#CurrentFx 2
EnableAdaptiveMatcher 1
NormAdjMidpoint 32.0
NormAdjCurl 2.0
MinNormScaleX 0.0
MaxNormScaleX 0.325
MinNormScaleY 0.0
MaxNormScaleY 0.325
BuiltInTemplatesFile tessdata/inttemp
BuiltInCutoffsFile tessdata/pffmtable
EnableLearning 0
SaveAdaptedTemplates 0
UsePreAdaptedTemplates 0
ReliableConfigThreshold 2
MinNumPermClasses 3
#EnableStopper 1
GoodAdaptiveMatch 0.125
GreatAdaptiveMatch 0.0
EnableIntFX 1
EnableNewAdaptRules 1
################################################################################
#
# File: marks/configs/knobs
# Description: Control variables for 'marks' code
# Author: Mark Seaman, OCR Technology
# Created: Wed Feb 27 11:27:27 1991
# Modified: Tue Jul 30 16:25:37 1991 (Mark Seaman) marks@hpgrlt
# Language: Text
# Package: N/A
# Status: Experimental (Do Not Distribute)
#
# (c) Copyright 1991, Hewlett-Packard Company, all rights reserved.
#
################################################################################
#hidden_edges 1
save_doc_words 1
doc_dict_enable 1
ClassPrunerThreshold 229
ClassPrunerMultiplier 15
IntThetaFudge 128
CPCutoffStrength 0.15
EvidenceTableBits 9
IntEvidenceTruncBits 14
SEExponentialMultiplier 0
SimilarityCenter 0.0075
#################################################
# Adaptive Matcher Using 2 Passes
#################################################
display_splits 0
display_all_words 1
display_all_blobs 1

185
testing/Makefile Normal file
View File

@ -0,0 +1,185 @@
# Makefile.in generated automatically by automake 1.4-p6 from Makefile.am
# Copyright (C) 1994, 1995-8, 1999, 2001 Free Software Foundation, Inc.
# This Makefile.in is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE.
SHELL = /bin/sh
srcdir = .
top_srcdir = ..
prefix = /usr/local
exec_prefix = ${prefix}
bindir = ${exec_prefix}/bin
sbindir = ${exec_prefix}/sbin
libexecdir = ${exec_prefix}/libexec
datadir = ${prefix}/share
sysconfdir = ${prefix}/etc
sharedstatedir = ${prefix}/com
localstatedir = ${prefix}/var
libdir = ${exec_prefix}/lib
infodir = ${prefix}/info
mandir = ${prefix}/man
includedir = ${prefix}/include/tesseract
oldincludedir = /usr/include
DESTDIR =
pkgdatadir = $(datadir)/
pkglibdir = $(libdir)/
pkgincludedir = $(includedir)/
top_builddir = ..
ACLOCAL = aclocal-1.4
AUTOCONF = autoconf
AUTOMAKE = automake-1.4
AUTOHEADER = autoheader
INSTALL = /usr/bin/install -c
INSTALL_PROGRAM = ${INSTALL} $(AM_INSTALL_PROGRAM_FLAGS)
INSTALL_DATA = ${INSTALL} -m 644
INSTALL_SCRIPT = ${INSTALL}
transform = s,x,x,
NORMAL_INSTALL = :
PRE_INSTALL = :
POST_INSTALL = :
NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
host_alias =
host_triplet = x86_64-unknown-linux-gnu
CC = gcc
CXX = g++
HAVE_LIB = @HAVE_LIB@
LIB = @LIB@
LTLIB = @LTLIB@
MAINT = #
MAKEINFO = /home/rays/src/opensrc/tesseract-ocr/config/missing makeinfo
PACKAGE =
PACKAGE_DATE = 07/2007
PACKAGE_NAME = tesseract
PACKAGE_VERSION = 2.00
PACKAGE_YEAR = 2007
RANLIB = ranlib
VERSION =
EXTRA_DIST = README counttestset.sh reorgdata.sh runalltests.sh runtestset.sh reports/1995.bus.3B.sum reports/1995.doe3.3B.sum reports/1995.mag.3B.sum reports/1995.news.3B.sum
mkinstalldirs = $(SHELL) $(top_srcdir)/config/mkinstalldirs
CONFIG_HEADER = ../config_auto.h
CONFIG_CLEAN_FILES =
DIST_COMMON = README Makefile.am Makefile.in
DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST)
TAR = tar
GZIP_ENV = --best
all: all-redirect
.SUFFIXES:
$(srcdir)/Makefile.in: # Makefile.am $(top_srcdir)/configure.ac $(ACLOCAL_M4)
cd $(top_srcdir) && $(AUTOMAKE) --gnu testing/Makefile
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status $(BUILT_SOURCES)
cd $(top_builddir) \
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
tags: TAGS
TAGS:
distdir = $(top_builddir)/$(PACKAGE_NAME)-$(PACKAGE_VERSION)/$(subdir)
subdir = testing
distdir: $(DISTFILES)
here=`cd $(top_builddir) && pwd`; \
top_distdir=`cd $(top_distdir) && pwd`; \
distdir=`cd $(distdir) && pwd`; \
cd $(top_srcdir) \
&& $(AUTOMAKE) --include-deps --build-dir=$$here --srcdir-name=$(top_srcdir) --output-dir=$$top_distdir --gnu testing/Makefile
$(mkinstalldirs) $(distdir)/reports
@for file in $(DISTFILES); do \
d=$(srcdir); \
if test -d $$d/$$file; then \
cp -pr $$d/$$file $(distdir)/$$file; \
else \
test -f $(distdir)/$$file \
|| ln $$d/$$file $(distdir)/$$file 2> /dev/null \
|| cp -p $$d/$$file $(distdir)/$$file || :; \
fi; \
done
info-am:
info: info-am
dvi-am:
dvi: dvi-am
check-am: all-am
check: check-am
installcheck-am:
installcheck: installcheck-am
install-exec-am:
install-exec: install-exec-am
install-data-am:
install-data: install-data-am
install-am: all-am
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
install: install-am
uninstall-am:
uninstall: uninstall-am
all-am: Makefile
all-redirect: all-am
install-strip:
$(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install
installdirs:
mostlyclean-generic:
clean-generic:
distclean-generic:
-rm -f Makefile $(CONFIG_CLEAN_FILES)
-rm -f config.cache config.log stamp-h stamp-h[0-9]*
maintainer-clean-generic:
mostlyclean-am: mostlyclean-generic
mostlyclean: mostlyclean-am
clean-am: clean-generic mostlyclean-am
clean: clean-am
distclean-am: distclean-generic clean-am
distclean: distclean-am
maintainer-clean-am: maintainer-clean-generic distclean-am
@echo "This command is intended for maintainers to use;"
@echo "it deletes files that may require special tools to rebuild."
maintainer-clean: maintainer-clean-am
.PHONY: tags distdir info-am info dvi-am dvi check check-am \
installcheck-am installcheck install-exec-am install-exec \
install-data-am install-data install-am install uninstall-am uninstall \
all-redirect all-am all installdirs mostlyclean-generic \
distclean-generic clean-generic maintainer-clean-generic clean \
mostlyclean distclean maintainer-clean
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
.NOEXPORT:

2
testing/Makefile.am Normal file
View File

@ -0,0 +1,2 @@
# Files to include in the distribution: the README, the four test scripts
# and the 1995 reference summaries for the bus, doe3, mag and news sets.
EXTRA_DIST = README counttestset.sh reorgdata.sh runalltests.sh runtestset.sh reports/1995.bus.3B.sum reports/1995.doe3.3B.sum reports/1995.mag.3B.sum reports/1995.news.3B.sum

185
testing/Makefile.in Normal file
View File

@ -0,0 +1,185 @@
# Makefile.in generated automatically by automake 1.4-p6 from Makefile.am
# Copyright (C) 1994, 1995-8, 1999, 2001 Free Software Foundation, Inc.
# This Makefile.in is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE.
# Template variables. Each @NAME@ placeholder is filled in when config.status
# generates the real Makefile from this file (see the "Makefile" rule).
SHELL = @SHELL@
srcdir = @srcdir@
top_srcdir = @top_srcdir@
VPATH = @srcdir@
# Installation directory layout.
prefix = @prefix@
exec_prefix = @exec_prefix@
bindir = @bindir@
sbindir = @sbindir@
libexecdir = @libexecdir@
datadir = @datadir@
sysconfdir = @sysconfdir@
sharedstatedir = @sharedstatedir@
localstatedir = @localstatedir@
libdir = @libdir@
infodir = @infodir@
mandir = @mandir@
includedir = @includedir@
oldincludedir = /usr/include
DESTDIR =
# Package-specific subdirectories of the data, lib and include directories.
pkgdatadir = $(datadir)/@PACKAGE@
pkglibdir = $(libdir)/@PACKAGE@
pkgincludedir = $(includedir)/@PACKAGE@
# This directory sits one level below the top of the build tree.
top_builddir = ..
# Autotools programs used by the regeneration rules.
ACLOCAL = @ACLOCAL@
AUTOCONF = @AUTOCONF@
AUTOMAKE = @AUTOMAKE@
AUTOHEADER = @AUTOHEADER@
# Install commands; install-strip passes -s through AM_INSTALL_PROGRAM_FLAGS.
INSTALL = @INSTALL@
INSTALL_PROGRAM = @INSTALL_PROGRAM@ $(AM_INSTALL_PROGRAM_FLAGS)
INSTALL_DATA = @INSTALL_DATA@
INSTALL_SCRIPT = @INSTALL_SCRIPT@
transform = @program_transform_name@
# Install/uninstall phase hooks; ':' is the shell no-op command.
NORMAL_INSTALL = :
PRE_INSTALL = :
POST_INSTALL = :
NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
host_alias = @host_alias@
host_triplet = @host@
# Toolchain and package identity as detected by configure.
CC = @CC@
CXX = @CXX@
HAVE_LIB = @HAVE_LIB@
LIB = @LIB@
LTLIB = @LTLIB@
MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
PACKAGE = @PACKAGE@
PACKAGE_DATE = @PACKAGE_DATE@
PACKAGE_NAME = @PACKAGE_NAME@
PACKAGE_VERSION = @PACKAGE_VERSION@
PACKAGE_YEAR = @PACKAGE_YEAR@
RANLIB = @RANLIB@
VERSION = @VERSION@
# Copied from Makefile.am: the README, test scripts and 1995 reference reports.
EXTRA_DIST = README counttestset.sh reorgdata.sh runalltests.sh runtestset.sh reports/1995.bus.3B.sum reports/1995.doe3.3B.sum reports/1995.mag.3B.sum reports/1995.news.3B.sum
mkinstalldirs = $(SHELL) $(top_srcdir)/config/mkinstalldirs
CONFIG_HEADER = ../config_auto.h
CONFIG_CLEAN_FILES =
# DISTFILES is everything the distdir rule copies. README appears in both
# DIST_COMMON and EXTRA_DIST, so it is listed twice in DISTFILES.
DIST_COMMON = README Makefile.am Makefile.in
DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST)
TAR = tar
GZIP_ENV = --best
# Default goal; all-redirect in turn depends on all-am.
all: all-redirect
# Empty suffix list: no suffix rules are used in this directory.
.SUFFIXES:
# Regenerate Makefile.in by running automake from the top source directory
# when Makefile.am, configure.ac or the aclocal macros are newer.
$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/configure.ac $(ACLOCAL_M4)
cd $(top_srcdir) && $(AUTOMAKE) --gnu testing/Makefile
# Regenerate this Makefile by running config.status from the top build
# directory for this one file only (CONFIG_HEADERS is set empty).
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status $(BUILT_SOURCES)
cd $(top_builddir) \
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
# No tags are generated here; TAGS is an empty target.
tags: TAGS
TAGS:
# distdir refers to subdir, which is assigned on the following line.
distdir = $(top_builddir)/$(PACKAGE_NAME)-$(PACKAGE_VERSION)/$(subdir)
subdir = testing
# Populate the distribution directory:
#  1. resolve the build, top dist and dist directories to absolute paths and
#     re-run automake with --include-deps, writing into the top dist dir;
#  2. create the reports subdirectory, which the reports/... entries need;
#  3. for each entry in DISTFILES, copy directories recursively; for files,
#     keep an existing copy, else hard-link, else fall back to cp -p
#     (a failure is ignored via "|| :").
# NOTE(review): top_distdir is not assigned in the visible part of this
# file; presumably it is supplied by the invoking make - confirm.
distdir: $(DISTFILES)
here=`cd $(top_builddir) && pwd`; \
top_distdir=`cd $(top_distdir) && pwd`; \
distdir=`cd $(distdir) && pwd`; \
cd $(top_srcdir) \
&& $(AUTOMAKE) --include-deps --build-dir=$$here --srcdir-name=$(top_srcdir) --output-dir=$$top_distdir --gnu testing/Makefile
$(mkinstalldirs) $(distdir)/reports
@for file in $(DISTFILES); do \
d=$(srcdir); \
if test -d $$d/$$file; then \
cp -pr $$d/$$file $(distdir)/$$file; \
else \
test -f $(distdir)/$$file \
|| ln $$d/$$file $(distdir)/$$file 2> /dev/null \
|| cp -p $$d/$$file $(distdir)/$$file || :; \
fi; \
done
# Documentation targets: they have no prerequisites and no recipe here.
info-am:
info: info-am
dvi-am:
dvi: dvi-am
# "check" only requires that all-am is up to date; there is no test recipe.
check-am: all-am
check: check-am
installcheck-am:
installcheck: installcheck-am
# Both install halves are empty, so "make install" installs nothing from
# the testing directory.
install-exec-am:
install-exec: install-exec-am
install-data-am:
install-data: install-data-am
# install-am re-invokes make for the exec and data halves.
install-am: all-am
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
install: install-am
uninstall-am:
uninstall: uninstall-am
# Building "all" here only means keeping the Makefile itself up to date.
all-am: Makefile
all-redirect: all-am
# Runs a normal install with AM_INSTALL_PROGRAM_FLAGS set to -s.
install-strip:
$(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install
installdirs:
mostlyclean-generic:
clean-generic:
# Removes the generated Makefile and configure by-products; the leading '-'
# makes make ignore a failing rm.
distclean-generic:
-rm -f Makefile $(CONFIG_CLEAN_FILES)
-rm -f config.cache config.log stamp-h stamp-h[0-9]*
maintainer-clean-generic:
# Clean targets chain from the least to the most thorough level:
# mostlyclean -> clean -> distclean -> maintainer-clean.
mostlyclean-am: mostlyclean-generic
mostlyclean: mostlyclean-am
clean-am: clean-generic mostlyclean-am
clean: clean-am
distclean-am: distclean-generic clean-am
distclean: distclean-am
maintainer-clean-am: maintainer-clean-generic distclean-am
@echo "This command is intended for maintainers to use;"
@echo "it deletes files that may require special tools to rebuild."
maintainer-clean: maintainer-clean-am
# None of the targets listed below names a real file.
.PHONY: tags distdir info-am info dvi-am dvi check check-am \
installcheck-am installcheck install-exec-am install-exec \
install-data-am install-data install-am install uninstall-am uninstall \
all-redirect all-am all installdirs mostlyclean-generic \
distclean-generic clean-generic maintainer-clean-generic clean \
mostlyclean distclean maintainer-clean
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
.NOEXPORT:

43
testing/README Normal file
View File

@ -0,0 +1,43 @@
How to run UNLV tests.
The scripts in this directory make it possible to duplicate the tests
published in the Fourth Annual Test of OCR Accuracy.
See http://www.isri.unlv.edu/downloads/AT-1995.pdf
but first you have to get the tools and data from UNLV:
Step 1: To download the images, go to
http://www.isri.unlv.edu/ISRI/OCRtk
and get 3b.tgz, Bb.tgz, Mb.tgz and Nb.tgz.
Step 2: extract the files. It doesn't really matter where
in your filesystem you put them, but they must go under a common
root so you have directories 3, B, M and N in, for example,
/users/me/ISRI-OCRtk.
Step 3: Reorganize the files.
The lack of tif extensions on the images is inconvenient, so there
is a script to reorganize the data to match the rest of the test
scripts.
cd to /users/me/ISRI-OCRtk or wherever 3, B, M and N ended up and run
/blah/blah/tesseract-ocr/testing/reorgdata.sh 3B
This makes directories doe3.3B, bus.3B, mag.3B and news.3B.
You can now get rid of 3, B, M, and N unless you want to get some of the
other scanning resolutions out of them.
Step 4: Download the ISRI toolkit from:
http://www.isri.unlv.edu/downloads/ftk-1.0.tgz
Step 5: If the prebuilt binaries in the toolkit's bin directory work for
you, copy them into tesseract-ocr/testing/unlv; otherwise build the tools
yourself and put the resulting binaries there.
Step 6: cd back to your main tesseract-ocr dir and build tesseract.
Step 7: run testing/runalltests.sh with the root data dir and testname:
testing/runalltests.sh /users/me/ISRI-OCRtk tess2.0
and go to the gym, have lunch etc.
Step 8: There should be a file
testing/reports/tess2.0.summary that contains the final summarized accuracy
report and comparison with the 1995 results.

61
testing/counttestset.sh Executable file
View File

@ -0,0 +1,61 @@
#!/bin/bash
# File: counttestset.sh
# Description: Script to count the errors on a single UNLV set.
# Author: Ray Smith
# Created: Wed Jun 13 11:58:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Usage: counttestset.sh pagesfile
#
# pagesfile is <set-dir>/pages. Each line names a page, optionally followed
# by a subdirectory of <set-dir> that holds it. For every page the ground
# truth <page>.txt is compared with the OCR result in
# testing/results/<setname>/<page>.txt, and the per-page reports are then
# summed into testing/reports/<setname>.characc and <setname>.wordacc.
if [ $# -ne 1 ]
then
    echo "Usage:$0 pagesfile"
    exit 1
fi
if [ ! -d ccmain ]
then
    echo "Run $0 from the tesseract-ocr root directory!"
    exit 1
fi
# Accept accuracy.exe as well, the same condition runalltests.sh checks.
if [ ! -r testing/unlv/accuracy ] && [ ! -r testing/unlv/accuracy.exe ]
then
    echo "Please download the UNLV accuracy tools (and build) to testing/unlv"
    exit 1
fi

pages=$1
# Strip the trailing /pages to get the set directory, then take its last
# path component as the set name.
imdir=${pages%/pages}
setname=${imdir##*/}
resdir=testing/results/$setname
mkdir -p testing/reports
echo "Counting on set $setname in directory $imdir to $resdir"

# Arrays (rather than a space-joined string) keep report paths that contain
# whitespace intact when they are handed to the summing tools.
accfiles=()
wafiles=()
# "|| [ -n "$page" ]" keeps a final line that has no trailing newline.
while read -r page dir || [ -n "$page" ]
do
    # Ignore blank lines in the pages file.
    [ -n "$page" ] || continue
    if [ -n "$dir" ]
    then
        srcdir="$imdir/$dir"
    else
        srcdir="$imdir"
    fi
    # Count character errors.
    testing/unlv/accuracy "$srcdir/$page.txt" "$resdir/$page.txt" "$resdir/$page.acc"
    accfiles+=("$resdir/$page.acc")
    # Count word errors.
    testing/unlv/wordacc "$srcdir/$page.txt" "$resdir/$page.txt" "$resdir/$page.wa"
    wafiles+=("$resdir/$page.wa")
done <"$pages"

testing/unlv/accsum "${accfiles[@]}" >"testing/reports/$setname.characc"
testing/unlv/wordaccsum "${wafiles[@]}" >"testing/reports/$setname.wordacc"

44
testing/reorgdata.sh Executable file
View File

@ -0,0 +1,44 @@
#!/bin/bash
# Usage: reorgdata.sh scantype
#
# Run from the directory that holds the downloaded UNLV set directories
# (3, B, M, N, ...). For every set whose PAGES file is readable this creates
# <newname>.<scantype> containing a pages list plus, per page, the image
# (.tif), the zone file (.uzn) and the concatenated ground truth (.txt).
if [ $# -ne 1 ]
then
    echo "Usage:$0 scantype"
    echo "UNLV data comes in several scan types:"
    echo "3B=300 dpi binary"
    echo "3A=adaptive thresholded 300 dpi"
    echo "3G=300 dpi grey"
    echo "4B=400dpi binary"
    echo "2B=200dpi binary"
    echo "For now we only use 3B"
    exit 1
fi
ext=$1

#There are several test sets without meaningful names, so rename
#them with something a bit more meaningful.
#Each s is oldname/newname
for s in 3/doe3 B/bus M/mag N/news L/legal R/rep S/spn Z/zset
do
    old=${s%/*}
    #if this set was downloaded then process it.
    if [ -r "$old/PAGES" ]
    then
        new=${s#*/}.$ext
        mkdir -p "$new"
        echo "Set $old -> $new"
        #The pages file had - instead of _ so fix it and add the extension.
        for page in $(cat "$old/PAGES")
        do
            echo "${page%-*}_${page#*-}.$ext"
        done >"$new/pages"
        # NOTE(review): the image subdirectory suffix _B is hard-coded and
        # does not follow the scantype argument - confirm this is intended
        # for scan types other than 3B.
        for f in $(cat "$new/pages")
        do
            #Put a tif extension on the tif files.
            cp "$old/${old}_B/$f" "$new/$f.tif"
            #Put a uzn extension on the zone files.
            cp "$old/${old}_B/${f}Z" "$new/$f.uzn"
            #Cat all the truth files together and put into a single txt file.
            #The .Z* glob is deliberately left outside the quotes.
            cat "$old/${old}_GT/${f%."$ext"}".Z* >"$new/$f.txt"
        done
    fi
done

View File

@ -0,0 +1 @@
1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00%

View File

@ -0,0 +1 @@
1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00%

View File

@ -0,0 +1 @@
1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00%

View File

@ -0,0 +1 @@
1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00%

110
testing/runalltests.sh Executable file
View File

@ -0,0 +1,110 @@
#!/bin/bash
# File: runalltests.sh
# Description: Script to run a set of UNLV test sets.
# Author: Ray Smith
# Created: Thu Jun 14 08:21:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Preconditions: exactly two arguments, run from the source root, and both
# tesseract and the UNLV accuracy tool present (native or .exe build).
[ $# -eq 2 ] || {
    echo "Usage:$0 unlv-data-dir version-id"
    exit 1
}
[ -d ccmain ] || {
    echo "Run $0 from the tesseract-ocr root directory!"
    exit 1
}
if ! [ -r ccmain/tesseract ] && ! [ -r tesseract.exe ]
then
    echo "Please build tesseract before running $0"
    exit 1
fi
if ! [ -r testing/unlv/accuracy ] && ! [ -r testing/unlv/accuracy.exe ]
then
    echo "Please download the UNLV accuracy tools (and build) to testing/unlv"
    exit 1
fi
#deltapc new old calculates the %change from old to new
# Prints 100*(new-old)/old with two decimals and no trailing newline.
# The values are passed with awk -v instead of being spliced into the
# program text, so an empty argument is treated as 0 rather than producing
# an awk syntax error. When old is zero or empty the change is undefined;
# "0.00" is printed instead of letting awk abort on a division by zero.
deltapc() {
    awk -v new="$1" -v old="$2" 'BEGIN {
        if (old + 0 == 0) {
            printf("%.2f", 0);
        } else {
            printf("%.2f", 100.0 * (new - old) / old);
        }
    }'
}
# Main flow: for every known test set found under the data directory, run
# tesseract on it, count the errors, and write one summary line comparing
# the result with the 1995 reference; finally write a grand-total line and
# concatenate everything into $rdir/$vid.summary.
imdir="$1"
vid="$2"
# The helper scripts are expected next to this script.
bindir=${0%/*}
if [ "$bindir" = "$0" ]
then
    bindir="./"
fi
rdir=testing/reports

testsets="bus.3B doe3.3B mag.3B news.3B"

totalerrs=0
totalwerrs=0
totalnswerrs=0
totalolderrs=0
totaloldwerrs=0
totaloldnswerrs=0
setsrun=0

for set in $testsets
do
    if [ -r "$imdir/$set/pages" ]
    then
        # Run tesseract on all the pages.
        "$bindir/runtestset.sh" "$imdir/$set/pages"
        # Count the errors on all the pages.
        "$bindir/counttestset.sh" "$imdir/$set/pages"
        # Get the old character word and nonstop word errors.
        # The reference file is tab-delimited: fields 3, 6 and 9.
        oldsum="$rdir/1995.$set.sum"
        olderrs=$(cut -f3 "$oldsum")
        oldwerrs=$(cut -f6 "$oldsum")
        oldnswerrs=$(cut -f9 "$oldsum")
        # Get the new character word and nonstop word errors and accuracy.
        # These are picked out of the reports by line number and column.
        characc="$rdir/$set.characc"
        wordacc="$rdir/$set.wordacc"
        cherrs=$(head -4 "$characc" | tail -1 | cut -c1-9 | tr -d '[:blank:]')
        chacc=$(head -5 "$characc" | tail -1 | cut -c1-9 | tr -d '[:blank:]')
        wderrs=$(head -4 "$wordacc" | tail -1 | cut -c1-9 | tr -d '[:blank:]')
        wdacc=$(head -5 "$wordacc" | tail -1 | cut -c1-9 | tr -d '[:blank:]')
        # The second line containing "Total" holds the non-stopword figures.
        nswderrs=$(grep Total "$wordacc" | head -2 | tail -1 |
            cut -c10-17 | tr -d '[:blank:]')
        nswdacc=$(grep Total "$wordacc" | head -2 | tail -1 |
            cut -c19-26 | tr -d '[:blank:]')
        # Compute the percent change.
        chdelta=$(deltapc "$cherrs" "$olderrs")
        wdelta=$(deltapc "$wderrs" "$oldwerrs")
        nswdelta=$(deltapc "$nswderrs" "$oldnswerrs")
        sumfile=$rdir/$vid.$set.sum
        # Fields are tab-separated, the layout the 1995 reference .sum files
        # are read with above (cut -f).
        printf '%s\t%s\t%s\t%s\t%s%%\t%s\t%s\t%s%%\t%s\t%s\t%s%%\n' \
            "$vid" "$set" "$cherrs" "$chacc" "$chdelta" \
            "$wderrs" "$wdacc" "$wdelta" \
            "$nswderrs" "$nswdacc" "$nswdelta" >"$sumfile"
        # Sum totals over all the testsets.
        totalerrs=$((totalerrs + cherrs))
        totalwerrs=$((totalwerrs + wderrs))
        totalnswerrs=$((totalnswerrs + nswderrs))
        totalolderrs=$((totalolderrs + olderrs))
        totaloldwerrs=$((totaloldwerrs + oldwerrs))
        totaloldnswerrs=$((totaloldnswerrs + oldnswerrs))
        setsrun=$((setsrun + 1))
    fi
done

# With no sets found every total is zero and a percent change would be
# meaningless, so stop with a diagnostic instead of writing a bogus summary.
if [ "$setsrun" -eq 0 ]
then
    echo "No test sets ($testsets) found under $imdir" >&2
    exit 1
fi

# Compute grand total percent change.
chdelta=$(deltapc "$totalerrs" "$totalolderrs")
wdelta=$(deltapc "$totalwerrs" "$totaloldwerrs")
nswdelta=$(deltapc "$totalnswerrs" "$totaloldnswerrs")
tfile=$rdir/$vid.total.sum
# Same column layout as the per-set lines; "-" fills the accuracy columns.
printf '%s\tTotal\t%s\t-\t%s%%\t%s\t-\t%s%%\t%s\t-\t%s%%\n' \
    "$vid" "$totalerrs" "$chdelta" "$totalwerrs" "$wdelta" \
    "$totalnswerrs" "$nswdelta" >"$tfile"
cat "$rdir"/1995.*.sum "$rdir/$vid".*.sum >"$rdir/$vid.summary"

61
testing/runtestset.sh Executable file
View File

@ -0,0 +1,61 @@
#!/bin/bash
# File: runtestset.sh
# Description: Script to run tesseract on a single UNLV set.
# Author: Ray Smith
# Created: Wed Jun 13 10:13:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Usage: runtestset.sh pagesfile
#
# pagesfile is <set-dir>/pages. Runs tesseract on <page>.tif for every page
# listed and writes the output to testing/results/<setname>/<page>.
if [ $# -ne 1 ]
then
    echo "Usage:$0 pagesfile"
    exit 1
fi
if [ ! -d ccmain ]
then
    echo "Run $0 from the tesseract-ocr root directory!"
    exit 1
fi
# Prefer the binary built in ccmain; fall back to tesseract.exe in the
# current directory. TESSDATA_PREFIX is only set for the ccmain build.
if [ -r ccmain/tesseract ]
then
    tess="ccmain/tesseract"
    export TESSDATA_PREFIX="$PWD/"
elif [ -r tesseract.exe ]
then
    tess="./tesseract.exe"
else
    echo "Please build tesseract before running $0"
    exit 1
fi

pages=$1
# Strip the trailing /pages to get the set directory, then take its last
# path component as the set name.
imdir=${pages%/pages}
setname=${imdir##*/}
resdir=testing/results/$setname
echo "Testing on set $setname in directory $imdir to $resdir"
mkdir -p "$resdir"

# "|| [ -n "$page" ]" keeps a final line that has no trailing newline.
while read -r page dir || [ -n "$page" ]
do
    # Ignore blank lines instead of running tesseract on ".tif".
    [ -n "$page" ] || continue
    # A pages file may be a list of files with subdirs or maybe just
    # a plain list of files so accommodate both.
    if [ -n "$dir" ]
    then
        srcdir="$imdir/$dir"
    else
        srcdir="$imdir"
    fi
    "$tess" "$srcdir/$page.tif" "$resdir/$page" nobatch unlv
done <"$pages"