API/output changes to produce unlv-style latin-1 output and test scripts
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@86 d0cd1f9f-072b-0410-8dd7-cf729c803f20
commit 627368df42
parent eeaca1beba
@@ -24,20 +24,22 @@ what measures we are interested in.
 /* #define SECURE_NAMES done in secnames.h when necessary*/

 #include "mfcpch.h"
 #include "applybox.h"
 #include <ctype.h>
 #include <string.h>
 #ifdef __UNIX__
 #include <assert.h>
 #include <errno.h>
 #endif
 #include "mainblk.h"
 #include "genblob.h"
 #include "fixxht.h"
 #include "control.h"
 #include "tessbox.h"
 #include "globals.h"
 #include "secname.h"
+#include "unichar.h"
+#include "matchdefs.h"

 #define SECURE_NAMES
 #ifndef SECURE_NAMES
@@ -47,10 +49,13 @@ what measures we are interested in.
 #define EXTERN
 EXTERN BOOL_VAR (applybox_rebalance, TRUE, "Drop dead");
 EXTERN INT_VAR (applybox_debug, 0, "Debug level");
-EXTERN STRING_VAR (applybox_test_exclusions, "|",
+EXTERN STRING_VAR (applybox_test_exclusions, "",
 "Chars ignored for testing");
 EXTERN double_VAR (applybox_error_band, 0.15, "Err band as fract of xht");

+// The unicharset used during box training
+static UNICHARSET unicharset_boxes;
+
 /*************************************************************************
 * The code re-assigns outlines to form words each with ONE labelled blob.
 * Noise is left in UNLABELLED words. The chars on the page are checked crudely
@@ -89,7 +94,7 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks
 INT16 boxfile_lineno = 0;
 INT16 boxfile_charno = 0;
 BOX box; //boxfile box
-char ch[2]; //correct ch from boxfile
+UNICHAR_ID uch_id; //correct ch from boxfile
 ROW *row;
 ROW *prev_row = NULL;
 INT16 prev_box_right = MAX_INT16;
@@ -100,15 +105,20 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks
 INT16 labels_ok;
 INT16 rows_ok;
 INT16 bad_blobs;
-INT16 tgt_char_counts[128]; //No. of box samples
+INT16 tgt_char_counts[MAX_NUM_CLASSES]; //No. of box samples
 // INT16 labelled_char_counts[128]; //No. of unique labelled samples
 INT16 i;
 INT16 rebalance_count = 0;
-char min_char;
+UNICHAR_ID min_uch_id;
 INT16 min_samples;
 INT16 final_labelled_blob_count;

-for (i = 0; i < 128; i++)
+// Clean the unichar set
+unicharset_boxes.clear();
+// Space character needed to represent NIL classification
+unicharset_boxes.unichar_insert(" ");
+
+for (i = 0; i < MAX_NUM_CLASSES; i++)
 tgt_char_counts[i] = 0;

 FILE* box_file;
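The block above replaces the fixed 128-entry ASCII tables with counts indexed by UNICHAR_ID, interning every box label into unicharset_boxes. For reference, a minimal sketch of that intern/lookup round-trip (illustrative only, not part of the commit; the unicharset.h header name is an assumption):

#include "unichar.h"
#include "unicharset.h"   // assumed header for UNICHARSET

// Intern a UTF-8 label and return the stable id used to index the
// tgt_char_counts / labelled_char_counts arrays above.
static UNICHAR_ID intern_label(UNICHARSET& set, const char* utf8_label) {
  if (!set.contains_unichar(utf8_label))
    set.unichar_insert(utf8_label);        // first sighting: grow the set
  return set.unichar_to_id(utf8_label);    // id stays stable for this run
}
// set.id_to_unichar(id) recovers the UTF-8 string again for error reports.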
@@ -120,11 +130,10 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks
 filename.string(), errno);
 }

-ch[1] = '\0';
 clear_any_old_text(block_list);
-while (read_next_box (box_file, &box, &ch[0])) {
+while (read_next_box (box_file, &box, &uch_id)) {
 box_count++;
-tgt_char_counts[ch[0]]++;
+tgt_char_counts[uch_id]++;
 row = find_row_of_box (block_list, box, block_id, row_id);
 if (box.left () < prev_box_right) {
 boxfile_lineno++;
@@ -135,14 +144,16 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks

 if (row == NULL) {
 box_failures++;
-report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
+report_failed_box (boxfile_lineno, boxfile_charno, box,
+unicharset_boxes.id_to_unichar(uch_id),
 "FAILURE! box overlaps no blobs or blobs in multiple rows");
 }
 else {
 if ((box.left () >= prev_box_right) && (row != prev_row))
-report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
+report_failed_box (boxfile_lineno, boxfile_charno, box,
+unicharset_boxes.id_to_unichar(uch_id),
 "WARNING! false row break");
-box_failures += resegment_box (row, box, ch, block_id, row_id,
+box_failures += resegment_box (row, box, uch_id, block_id, row_id,
 boxfile_lineno, boxfile_charno);
 prev_row = row;
 }
@@ -154,7 +165,7 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks
 bad_blobs,
 tgt_char_counts,
 rebalance_count,
-min_char,
+&min_uch_id,
 min_samples,
 final_labelled_blob_count);
 tprintf ("APPLY_BOXES:\n");
@@ -163,7 +174,8 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks
 labels_ok, rows_ok);
 tprintf (" Box failures detected: %6d\n", box_failures);
 tprintf (" Duped blobs for rebalance:%6d\n", rebalance_count);
-tprintf (" \"%c\" has fewest samples:%6d\n", min_char, min_samples);
+tprintf (" \"%s\" has fewest samples:%6d\n",
+unicharset_boxes.id_to_unichar(min_uch_id), min_samples);
 tprintf (" Total unlabelled words: %6d\n",
 bad_blobs);
 tprintf (" Final labelled words: %6d\n",
@@ -194,7 +206,7 @@ void clear_any_old_text( //remove correct text

 BOOL8 read_next_box(FILE* box_file, //
 BOX *box,
-char *ch) {
+UNICHAR_ID *uch_id) {
 char buff[256]; //boxfile read buffer
 char *buffptr = buff;
 STRING box_filename;
@@ -204,23 +216,38 @@ BOOL8 read_next_box(FILE* box_file, //
 INT32 x_max;
 INT32 y_max;
 INT32 count = 0;
+char uch[256];

 while (!feof (box_file)) {
 fgets (buff, sizeof (buff) - 1, box_file);
 line++;

+buffptr = buff;
+const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
+if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
+buffptr += 3; // Skip unicode file designation.
 /* Check for blank lines in box file */
-for (buffptr = buff; isspace (*buffptr); buffptr++)
-;
+while (isspace (*buffptr))
+buffptr++;
 if (*buffptr != '\0') {
 count =
-sscanf (buff,
-"%c " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " "
-INT32FORMAT, ch, &x_min, &y_min, &x_max, &y_max);
+sscanf (buffptr,
+"%s " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " "
+INT32FORMAT, uch, &x_min, &y_min, &x_max, &y_max);
 if (count != 5) {
 tprintf ("Box file format error on line %i ignored\n", line);
 }
 else {
+if (!unicharset_boxes.contains_unichar(uch))
+{
+unicharset_boxes.unichar_insert(uch);
+if (unicharset_boxes.size() > MAX_NUM_CLASSES) {
+tprintf("Error: Size of unicharset of boxes is \
+greater than MAX_NUM_CLASSES\n");
+exit(1);
+}
+}
+*uch_id = unicharset_boxes.unichar_to_id(uch);
 *box = BOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max));
 return TRUE; //read a box ok
 }
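Each record handled by read_next_box above is one UTF-8 glyph followed by four integer coordinates (left, bottom, right, top). A self-contained sketch of that parse with an invented sample line (illustrative only, not part of the commit):

#include <cstdio>

int main() {
  const char* record = "m 85 66 106 91";   // invented sample box-file line
  char uch[256];
  int x_min, y_min, x_max, y_max;
  // Mirrors the patched sscanf: "%s" reads the (possibly multi-byte) glyph.
  int count = std::sscanf(record, "%s %d %d %d %d",
                          uch, &x_min, &y_min, &x_max, &y_max);
  if (count == 5)
    std::printf("glyph \"%s\" box (%d,%d)-(%d,%d)\n",
                uch, x_min, y_min, x_max, y_max);
  return 0;
}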
@@ -314,7 +341,7 @@ ROW *find_row_of_box( //
 INT16 resegment_box( //
 ROW *row,
 BOX box,
-char *ch,
+UNICHAR_ID uch_id,
 INT16 block_id,
 INT16 row_id,
 INT16 boxfile_lineno,
@@ -358,7 +385,7 @@ INT16 resegment_box( //
 if (applybox_debug > 4)
 report_failed_box (boxfile_lineno,
 boxfile_charno,
-box, ch,
+box, unicharset_boxes.id_to_unichar(uch_id),
 "FAILURE! box overlaps blob in labelled word");
 }
 if (applybox_debug > 4)
@@ -375,7 +402,7 @@ INT16 resegment_box( //
 if (new_word == NULL) {
 /* Make a new word with a single blob */
 new_word = word->shallow_copy ();
-new_word->set_text (ch);
+new_word->set_text (unicharset_boxes.id_to_unichar(uch_id));
 if (polyg)
 new_blob = new PBLOB;
 else
@@ -414,63 +441,75 @@ INT16 resegment_box( //
 word_x_centre = (new_word_box.left () + new_word_box.right ()) / 2.0f;
 baseline = row->base_line (word_x_centre);

-if (STRING (chs_caps_ht).contains (ch[0]) &&
-(new_word_box.top () <
-baseline + (1 + applybox_error_band) * row->x_height ())) {
-report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-"FAILURE! caps-ht char didn't ascend");
-new_word->set_text ("");
-return 1;
-}
-if (STRING (chs_odd_top).contains (ch[0]) &&
-(new_word_box.top () <
-baseline + (1 - applybox_error_band) * row->x_height ())) {
-report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-"FAILURE! Odd top char below xht");
-new_word->set_text ("");
-return 1;
-}
-if (STRING (chs_x_ht).contains (ch[0]) &&
-((new_word_box.top () >
-baseline + (1 + applybox_error_band) * row->x_height ()) ||
-(new_word_box.top () <
-baseline + (1 - applybox_error_band) * row->x_height ()))) {
-report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-"FAILURE! x-ht char didn't have top near xht");
-new_word->set_text ("");
-return 1;
-}
-if (STRING (chs_non_ambig_bl).contains (ch[0]) &&
-((new_word_box.bottom () <
-baseline - applybox_error_band * row->x_height ()) ||
-(new_word_box.bottom () >
-baseline + applybox_error_band * row->x_height ()))) {
-report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-"FAILURE! non ambig BL char didnt have bottom near baseline");
-new_word->set_text ("");
-return 1;
-}
-if (STRING (chs_odd_bot).contains (ch[0]) &&
-(new_word_box.bottom () >
-baseline + applybox_error_band * row->x_height ())) {
-report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-"FAILURE! Odd bottom char above baseline");
-new_word->set_text ("");
-return 1;
-}
-if (STRING (chs_desc).contains (ch[0]) &&
-(new_word_box.bottom () >
-baseline - applybox_error_band * row->x_height ())) {
-report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
+#if 0
+if (strlen(unicharset_boxes.id_to_unichar(uch_id)) == 1) {
+if (STRING (chs_caps_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+(new_word_box.top () <
+baseline + (1 + applybox_error_band) * row->x_height ())) {
+report_failed_box (boxfile_lineno, boxfile_charno, box,
+unicharset_boxes.id_to_unichar(uch_id),
+"FAILURE! caps-ht char didn't ascend");
+new_word->set_text ("");
+return 1;
+}
+if (STRING (chs_odd_top).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+(new_word_box.top () <
+baseline + (1 - applybox_error_band) * row->x_height ())) {
+report_failed_box (boxfile_lineno, boxfile_charno, box,
+unicharset_boxes.id_to_unichar(uch_id),
+"FAILURE! Odd top char below xht");
+new_word->set_text ("");
+return 1;
+}
+if (STRING (chs_x_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+((new_word_box.top () >
+baseline + (1 + applybox_error_band) * row->x_height ()) ||
+(new_word_box.top () <
+baseline + (1 - applybox_error_band) * row->x_height ()))) {
+report_failed_box (boxfile_lineno, boxfile_charno, box,
+unicharset_boxes.id_to_unichar(uch_id),
+"FAILURE! x-ht char didn't have top near xht");
+new_word->set_text ("");
+return 1;
+}
+if (STRING (chs_non_ambig_bl).contains
+(unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+((new_word_box.bottom () <
+baseline - applybox_error_band * row->x_height ()) ||
+(new_word_box.bottom () >
+baseline + applybox_error_band * row->x_height ()))) {
+report_failed_box (boxfile_lineno, boxfile_charno, box,
+unicharset_boxes.id_to_unichar(uch_id),
+"FAILURE! non ambig BL char didnt have bottom near baseline");
+new_word->set_text ("");
+return 1;
+}
+if (STRING (chs_odd_bot).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+(new_word_box.bottom () >
+baseline + applybox_error_band * row->x_height ())) {
+report_failed_box (boxfile_lineno, boxfile_charno, box,
+unicharset_boxes.id_to_unichar(uch_id),
+"FAILURE! Odd bottom char above baseline");
+new_word->set_text ("");
+return 1;
+}
+if (STRING (chs_desc).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) &&
+(new_word_box.bottom () >
+baseline - applybox_error_band * row->x_height ())) {
+report_failed_box (boxfile_lineno, boxfile_charno, box,
+unicharset_boxes.id_to_unichar(uch_id),
 "FAILURE! Descender doesn't descend");
 new_word->set_text ("");
 return 1;
+}
 }
+#endif
 return 0;
 }
 else {
-report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
-"FAILURE! Couldn't find any blobs");
+report_failed_box (boxfile_lineno, boxfile_charno, box,
+unicharset_boxes.id_to_unichar(uch_id),
+"FAILURE! Couldn't find any blobs");
 return 1;
 }
 }
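The checks disabled under #if 0 above compare each labelled box against bands derived from applybox_error_band (0.15 of the x-height by default). A worked example of those thresholds with invented numbers (illustrative only, not part of the commit):

#include <cstdio>

int main() {
  // Invented numbers: baseline at y = 100, x-height 40, band fraction 0.15.
  const float baseline = 100.0f, xht = 40.0f, band = 0.15f;
  // caps-height char passes only if its top reaches baseline + (1+band)*xht;
  // an x-height char's top must fall inside [baseline+(1-band)*xht, baseline+(1+band)*xht];
  // an unambiguous baseline char's bottom must fall inside [baseline-band*xht, baseline+band*xht].
  std::printf("caps top >= %.0f; xht top in [%.0f, %.0f]; baseline bottom in [%.0f, %.0f]\n",
              baseline + (1 + band) * xht,
              baseline + (1 - band) * xht, baseline + (1 + band) * xht,
              baseline - band * xht, baseline + band * xht);
  return 0;   // prints 146, [134, 146], [94, 106] for these numbers
}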
@ -492,7 +531,7 @@ void tidy_up( //
|
|||||||
INT16 &unlabelled_words,
|
INT16 &unlabelled_words,
|
||||||
INT16 *tgt_char_counts,
|
INT16 *tgt_char_counts,
|
||||||
INT16 &rebalance_count,
|
INT16 &rebalance_count,
|
||||||
char &min_char,
|
UNICHAR_ID *min_uch_id,
|
||||||
INT16 &min_samples,
|
INT16 &min_samples,
|
||||||
INT16 &final_labelled_blob_count) {
|
INT16 &final_labelled_blob_count) {
|
||||||
BLOCK_IT block_it(block_list);
|
BLOCK_IT block_it(block_list);
|
||||||
@ -507,16 +546,16 @@ void tidy_up( //
|
|||||||
BOOL8 row_ok;
|
BOOL8 row_ok;
|
||||||
BOOL8 rebalance_needed = FALSE;
|
BOOL8 rebalance_needed = FALSE;
|
||||||
//No. of unique labelled samples
|
//No. of unique labelled samples
|
||||||
INT16 labelled_char_counts[128];
|
INT16 labelled_char_counts[MAX_NUM_CLASSES];
|
||||||
INT16 i;
|
INT16 i;
|
||||||
char ch;
|
UNICHAR_ID uch_id;
|
||||||
char prev_ch = '\0';
|
UNICHAR_ID prev_uch_id = -1;
|
||||||
BOOL8 at_dupe_of_prev_word;
|
BOOL8 at_dupe_of_prev_word;
|
||||||
ROW *prev_row = NULL;
|
ROW *prev_row = NULL;
|
||||||
INT16 left;
|
INT16 left;
|
||||||
INT16 prev_left = -1;
|
INT16 prev_left = -1;
|
||||||
|
|
||||||
for (i = 0; i < 128; i++)
|
for (i = 0; i < MAX_NUM_CLASSES; i++)
|
||||||
labelled_char_counts[i] = 0;
|
labelled_char_counts[i] = 0;
|
||||||
|
|
||||||
ok_char_count = 0;
|
ok_char_count = 0;
|
||||||
@ -556,7 +595,7 @@ void tidy_up( //
|
|||||||
block_idx, row_idx, all_row_idx);
|
block_idx, row_idx, all_row_idx);
|
||||||
|
|
||||||
ok_char_count++;
|
ok_char_count++;
|
||||||
labelled_char_counts[*word->text ()]++;
|
labelled_char_counts[unicharset_boxes.unichar_to_id(word->text ())]++;
|
||||||
row_ok = TRUE;
|
row_ok = TRUE;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -571,24 +610,24 @@ void tidy_up( //
|
|||||||
}
|
}
|
||||||
|
|
||||||
min_samples = 9999;
|
min_samples = 9999;
|
||||||
for (i = 0; i < 128; i++) {
|
for (i = 0; i < unicharset_boxes.size(); i++) {
|
||||||
if (tgt_char_counts[i] > labelled_char_counts[i]) {
|
if (tgt_char_counts[i] > labelled_char_counts[i]) {
|
||||||
if (labelled_char_counts[i] <= 1) {
|
if (labelled_char_counts[i] <= 1) {
|
||||||
tprintf
|
tprintf
|
||||||
("APPLY_BOXES: FATALITY - %d labelled samples of \"%c\" - target is %d\n",
|
("APPLY_BOXES: FATALITY - %d labelled samples of \"%s\" - target is %d\n",
|
||||||
labelled_char_counts[i], (char) i, tgt_char_counts[i]);
|
labelled_char_counts[i], unicharset_boxes.id_to_unichar(i), tgt_char_counts[i]);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
rebalance_needed = TRUE;
|
rebalance_needed = TRUE;
|
||||||
if (applybox_debug > 0)
|
if (applybox_debug > 0)
|
||||||
tprintf
|
tprintf
|
||||||
("APPLY_BOXES: REBALANCE REQD \"%c\" - target of %d from %d labelled samples\n",
|
("APPLY_BOXES: REBALANCE REQD \"%s\" - target of %d from %d labelled samples\n",
|
||||||
(char) i, tgt_char_counts[i], labelled_char_counts[i]);
|
unicharset_boxes.id_to_unichar(i), tgt_char_counts[i], labelled_char_counts[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) {
|
if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) {
|
||||||
min_samples = labelled_char_counts[i];
|
min_samples = labelled_char_counts[i];
|
||||||
min_char = (char) i;
|
*min_uch_id = i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -605,33 +644,36 @@ void tidy_up( //
|
|||||||
!word_it.cycled_list (); word_it.forward ()) {
|
!word_it.cycled_list (); word_it.forward ()) {
|
||||||
word = word_it.data ();
|
word = word_it.data ();
|
||||||
left = word->bounding_box ().left ();
|
left = word->bounding_box ().left ();
|
||||||
ch = *word->text ();
|
if (*word->text () != '\0')
|
||||||
|
uch_id = unicharset_boxes.unichar_to_id(word->text ());
|
||||||
|
else
|
||||||
|
uch_id = -1;
|
||||||
at_dupe_of_prev_word = ((row == prev_row) &&
|
at_dupe_of_prev_word = ((row == prev_row) &&
|
||||||
(left = prev_left) &&
|
(left = prev_left) &&
|
||||||
(ch == prev_ch));
|
(uch_id == prev_uch_id));
|
||||||
if ((ch != '\0') &&
|
if ((uch_id != -1) &&
|
||||||
(labelled_char_counts[ch] > 1) &&
|
(labelled_char_counts[uch_id] > 1) &&
|
||||||
(tgt_char_counts[ch] > labelled_char_counts[ch]) &&
|
(tgt_char_counts[uch_id] > labelled_char_counts[uch_id]) &&
|
||||||
(!at_dupe_of_prev_word)) {
|
(!at_dupe_of_prev_word)) {
|
||||||
/* Duplicate the word to rebalance the labelled samples */
|
/* Duplicate the word to rebalance the labelled samples */
|
||||||
if (applybox_debug > 9) {
|
if (applybox_debug > 9) {
|
||||||
tprintf ("Duping \"%c\" from ", ch);
|
tprintf ("Duping \"%s\" from ", unicharset_boxes.id_to_unichar(uch_id));
|
||||||
word->bounding_box ().print ();
|
word->bounding_box ().print ();
|
||||||
}
|
}
|
||||||
duplicate_word = new WERD;
|
duplicate_word = new WERD;
|
||||||
*duplicate_word = *word;
|
*duplicate_word = *word;
|
||||||
word_it.add_after_then_move (duplicate_word);
|
word_it.add_after_then_move (duplicate_word);
|
||||||
rebalance_count++;
|
rebalance_count++;
|
||||||
labelled_char_counts[ch]++;
|
labelled_char_counts[uch_id]++;
|
||||||
}
|
}
|
||||||
prev_row = row;
|
prev_row = row;
|
||||||
prev_left = left;
|
prev_left = left;
|
||||||
prev_ch = ch;
|
prev_uch_id = uch_id;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
rebalance_needed = FALSE;
|
rebalance_needed = FALSE;
|
||||||
for (i = 0; i < 128; i++) {
|
for (i = 0; i < unicharset_boxes.size(); i++) {
|
||||||
if ((tgt_char_counts[i] > labelled_char_counts[i]) &&
|
if ((tgt_char_counts[i] > labelled_char_counts[i]) &&
|
||||||
(labelled_char_counts[i] > 1)) {
|
(labelled_char_counts[i] > 1)) {
|
||||||
rebalance_needed = TRUE;
|
rebalance_needed = TRUE;
|
||||||
@ -653,7 +695,7 @@ void tidy_up( //
|
|||||||
for (word_it.mark_cycle_pt ();
|
for (word_it.mark_cycle_pt ();
|
||||||
!word_it.cycled_list (); word_it.forward ()) {
|
!word_it.cycled_list (); word_it.forward ()) {
|
||||||
word = word_it.data ();
|
word = word_it.data ();
|
||||||
if ((strlen (word->text ()) == 1) &&
|
if ((strlen (word->text ()) > 0) &&
|
||||||
(word->gblob_list ()->length () == 1))
|
(word->gblob_list ()->length () == 1))
|
||||||
final_labelled_blob_count++;
|
final_labelled_blob_count++;
|
||||||
}
|
}
|
||||||
@ -665,7 +707,7 @@ void tidy_up( //
|
|||||||
void report_failed_box(INT16 boxfile_lineno,
|
void report_failed_box(INT16 boxfile_lineno,
|
||||||
INT16 boxfile_charno,
|
INT16 boxfile_charno,
|
||||||
BOX box,
|
BOX box,
|
||||||
char *box_ch,
|
const char *box_ch,
|
||||||
const char *err_msg) {
|
const char *err_msg) {
|
||||||
if (applybox_debug > 4)
|
if (applybox_debug > 4)
|
||||||
tprintf ("APPLY_BOXES: boxfile %1d/%1d/%s ((%1d,%1d),(%1d,%1d)): %s\n",
|
tprintf ("APPLY_BOXES: boxfile %1d/%1d/%s ((%1d,%1d),(%1d,%1d)): %s\n",
|
||||||
@ -687,10 +729,9 @@ void apply_box_training(BLOCK_LIST *block_list) {
|
|||||||
PBLOB_IT blob_it;
|
PBLOB_IT blob_it;
|
||||||
DENORM denorm;
|
DENORM denorm;
|
||||||
INT16 count = 0;
|
INT16 count = 0;
|
||||||
char ch[2];
|
char unichar[UNICHAR_LEN + 1];
|
||||||
|
|
||||||
ch[1] = '\0';
|
|
||||||
|
|
||||||
|
unichar[UNICHAR_LEN] = '\0';
|
||||||
tprintf ("Generating training data\n");
|
tprintf ("Generating training data\n");
|
||||||
for (block_it.mark_cycle_pt ();
|
for (block_it.mark_cycle_pt ();
|
||||||
!block_it.cycled_list (); block_it.forward ()) {
|
!block_it.cycled_list (); block_it.forward ()) {
|
||||||
@ -701,23 +742,22 @@ void apply_box_training(BLOCK_LIST *block_list) {
|
|||||||
for (word_it.mark_cycle_pt ();
|
for (word_it.mark_cycle_pt ();
|
||||||
!word_it.cycled_list (); word_it.forward ()) {
|
!word_it.cycled_list (); word_it.forward ()) {
|
||||||
word = word_it.data ();
|
word = word_it.data ();
|
||||||
if ((strlen (word->text ()) == 1) &&
|
if ((strlen (word->text ()) > 0) &&
|
||||||
(word->gblob_list ()->length () == 1)) {
|
(word->gblob_list ()->length () == 1)) {
|
||||||
/* Here is a word with a single char label and a single blob so train on it */
|
/* Here is a word with a single unichar label and a single blob so train on it */
|
||||||
bln_word =
|
bln_word =
|
||||||
make_bln_copy (word, row, row->x_height (), &denorm);
|
make_bln_copy (word, row, row->x_height (), &denorm);
|
||||||
blob_it.set_to_list (bln_word->blob_list ());
|
blob_it.set_to_list (bln_word->blob_list ());
|
||||||
ch[0] = *word->text ();
|
strncpy(unichar, word->text (), UNICHAR_LEN);
|
||||||
tess_training_tester (blob_it.data (),
|
tess_training_tester (blob_it.data (),
|
||||||
//single blob
|
//single blob
|
||||||
&denorm, TRUE, //correct
|
&denorm, TRUE, //correct
|
||||||
ch, //correct ASCII char
|
unichar, //correct character
|
||||||
1, //ASCII length
|
strlen(unichar), //character length
|
||||||
NULL);
|
NULL);
|
||||||
copy_outword = *(bln_word);
|
copy_outword = *(bln_word);
|
||||||
copy_outword.baseline_denormalise (&denorm);
|
copy_outword.baseline_denormalise (&denorm);
|
||||||
blob_it.set_to_list (copy_outword.blob_list ());
|
blob_it.set_to_list (copy_outword.blob_list ());
|
||||||
ch[0] = *word->text ();
|
|
||||||
delete bln_word;
|
delete bln_word;
|
||||||
count++;
|
count++;
|
||||||
}
|
}
|
||||||
@ -793,7 +833,7 @@ void apply_box_testing(BLOCK_LIST *block_list) {
|
|||||||
choice list, outword blob lists and best_choice string are the same
|
choice list, outword blob lists and best_choice string are the same
|
||||||
length. A TESS screw up is indicated by a blank filled or 0 length string.
|
length. A TESS screw up is indicated by a blank filled or 0 length string.
|
||||||
*/
|
*/
|
||||||
if ((best_choice->string ().length () == 0) ||
|
if ((best_choice->lengths ().length () == 0) ||
|
||||||
(strspn (best_choice->string ().string (), " ") ==
|
(strspn (best_choice->string ().string (), " ") ==
|
||||||
best_choice->string ().length ())) {
|
best_choice->string ().length ())) {
|
||||||
rej_count++;
|
rej_count++;
|
||||||
@ -804,22 +844,22 @@ void apply_box_testing(BLOCK_LIST *block_list) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
if ((best_choice->string ().length () !=
|
if ((best_choice->lengths ().length () !=
|
||||||
outword->blob_list ()->length ()) ||
|
outword->blob_list ()->length ()) ||
|
||||||
(best_choice->string ().length () !=
|
(best_choice->lengths ().length () !=
|
||||||
blob_choices.length ())) {
|
blob_choices.length ())) {
|
||||||
tprintf
|
tprintf
|
||||||
("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
|
("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
|
||||||
best_choice->string ().string (),
|
best_choice->string ().string (),
|
||||||
best_choice->string ().length (),
|
best_choice->lengths ().length (),
|
||||||
outword->blob_list ()->length (),
|
outword->blob_list ()->length (),
|
||||||
blob_choices.length ());
|
blob_choices.length ());
|
||||||
}
|
}
|
||||||
ASSERT_HOST (best_choice->string ().length () ==
|
ASSERT_HOST (best_choice->lengths ().length () ==
|
||||||
outword->blob_list ()->length ());
|
outword->blob_list ()->length ());
|
||||||
ASSERT_HOST (best_choice->string ().length () ==
|
ASSERT_HOST (best_choice->lengths ().length () ==
|
||||||
blob_choices.length ());
|
blob_choices.length ());
|
||||||
fix_quotes ((char *) best_choice->string ().string (),
|
fix_quotes (best_choice,
|
||||||
//turn to double
|
//turn to double
|
||||||
outword, &blob_choices);
|
outword, &blob_choices);
|
||||||
if (strcmp (best_choice->string ().string (), ch) != 0) {
|
if (strcmp (best_choice->string ().string (), ch) != 0) {
|
||||||
|
@ -27,6 +27,7 @@
|
|||||||
#include "applybox.h"
|
#include "applybox.h"
|
||||||
#include "pgedit.h"
|
#include "pgedit.h"
|
||||||
#include "varabled.h"
|
#include "varabled.h"
|
||||||
|
#include "output.h"
|
||||||
#include "adaptmatch.h"
|
#include "adaptmatch.h"
|
||||||
|
|
||||||
BOOL_VAR(tessedit_resegment_from_boxes, FALSE,
|
BOOL_VAR(tessedit_resegment_from_boxes, FALSE,
|
||||||
@ -37,6 +38,8 @@ BOOL_VAR(tessedit_train_from_boxes, FALSE,
|
|||||||
// Minimum sensible image size to be worth running tesseract.
|
// Minimum sensible image size to be worth running tesseract.
|
||||||
const int kMinRectSize = 10;
|
const int kMinRectSize = 10;
|
||||||
|
|
||||||
|
static STRING input_file = "noname.tif";
|
||||||
|
|
||||||
// Start tesseract.
|
// Start tesseract.
|
||||||
// The datapath must be the name of the data directory or some other file
|
// The datapath must be the name of the data directory or some other file
|
||||||
// in which the data directory resides (for instance argv[0].)
|
// in which the data directory resides (for instance argv[0].)
|
||||||
@ -70,6 +73,12 @@ int TessBaseAPI::InitWithLanguage(const char* datapath, const char* outputbase,
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Set the name of the input file. Needed only for training and
|
||||||
|
// loading a UNLV zone file.
|
||||||
|
void TessBaseAPI::SetInputName(const char* name) {
|
||||||
|
input_file = name;
|
||||||
|
}
|
||||||
|
|
||||||
// Recognize a rectangle from an image and return the result as a string.
|
// Recognize a rectangle from an image and return the result as a string.
|
||||||
// May be called many times for a single Init.
|
// May be called many times for a single Init.
|
||||||
// Currently has no error checking.
|
// Currently has no error checking.
|
||||||
@ -96,6 +105,52 @@ char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
|
|||||||
return RecognizeToString();
|
return RecognizeToString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// As TesseractRect but produces a box file as output.
|
||||||
|
char* TessBaseAPI::TesseractRectBoxes(const unsigned char* imagedata,
|
||||||
|
int bytes_per_pixel,
|
||||||
|
int bytes_per_line,
|
||||||
|
int left, int top,
|
||||||
|
int width, int height,
|
||||||
|
int imageheight) {
|
||||||
|
if (width < kMinRectSize || height < kMinRectSize)
|
||||||
|
return NULL; // Nothing worth doing.
|
||||||
|
|
||||||
|
// Copy/Threshold the image to the tesseract global page_image.
|
||||||
|
CopyImageToTesseract(imagedata, bytes_per_pixel, bytes_per_line,
|
||||||
|
left, top, width, height);
|
||||||
|
|
||||||
|
BLOCK_LIST block_list;
|
||||||
|
|
||||||
|
FindLines(&block_list);
|
||||||
|
|
||||||
|
// Now run the main recognition.
|
||||||
|
PAGE_RES* page_res = Recognize(&block_list, NULL);
|
||||||
|
|
||||||
|
return TesseractToBoxText(page_res, left, imageheight - (top + height));
|
||||||
|
}
|
||||||
|
|
||||||
|
char* TessBaseAPI::TesseractRectUNLV(const unsigned char* imagedata,
|
||||||
|
int bytes_per_pixel,
|
||||||
|
int bytes_per_line,
|
||||||
|
int left, int top,
|
||||||
|
int width, int height) {
|
||||||
|
if (width < kMinRectSize || height < kMinRectSize)
|
||||||
|
return NULL; // Nothing worth doing.
|
||||||
|
|
||||||
|
// Copy/Threshold the image to the tesseract global page_image.
|
||||||
|
CopyImageToTesseract(imagedata, bytes_per_pixel, bytes_per_line,
|
||||||
|
left, top, width, height);
|
||||||
|
|
||||||
|
BLOCK_LIST block_list;
|
||||||
|
|
||||||
|
FindLines(&block_list);
|
||||||
|
|
||||||
|
// Now run the main recognition.
|
||||||
|
PAGE_RES* page_res = Recognize(&block_list, NULL);
|
||||||
|
|
||||||
|
return TesseractToUNLV(page_res);
|
||||||
|
}
|
||||||
|
|
||||||
// Call between pages or documents etc to free up memory and forget
|
// Call between pages or documents etc to free up memory and forget
|
||||||
// adaptive data.
|
// adaptive data.
|
||||||
void TessBaseAPI::ClearAdaptiveClassifier() {
|
void TessBaseAPI::ClearAdaptiveClassifier() {
|
||||||
@ -326,7 +381,7 @@ void TessBaseAPI::CopyBinaryRect(const unsigned char* imagedata,
|
|||||||
image.capture(const_cast<unsigned char*>(imagedata),
|
image.capture(const_cast<unsigned char*>(imagedata),
|
||||||
bytes_per_line*8, top + height, 1);
|
bytes_per_line*8, top + height, 1);
|
||||||
page_image.create(width, height, 1);
|
page_image.create(width, height, 1);
|
||||||
copy_sub_image(&image, left, top, width, height, &page_image, 0, 0, false);
|
copy_sub_image(&image, left, 0, width, height, &page_image, 0, 0, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Low-level function to recognize the current global image to a string.
|
// Low-level function to recognize the current global image to a string.
|
||||||
@ -343,7 +398,6 @@ char* TessBaseAPI::RecognizeToString() {
|
|||||||
|
|
||||||
// Find lines from the image making the BLOCK_LIST.
|
// Find lines from the image making the BLOCK_LIST.
|
||||||
void TessBaseAPI::FindLines(BLOCK_LIST* block_list) {
|
void TessBaseAPI::FindLines(BLOCK_LIST* block_list) {
|
||||||
STRING input_file = "noname.tif";
|
|
||||||
// The following call creates a full-page block and then runs connected
|
// The following call creates a full-page block and then runs connected
|
||||||
// component analysis and text line creation.
|
// component analysis and text line creation.
|
||||||
pgeditor_read_file(input_file, block_list);
|
pgeditor_read_file(input_file, block_list);
|
||||||
@ -369,21 +423,32 @@ PAGE_RES* TessBaseAPI::Recognize(BLOCK_LIST* block_list, ETEXT_DESC* monitor) {
|
|||||||
return page_res;
|
return page_res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Return the maximum length that the output text string might occupy.
|
||||||
|
int TessBaseAPI::TextLength(PAGE_RES* page_res) {
|
||||||
|
PAGE_RES_IT page_res_it(page_res);
|
||||||
|
int total_length = 2;
|
||||||
|
// Iterate over the data structures to extract the recognition result.
|
||||||
|
for (page_res_it.restart_page(); page_res_it.word () != NULL;
|
||||||
|
page_res_it.forward()) {
|
||||||
|
WERD_RES *word = page_res_it.word();
|
||||||
|
WERD_CHOICE* choice = word->best_choice;
|
||||||
|
if (choice != NULL) {
|
||||||
|
total_length += choice->string().length() + 1;
|
||||||
|
for (int i = 0; i < word->reject_map.length(); ++i) {
|
||||||
|
if (word->reject_map[i].rejected())
|
||||||
|
++total_length;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return total_length;
|
||||||
|
}
|
||||||
|
|
||||||
// Make a text string from the internal data structures.
|
// Make a text string from the internal data structures.
|
||||||
// The input page_res is deleted.
|
// The input page_res is deleted.
|
||||||
char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) {
|
char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) {
|
||||||
if (page_res != NULL) {
|
if (page_res != NULL) {
|
||||||
int total_length = 2;
|
int total_length = TextLength(page_res);
|
||||||
PAGE_RES_IT page_res_it(page_res);
|
PAGE_RES_IT page_res_it(page_res);
|
||||||
// Iterate over the data structures to extract the recognition result.
|
|
||||||
for (page_res_it.restart_page(); page_res_it.word () != NULL;
|
|
||||||
page_res_it.forward()) {
|
|
||||||
WERD_RES *word = page_res_it.word();
|
|
||||||
WERD_CHOICE* choice = word->best_choice;
|
|
||||||
if (choice != NULL) {
|
|
||||||
total_length += choice->string().length() + 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
char* result = new char[total_length];
|
char* result = new char[total_length];
|
||||||
char* ptr = result;
|
char* ptr = result;
|
||||||
for (page_res_it.restart_page(); page_res_it.word () != NULL;
|
for (page_res_it.restart_page(); page_res_it.word () != NULL;
|
||||||
@ -406,3 +471,207 @@ char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) {
|
|||||||
}
|
}
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int ConvertWordToBoxText(WERD_RES *word,
|
||||||
|
ROW_RES* row,
|
||||||
|
int left,
|
||||||
|
int bottom,
|
||||||
|
char* word_str) {
|
||||||
|
// Copy the output word and denormalize it back to image coords.
|
||||||
|
WERD copy_outword;
|
||||||
|
copy_outword = *(word->outword);
|
||||||
|
copy_outword.baseline_denormalise(&word->denorm);
|
||||||
|
PBLOB_IT blob_it;
|
||||||
|
blob_it.set_to_list(copy_outword.blob_list());
|
||||||
|
int length = copy_outword.blob_list()->length();
|
||||||
|
int output_size = 0;
|
||||||
|
|
||||||
|
if (length > 0) {
|
||||||
|
for (int index = 0, offset = 0; index < length;
|
||||||
|
offset += word->best_choice->lengths()[index++], blob_it.forward()) {
|
||||||
|
PBLOB* blob = blob_it.data();
|
||||||
|
BOX blob_box = blob->bounding_box();
|
||||||
|
if (word->tess_failed ||
|
||||||
|
blob_box.left() < 0 ||
|
||||||
|
blob_box.right() > page_image.get_xsize() ||
|
||||||
|
blob_box.bottom() < 0 ||
|
||||||
|
blob_box.top() > page_image.get_ysize()) {
|
||||||
|
// Bounding boxes can be illegal when tess fails on a word.
|
||||||
|
blob_box = word->word->bounding_box(); // Use original word as backup.
|
||||||
|
tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n",
|
||||||
|
blob_box.left(), blob_box.bottom(),
|
||||||
|
blob_box.right(), blob_box.top());
|
||||||
|
}
|
||||||
|
|
||||||
|
// A single classification unit can be composed of several UTF-8
|
||||||
|
// characters. Append each of them to the result.
|
||||||
|
for (int sub = 0; sub < word->best_choice->lengths()[index]; ++sub) {
|
||||||
|
char ch = word->best_choice->string()[offset + sub];
|
||||||
|
// Tesseract uses space for recognition failure. Fix to a reject
|
||||||
|
// character, '~' so we don't create illegal box files.
|
||||||
|
if (ch == ' ')
|
||||||
|
ch = '~';
|
||||||
|
word_str[output_size++] = ch;
|
||||||
|
}
|
||||||
|
sprintf(word_str + output_size, " %d %d %d %d\n",
|
||||||
|
blob_box.left() + left, blob_box.bottom() + bottom,
|
||||||
|
blob_box.right() + left, blob_box.top() + bottom);
|
||||||
|
output_size += strlen(word_str + output_size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return output_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Multiplier for textlength assumes 4 numbers @ 5 digits and a space
|
||||||
|
// plus the newline and the orginial character = 4*(5+1)+2
|
||||||
|
const int kMaxCharsPerChar = 26;
|
||||||
|
|
||||||
|
// Make a text string from the internal data structures.
|
||||||
|
// The input page_res is deleted.
|
||||||
|
// The text string takes the form of a box file as needed for training.
|
||||||
|
char* TessBaseAPI::TesseractToBoxText(PAGE_RES* page_res,
|
||||||
|
int left, int bottom) {
|
||||||
|
if (page_res != NULL) {
|
||||||
|
int total_length = TextLength(page_res) * kMaxCharsPerChar;
|
||||||
|
PAGE_RES_IT page_res_it(page_res);
|
||||||
|
char* result = new char[total_length];
|
||||||
|
char* ptr = result;
|
||||||
|
for (page_res_it.restart_page(); page_res_it.word () != NULL;
|
||||||
|
page_res_it.forward()) {
|
||||||
|
WERD_RES *word = page_res_it.word();
|
||||||
|
ptr += ConvertWordToBoxText(word,page_res_it.row(),left, bottom, ptr);
|
||||||
|
}
|
||||||
|
*ptr = '\0';
|
||||||
|
delete page_res;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Make a text string from the internal data structures.
|
||||||
|
// The input page_res is deleted. The text string is converted
|
||||||
|
// to UNLV-format: Latin-1 with specific reject and suspect codes.
|
||||||
|
const char kUnrecognized = '~';
|
||||||
|
// Conversion table for non-latin characters.
|
||||||
|
// Maps characters out of the latin set into the latin set.
|
||||||
|
// TODO(rays) incorporate this translation into unicharset.
|
||||||
|
const int kUniChs[] = {
|
||||||
|
0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0
|
||||||
|
};
|
||||||
|
// Latin chars corresponding to the unicode chars above.
|
||||||
|
const int kLatinChs[] = {
|
||||||
|
0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0
|
||||||
|
};
|
||||||
|
|
||||||
|
char* TessBaseAPI::TesseractToUNLV(PAGE_RES* page_res) {
|
||||||
|
bool tilde_crunch_written = false;
|
||||||
|
bool last_char_was_newline = true;
|
||||||
|
bool last_char_was_tilde = false;
|
||||||
|
|
||||||
|
if (page_res != NULL) {
|
||||||
|
int total_length = TextLength(page_res);
|
||||||
|
PAGE_RES_IT page_res_it(page_res);
|
||||||
|
char* result = new char[total_length];
|
||||||
|
char* ptr = result;
|
||||||
|
for (page_res_it.restart_page(); page_res_it.word () != NULL;
|
||||||
|
page_res_it.forward()) {
|
||||||
|
WERD_RES *word = page_res_it.word();
|
||||||
|
// Process the current word.
|
||||||
|
if (word->unlv_crunch_mode != CR_NONE) {
|
||||||
|
if (word->unlv_crunch_mode != CR_DELETE &&
|
||||||
|
(!tilde_crunch_written ||
|
||||||
|
(word->unlv_crunch_mode == CR_KEEP_SPACE &&
|
||||||
|
word->word->space () > 0 &&
|
||||||
|
!word->word->flag (W_FUZZY_NON) &&
|
||||||
|
!word->word->flag (W_FUZZY_SP)))) {
|
||||||
|
if (!word->word->flag (W_BOL) &&
|
||||||
|
word->word->space () > 0 &&
|
||||||
|
!word->word->flag (W_FUZZY_NON) &&
|
||||||
|
!word->word->flag (W_FUZZY_SP)) {
|
||||||
|
/* Write a space to separate from preceeding good text */
|
||||||
|
*ptr++ = ' ';
|
||||||
|
last_char_was_tilde = false;
|
||||||
|
}
|
||||||
|
if (!last_char_was_tilde) {
|
||||||
|
// Write a reject char.
|
||||||
|
last_char_was_tilde = true;
|
||||||
|
*ptr++ = kUnrecognized;
|
||||||
|
tilde_crunch_written = true;
|
||||||
|
last_char_was_newline = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// NORMAL PROCESSING of non tilde crunched words.
|
||||||
|
tilde_crunch_written = false;
|
||||||
|
|
||||||
|
if (last_char_was_tilde &&
|
||||||
|
word->word->space () == 0 &&
|
||||||
|
(word->best_choice->string ()[0] == ' ')) {
|
||||||
|
/* Prevent adjacent tilde across words - we know that adjacent tildes within
|
||||||
|
words have been removed */
|
||||||
|
char* p = (char *) word->best_choice->string().string ();
|
||||||
|
strcpy (p, p + 1); //shuffle up
|
||||||
|
p = (char *) word->best_choice->lengths().string ();
|
||||||
|
strcpy (p, p + 1); //shuffle up
|
||||||
|
word->reject_map.remove_pos (0);
|
||||||
|
PBLOB_IT blob_it = word->outword->blob_list ();
|
||||||
|
delete blob_it.extract (); //get rid of reject blob
|
||||||
|
}
|
||||||
|
|
||||||
|
if (word->word->flag(W_REP_CHAR) && tessedit_consistent_reps)
|
||||||
|
ensure_rep_chars_are_consistent(word);
|
||||||
|
|
||||||
|
set_unlv_suspects(word);
|
||||||
|
const char* wordstr = word->best_choice->string().string();
|
||||||
|
if (wordstr[0] != 0) {
|
||||||
|
if (!last_char_was_newline)
|
||||||
|
*ptr++ = ' ';
|
||||||
|
else
|
||||||
|
last_char_was_newline = false;
|
||||||
|
int offset = 0;
|
||||||
|
const STRING& lengths = word->best_choice->lengths();
|
||||||
|
int length = lengths.length();
|
||||||
|
for (int i = 0; i < length; offset += lengths[i++]) {
|
||||||
|
if (wordstr[offset] == ' ' ||
|
||||||
|
wordstr[offset] == '~' ||
|
||||||
|
wordstr[offset] == '|') {
|
||||||
|
*ptr++ = kUnrecognized;
|
||||||
|
last_char_was_tilde = true;
|
||||||
|
} else {
|
||||||
|
if (word->reject_map[i].rejected())
|
||||||
|
*ptr++ = '^';
|
||||||
|
UNICHAR ch(wordstr + offset, lengths[i]);
|
||||||
|
int uni_ch = ch.first_uni();
|
||||||
|
for (int j = 0; kUniChs[j] != 0; ++j) {
|
||||||
|
if (kUniChs[j] == uni_ch) {
|
||||||
|
uni_ch = kLatinChs[j];
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (uni_ch <= 0xff) {
|
||||||
|
*ptr++ = static_cast<char>(uni_ch);
|
||||||
|
last_char_was_tilde = false;
|
||||||
|
} else {
|
||||||
|
*ptr++ = kUnrecognized;
|
||||||
|
last_char_was_tilde = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (word->word->flag(W_EOL) && !last_char_was_newline) {
|
||||||
|
/* Add a new line output */
|
||||||
|
*ptr++ = '\n';
|
||||||
|
tilde_crunch_written = false;
|
||||||
|
last_char_was_newline = true;
|
||||||
|
last_char_was_tilde = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*ptr++ = '\n';
|
||||||
|
*ptr = '\0';
|
||||||
|
delete page_res;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
@@ -20,8 +20,6 @@
 #ifndef THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__
 #define THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__

-#include <string>
-
 class PAGE_RES;
 class BLOCK_LIST;

@@ -56,6 +54,10 @@ class TessBaseAPI {
 const char* language, const char* configfile,
 bool numeric_mode, int argc, char* argv[]);

+// Set the name of the input file. Needed only for training and
+// reading a UNLV zone file.
+static void SetInputName(const char* name);
+
 // Recognize a rectangle from an image and return the result as a string.
 // May be called many times for a single Init.
 // Currently has no error checking.
@@ -71,6 +73,19 @@ class TessBaseAPI {
 int bytes_per_pixel,
 int bytes_per_line,
 int left, int top, int width, int height);
+// As TesseractRect but produces a box file as output.
+// Image height is needed as well as rect height, since output y-coords
+// will be relative to the bottom of the image.
+static char* TesseractRectBoxes(const unsigned char* imagedata,
+int bytes_per_pixel,
+int bytes_per_line,
+int left, int top, int width, int height,
+int imageheight);
+// As TesseractRect but produces UNLV-style output.
+static char* TesseractRectUNLV(const unsigned char* imagedata,
+int bytes_per_pixel,
+int bytes_per_line,
+int left, int top, int width, int height);

 // Call between pages or documents etc to free up memory and forget
 // adaptive data.
@@ -153,8 +168,18 @@ class TessBaseAPI {
 static PAGE_RES* Recognize(BLOCK_LIST* block_list,
 struct ETEXT_STRUCT* monitor);

+// Return the maximum length that the output text string might occupy.
+static int TextLength(PAGE_RES* page_res);
 // Convert (and free) the internal data structures into a text string.
 static char* TesseractToText(PAGE_RES* page_res);
+// Make a text string from the internal data structures.
+// The input page_res is deleted.
+// The text string takes the form of a box file as needed for training.
+static char* TesseractToBoxText(PAGE_RES* page_res, int left, int bottom);
+// Make a text string from the internal data structures.
+// The input page_res is deleted. The text string is converted
+// to UNLV-format: Latin-1 with specific reject and suspect codes.
+static char* TesseractToUNLV(PAGE_RES* page_res);
 };

 #endif  // THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__
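Putting the new declarations together, a single call now takes an image rectangle to UNLV-style Latin-1 text. A hedged usage sketch (datapath, output base, language, and image arguments are invented placeholders; only calls declared in this header are used):

#include "baseapi.h"

// Illustrative only: recognize one rectangle of a loaded page as UNLV text.
char* RectToUnlv(const unsigned char* imagedata, int bytes_per_pixel,
                 int bytes_per_line, int width, int height) {
  // Placeholder datapath/outputbase/language/config; no extra argv.
  TessBaseAPI::InitWithLanguage("./", "out", "eng", NULL, false, 0, NULL);
  TessBaseAPI::SetInputName("noname.tif");   // only needed for training/zone files
  char* text = TessBaseAPI::TesseractRectUNLV(imagedata, bytes_per_pixel,
                                              bytes_per_line, 0, 0,
                                              width, height);
  TessBaseAPI::ClearAdaptiveClassifier();    // free per-page adaptive data
  return text;                               // caller owns the returned buffer
}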
|
@ -35,6 +35,7 @@
|
|||||||
#include "docqual.h"
|
#include "docqual.h"
|
||||||
#include "output.h"
|
#include "output.h"
|
||||||
#include "bestfirst.h"
|
#include "bestfirst.h"
|
||||||
|
#include "globals.h"
|
||||||
|
|
||||||
#define EXTERN
|
#define EXTERN
|
||||||
|
|
||||||
@ -55,12 +56,12 @@ EXTERN BOOL_EVAR (tessedit_write_block_separators, FALSE,
|
|||||||
"Write block separators in output");
|
"Write block separators in output");
|
||||||
EXTERN BOOL_VAR (tessedit_write_raw_output, FALSE,
|
EXTERN BOOL_VAR (tessedit_write_raw_output, FALSE,
|
||||||
"Write raw stuff to name.raw");
|
"Write raw stuff to name.raw");
|
||||||
EXTERN BOOL_EVAR (tessedit_write_output, TRUE, "Write text to name.txt");
|
EXTERN BOOL_EVAR (tessedit_write_output, FALSE, "Write text to name.txt");
|
||||||
EXTERN BOOL_EVAR (tessedit_write_ratings, FALSE,
|
EXTERN BOOL_EVAR (tessedit_write_ratings, FALSE,
|
||||||
"Return ratings in IPEOCRAPI data");
|
"Return ratings in IPEOCRAPI data");
|
||||||
EXTERN BOOL_EVAR (tessedit_write_txt_map, TRUE,
|
EXTERN BOOL_EVAR (tessedit_write_txt_map, FALSE,
|
||||||
"Write .txt to .etx map file");
|
"Write .txt to .etx map file");
|
||||||
EXTERN BOOL_EVAR (tessedit_write_rep_codes, TRUE,
|
EXTERN BOOL_EVAR (tessedit_write_rep_codes, FALSE,
|
||||||
"Write repetition char code");
|
"Write repetition char code");
|
||||||
EXTERN BOOL_EVAR (tessedit_write_unlv, FALSE, "Write .unlv output file");
|
EXTERN BOOL_EVAR (tessedit_write_unlv, FALSE, "Write .unlv output file");
|
||||||
EXTERN STRING_EVAR (unrecognised_char, "|",
|
EXTERN STRING_EVAR (unrecognised_char, "|",
|
||||||
@ -106,7 +107,6 @@ INT32 pixels_to_pts( //convert coords
|
|||||||
return (INT32) (pts + 0.5); //round it
|
return (INT32) (pts + 0.5); //round it
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void output_pass( //Tess output pass //send to api
|
void output_pass( //Tess output pass //send to api
|
||||||
PAGE_RES_IT &page_res_it,
|
PAGE_RES_IT &page_res_it,
|
||||||
BOOL8 write_to_shm,
|
BOOL8 write_to_shm,
|
||||||
@ -119,8 +119,7 @@ void output_pass( //Tess output pass //send to api
|
|||||||
|
|
||||||
if (tessedit_write_txt_map)
|
if (tessedit_write_txt_map)
|
||||||
txt_mapfile = open_outfile (".map");
|
txt_mapfile = open_outfile (".map");
|
||||||
if (tessedit_write_unlv)
|
|
||||||
unlv_file = open_outfile (".unlv");
|
|
||||||
page_res_it.restart_page ();
|
page_res_it.restart_page ();
|
||||||
block_of_last_word = NULL;
|
block_of_last_word = NULL;
|
||||||
while (page_res_it.word () != NULL) {
|
while (page_res_it.word () != NULL) {
|
||||||
@ -189,7 +188,6 @@ void output_pass( //Tess output pass //send to api
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* write_results()
|
* write_results()
|
||||||
*
|
*
|
||||||
@ -211,9 +209,10 @@ void write_results( //output a word
|
|||||||
) {
|
) {
|
||||||
//word to do
|
//word to do
|
||||||
WERD_RES *word = page_res_it.word ();
|
WERD_RES *word = page_res_it.word ();
|
||||||
WERD_CHOICE *ep_choice; //ep format
|
// WERD_CHOICE *ep_choice; //ep format
|
||||||
STRING repetition_code;
|
STRING repetition_code;
|
||||||
const STRING *wordstr;
|
const STRING *wordstr;
|
||||||
|
STRING wordstr_lengths;
|
||||||
const char *text;
|
const char *text;
|
||||||
int i;
|
int i;
|
||||||
char unrecognised = STRING (unrecognised_char)[0];
|
char unrecognised = STRING (unrecognised_char)[0];
|
||||||
@ -312,15 +311,12 @@ void write_results( //output a word
|
|||||||
if (tessedit_write_output && !NO_BLOCK)
|
if (tessedit_write_output && !NO_BLOCK)
|
||||||
fprintf (textfile, "%s", txt_chs);
|
fprintf (textfile, "%s", txt_chs);
|
||||||
|
|
||||||
if (tessedit_write_unlv)
|
|
||||||
fprintf (unlv_file, "%s", txt_chs);
|
|
||||||
|
|
||||||
if (tessedit_write_txt_map)
|
if (tessedit_write_txt_map)
|
||||||
fprintf (txt_mapfile, "%s", map_chs);
|
fprintf (txt_mapfile, "%s", map_chs);
|
||||||
|
|
||||||
//terminate string
|
//terminate string
|
||||||
ep_chars[ep_chars_index] = '\0';
|
ep_chars[ep_chars_index] = '\0';
|
||||||
word->ep_choice = new WERD_CHOICE (ep_chars, 0, 0, NO_PERM);
|
word->ep_choice = new WERD_CHOICE (ep_chars, NULL, 0, 0, NO_PERM);
|
||||||
|
|
||||||
if (force_eol)
|
if (force_eol)
|
||||||
empty_block = TRUE;
|
empty_block = TRUE;
|
||||||
@ -345,6 +341,8 @@ void write_results( //output a word
|
|||||||
words have been removed */
|
words have been removed */
|
||||||
ptr = (char *) word->best_choice->string ().string ();
|
ptr = (char *) word->best_choice->string ().string ();
|
||||||
strcpy (ptr, ptr + 1); //shuffle up
|
strcpy (ptr, ptr + 1); //shuffle up
|
||||||
|
ptr = (char *) word->best_choice->lengths ().string ();
|
||||||
|
strcpy (ptr, ptr + 1); //shuffle up
|
||||||
word->reject_map.remove_pos (0);
|
word->reject_map.remove_pos (0);
|
||||||
blob_it = word->outword->blob_list ();
|
blob_it = word->outword->blob_list ();
|
||||||
delete blob_it.extract (); //get rid of reject blob
|
delete blob_it.extract (); //get rid of reject blob
|
||||||
@ -354,8 +352,10 @@ void write_results( //output a word
|
|||||||
last_char_was_tilde = FALSE;
|
last_char_was_tilde = FALSE;
|
||||||
else {
|
else {
|
||||||
if (word->reject_map.length () > 0) {
|
if (word->reject_map.length () > 0) {
|
||||||
if (word->best_choice->string ()[word->reject_map.length () - 1] ==
|
for (i = 0, ptr = (char *) word->best_choice->string().string();
|
||||||
' ')
|
i < word->reject_map.length () - 1; ++i)
|
||||||
|
ptr += word->best_choice->lengths()[i];
|
||||||
|
if (*ptr == ' ')
|
||||||
last_char_was_tilde = TRUE;
|
last_char_was_tilde = TRUE;
|
||||||
else
|
else
|
||||||
last_char_was_tilde = FALSE;
|
last_char_was_tilde = FALSE;
|
||||||
@ -365,7 +365,7 @@ void write_results( //output a word
|
|||||||
/* else it is unchanged as there are no output chars */
|
/* else it is unchanged as there are no output chars */
|
||||||
}
|
}
|
||||||
|
|
||||||
ptr = (char *) word->best_choice->string ().string ();
|
ptr = (char *) word->best_choice->lengths ().string ();
|
||||||
ASSERT_HOST (strlen (ptr) == word->reject_map.length ());
|
ASSERT_HOST (strlen (ptr) == word->reject_map.length ());
|
||||||
|
|
||||||
if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps)
|
if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps)
|
||||||
@ -379,21 +379,26 @@ void write_results( //output a word
|
|||||||
dict_word (word->best_choice->string ().string ()));
|
dict_word (word->best_choice->string ().string ()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
if (tessedit_write_unlv) {
|
if (tessedit_write_unlv) {
|
||||||
write_unlv_text(word);
|
write_unlv_text(word);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
|
if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
|
||||||
repetition_code = "|^~R";
|
repetition_code = "|^~R";
|
||||||
repetition_code += get_rep_char (word);
|
wordstr_lengths = "\001\001\001\001";
|
||||||
|
repetition_code += unicharset.id_to_unichar(get_rep_char (word));
|
||||||
|
wordstr_lengths += strlen(unicharset.id_to_unichar(get_rep_char (word)));
|
||||||
wordstr = &repetition_code;
|
wordstr = &repetition_code;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
wordstr = &(word->best_choice->string ());
|
wordstr = &(word->best_choice->string ());
|
||||||
|
wordstr_lengths = word->best_choice->lengths ();
|
||||||
if (tessedit_zero_rejection) {
|
if (tessedit_zero_rejection) {
|
||||||
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
|
/* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
|
||||||
text = wordstr->string ();
|
text = wordstr->string ();
|
||||||
for (i = 0; text[i] != '\0'; i++) {
|
for (i = 0; *text != '\0'; text += word->best_choice->lengths()[i++]) {
|
||||||
if (word->reject_map[i].rejected ())
|
if (word->reject_map[i].rejected ())
|
||||||
word->reject_map[i].setrej_minimal_rej_accept ();
|
word->reject_map[i].setrej_minimal_rej_accept ();
|
||||||
}
|
}
|
||||||
@@ -401,8 +406,8 @@ void write_results( //output a word
     if (tessedit_minimal_rejection) {
       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
       text = wordstr->string ();
-      for (i = 0; text[i] != '\0'; i++) {
-        if ((text[i] != ' ') && word->reject_map[i].rejected ())
+      for (i = 0; *text != '\0'; text += word->best_choice->lengths()[i++]) {
+        if ((*text != ' ') && word->reject_map[i].rejected ())
           word->reject_map[i].setrej_minimal_rej_accept ();
       }
     }
@@ -410,8 +415,9 @@ void write_results( //output a word

   if (write_to_shm)
     write_shm_text (word, page_res_it.block ()->block,
-                    page_res_it.row (), *wordstr);
+                    page_res_it.row (), *wordstr, wordstr_lengths);

+#if 0
   if (tessedit_write_output)
     write_cooked_text (word->word, *wordstr, TRUE, FALSE, textfile);

@@ -424,12 +430,12 @@ void write_results( //output a word

   ep_choice = make_epaper_choice (word, newline_type);
   word->ep_choice = ep_choice;
+#endif

-  character_count += word->best_choice->string ().length ();
+  character_count += word->best_choice->lengths ().length ();
   word_count++;
 }


 /**********************************************************************
  * make_epaper_choice
  *
@@ -437,6 +443,7 @@ void write_results( //output a word
  * determine whether each blob should be rejected.
  **********************************************************************/

+#if 0
 WERD_CHOICE *make_epaper_choice( //convert one word
                                  WERD_RES *word, //word to do
                                  char newline_type //type of newline
@@ -482,7 +489,8 @@ WERD_CHOICE *make_epaper_choice( //convert one word
   if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
     strcpy (word_string + index, "|^~R");
     index += 4;
-    word_string[index++] = get_rep_char (word);
+    strcpy(word_string + index, unicharset.id_to_unichar(get_rep_char (word)));
+    index += strlen(unicharset.id_to_unichar(get_rep_char (word)));
   }
   else {
     if (!blob_it.empty ())
@@ -537,7 +545,7 @@ WERD_CHOICE *make_epaper_choice( //convert one word
   ASSERT_HOST (strlen (word_string) == index);
   return new WERD_CHOICE (word_string, 0, 0, NO_PERM);
 }
+#endif

 /**********************************************************************
  * make_reject
@@ -653,6 +661,7 @@ char determine_newline_type( //test line ends
  * to the given file.
  **********************************************************************/

+#if 0
 void write_cooked_text( //write output
                         WERD *word, //word to do
                         const STRING &text, //text to write
@@ -749,6 +758,7 @@ void write_cooked_text( //write output
   if (status != 0)
     WRITEFAILED.error ("write_cooked_text", EXIT, "Fflush Errno: %d", errno);
 }
+#endif


 /**********************************************************************
@@ -761,7 +771,8 @@ void write_shm_text( //write output
                      WERD_RES *word, //word to do
                      BLOCK *block, //block it is from
                      ROW_RES *row, //row it is from
-                     const STRING &text //text to write
+                     const STRING &text, //text to write
+                     const STRING &text_lengths
                     ) {
   INT32 index; //char counter
   INT32 index2; //char counter
@@ -777,6 +788,8 @@ void write_shm_text( //write output
   WERD copy_outword; // copy to denorm
   UINT32 rating; //of char
   BOOL8 lineend; //end of line
+  int offset;
+  int offset2;

   //point size
   ptsize = pixels_to_pts ((INT32) (row->row->x_height () + row->row->ascenders () - row->row->descenders ()), 300);
@@ -786,13 +799,14 @@ void write_shm_text( //write output
   copy_outword = *(word->outword);
   copy_outword.baseline_denormalise (&word->denorm);
   blob_it.set_to_list (copy_outword.blob_list ());
-  length = text.length ();
+  length = text_lengths.length ();

   if (length > 0) {
     blanks = word->word->space ();
     if (blanks == 0 && tessedit_word_for_word && !word->word->flag (W_BOL))
       blanks = 1;
-    for (index = 0; index < length; index++, blob_it.forward ()) {
+    for (index = 0, offset = 0; index < length;
+         offset += text_lengths[index++], blob_it.forward ()) {
       blob = blob_it.data ();
       blob_box = blob->bounding_box ();

@@ -804,7 +818,7 @@ void write_shm_text( //write output
       if (tessedit_write_ratings)
         rating = (UINT32) (-word->best_choice->certainty () / 0.035);
       else if (tessedit_zero_rejection)
-        rating = text[index] == ' ' ? 100 : 0;
+        rating = text[offset] == ' ' ? 100 : 0;
       else
         rating = word->reject_map[index].accepted ()? 0 : 100;
       if (rating > 255)
@@ -819,22 +833,41 @@ void write_shm_text( //write output

       lineend = word->word->flag (W_EOL) && index == length - 1;
       if (word->word->flag (W_EOL) && tessedit_zero_rejection
-          && index < length - 1 && text[index + 1] == ' ') {
-        for (index2 = index + 1; index2 < length && text[index2] == ' ';
-             index2++);
+          && index < length - 1 && text[index + text_lengths[index]] == ' ') {
+        for (index2 = index + 1, offset2 = offset + text_lengths[index];
+             index2 < length && text[offset2] == ' ';
+             offset2 += text_lengths[index2++]);
         if (index2 == length)
           lineend = TRUE;
       }

-      if (!tessedit_zero_rejection || text[index] != ' '
+      if (!tessedit_zero_rejection || text[offset] != ' '
           || tessedit_word_for_word) {
         //confidence
-        ocr_append_char (text[index] == ' ' ? unrecognised : text[index], blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font, (UINT8) rating,
-                         ptsize, //point size
-                         blanks, enhancement, //enhancement
-                         OCR_CDIR_LEFT_RIGHT,
-                         OCR_LDIR_DOWN_RIGHT,
-                         lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
+        if (text[offset] == ' ') {
+          ocr_append_char (unrecognised,
+                           blob_box.left (), blob_box.right (),
+                           page_image.get_ysize () - 1 - blob_box.top (),
+                           page_image.get_ysize () - 1 - blob_box.bottom (),
+                           font, (UINT8) rating,
+                           ptsize, //point size
+                           blanks, enhancement, //enhancement
+                           OCR_CDIR_LEFT_RIGHT,
+                           OCR_LDIR_DOWN_RIGHT,
+                           lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
+        } else {
+          for (int suboffset = 0; suboffset < text_lengths[index]; ++suboffset)
+            ocr_append_char (text[offset + suboffset],
+                             blob_box.left (), blob_box.right (),
+                             page_image.get_ysize () - 1 - blob_box.top (),
+                             page_image.get_ysize () - 1 - blob_box.bottom (),
+                             font, (UINT8) rating,
+                             ptsize, //point size
+                             blanks, enhancement, //enhancement
+                             OCR_CDIR_LEFT_RIGHT,
+                             OCR_LDIR_DOWN_RIGHT,
+                             lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
+        }
         blanks = 0;
       }

@@ -863,13 +896,17 @@ void write_shm_text( //write output
       lineend = word->word->flag (W_EOL);

       //font index
-      ocr_append_char (unrecognised, blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font,
-                       rating, //confidence
-                       ptsize, //point size
-                       blanks, enhancement, //enhancement
-                       OCR_CDIR_LEFT_RIGHT,
-                       OCR_LDIR_DOWN_RIGHT,
-                       lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
+      ocr_append_char (unrecognised,
+                       blob_box.left (), blob_box.right (),
+                       page_image.get_ysize () - 1 - blob_box.top (),
+                       page_image.get_ysize () - 1 - blob_box.bottom (),
+                       font,
+                       rating, //confidence
+                       ptsize, //point size
+                       blanks, enhancement, //enhancement
+                       OCR_CDIR_LEFT_RIGHT,
+                       OCR_LDIR_DOWN_RIGHT,
+                       lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
     }
   }

@@ -888,6 +925,7 @@ void write_shm_text( //write output
  * newdiff needs etx files!
  **********************************************************************/

+#if 0
 void write_map( //output a map file
                 FILE *mapfile, //mapfile to write to
                 WERD_RES *word) {
@@ -937,6 +975,7 @@ void write_map( //output a map file
   if (status != 0)
     WRITEFAILED.error ("write_map", EXIT, "fflush Errno: %d", errno);
 }
+#endif


 /*************************************************************************
@@ -957,6 +996,7 @@ FILE *open_outfile( //open .map & .unlv file
 }


+#if 0
 void write_unlv_text(WERD_RES *word) {
   const char *wordstr;

@@ -1015,6 +1055,7 @@ void write_unlv_text(WERD_RES *word) {
   if (status != 0)
     WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno);
 }
+#endif


 /*************************************************************************
@@ -1022,21 +1063,24 @@ void write_unlv_text(WERD_RES *word) {
  * Return the first accepted character from the repetition string. This is the
  * character which is repeated - as determined earlier by fix_rep_char()
  *************************************************************************/
-char get_rep_char( // what char is repeated?
-                   WERD_RES *word) {
+UNICHAR_ID get_rep_char(WERD_RES *word) { // what char is repeated?
   int i;
+  int offset;

-  for (i = 0;
+  for (i = 0, offset = 0;
       ((i < word->reject_map.length ()) &&
-       (word->reject_map[i].rejected ())); i++);
+       (word->reject_map[i].rejected ()));
+       offset += word->best_choice->lengths()[i++]);
   if (i < word->reject_map.length ())
-    return word->best_choice->string ()[i];
+    return unicharset.unichar_to_id(word->best_choice->string().string()
+                                    + offset,
+                                    word->best_choice->lengths()[i]);
   else
-    return STRING (unrecognised_char)[0];
+    return unicharset.unichar_to_id(unrecognised_char.string());
 }


 void ensure_rep_chars_are_consistent(WERD_RES *word) {
+#if 0
   char rep_char = get_rep_char (word);
   char *ptr;

@@ -1045,8 +1089,24 @@ void ensure_rep_chars_are_consistent(WERD_RES *word) {
     if (*ptr != rep_char)
       *ptr = rep_char;
   }
-}
+#endif
+
+#if 0
+  UNICHAR_ID rep_char = get_rep_char (word); //TODO(tkielbus) Reactivate
+  int i;
+  char *ptr;
+  STRING consistent_string;
+  STRING consistent_string_lengths;
+
+  ptr = (char *) word->best_choice->string ().string ();
+  for (i = 0; *ptr != '\0'; ptr += word->best_choice->lengths()[i++]) {
+    consistent_string += unicharset.id_to_unichar(rep_char);
+    consistent_string_lengths += strlen(unicharset.id_to_unichar(rep_char));
+  }
+  word->best_choice->string() = consistent_string;
+  word->best_choice->lengths() = consistent_string_lengths;
+#endif
+}

 /*************************************************************************
  * SUSPECT LEVELS
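
The rewritten get_rep_char above walks the reject map with a separate byte offset and returns a UNICHAR_ID instead of a single char. A standalone sketch of the same index/offset walk, with plain arrays standing in for WERD_RES and UNICHARSET (the names here are illustrative, not from the commit):

    // Sketch only: find the first accepted character of a word whose
    // characters may span several bytes; mirrors the i/offset loop above.
    #include <cstddef>

    const char* first_accepted(const char* text,      // UTF-8 word text
                               const char* lengths,   // byte length per character
                               const bool* rejected,  // one flag per character
                               int num_chars,
                               int* byte_len) {       // out: byte length of the hit
      int offset = 0;
      for (int i = 0; i < num_chars; offset += lengths[i++]) {
        if (!rejected[i]) {
          *byte_len = lengths[i];
          return text + offset;   // points at the repeated character's bytes
        }
      }
      return NULL;                // every character was rejected
    }
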
@@ -1062,7 +1122,9 @@ void ensure_rep_chars_are_consistent(WERD_RES *word) {
 void set_unlv_suspects(WERD_RES *word) {
   int len = word->reject_map.length ();
   int i;
+  int offset;
   const char *ptr;
+  const char *lengths = word->best_choice->lengths ().string ();
   float rating_per_ch;

   ptr = word->best_choice->string ().string ();
@@ -1080,10 +1142,12 @@ void set_unlv_suspects(WERD_RES *word) {

   /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/

-  if (safe_dict_word (ptr) && (count_alphas (ptr) > suspect_short_words)) {
+  if (safe_dict_word (ptr) && (count_alphas (ptr, lengths) >
+      suspect_short_words)) {
     /* Unreject alphas in dictionary words */
-    for (i = 0; i < len; i++) {
-      if (word->reject_map[i].rejected () && isalpha (ptr[i]))
+    for (i = 0, offset = 0; i < len; offset += lengths[i++]) {
+      if (word->reject_map[i].rejected () &&
+          unicharset.get_isalpha (ptr + offset, lengths[i]))
         word->reject_map[i].setrej_minimal_rej_accept ();
     }
   }
@@ -1095,8 +1159,8 @@ void set_unlv_suspects(WERD_RES *word) {

   if ((word->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
     /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
-    for (i = 0; i < len; i++) {
-      if (word->reject_map[i].rejected () && (ptr[i] != ' '))
+    for (i = 0, offset = 0; i < len; offset += lengths[i++]) {
+      if (word->reject_map[i].rejected () && (ptr[offset] != ' '))
         word->reject_map[i].setrej_minimal_rej_accept ();
     }
   }
@@ -1130,9 +1194,11 @@ void set_unlv_suspects(WERD_RES *word) {
     }
   }

-  if ((acceptable_word_string (word->best_choice->string ().string ())
+  if ((acceptable_word_string (word->best_choice->string ().string (),
+                               word->best_choice->lengths ().string ())
       != AC_UNACCEPTABLE) ||
-      acceptable_number_string (word->best_choice->string ().string ())) {
+      acceptable_number_string (word->best_choice->string ().string (),
+                                word->best_choice->lengths ().string ())) {
     if (word->reject_map.length () > suspect_short_words) {
       for (i = 0; i < len; i++) {
         if (word->reject_map[i].rejected () &&
@@ -1149,11 +1215,12 @@ void set_unlv_suspects(WERD_RES *word) {


 INT16 count_alphas( //how many alphas
-                    const char *s) {
+                    const char *s,
+                    const char *lengths) {
   int count = 0;

-  for (; *s != '\0'; s++) {
-    if (isalpha (*s))
+  for (; *s != '\0'; s += *(lengths++)) {
+    if (unicharset.get_isalpha(s, *lengths))
       count++;
   }
   return count;
@@ -1161,36 +1228,43 @@ INT16 count_alphas( //how many alphas


 INT16 count_alphanums( //how many alphanums
-                       const char *s) {
+                       const char *s,
+                       const char *lengths) {
   int count = 0;

-  for (; *s != '\0'; s++) {
-    if (isalnum (*s))
+  for (; *s != '\0'; s += *(lengths++)) {
+    if (unicharset.get_isalpha(s, *lengths) ||
+        unicharset.get_isdigit(s, *lengths))
       count++;
   }
   return count;
 }


-BOOL8 acceptable_number_string(const char *s) {
+BOOL8 acceptable_number_string(const char *s,
+                               const char *lengths) {
   BOOL8 prev_digit = FALSE;

-  if (*s == '(')
+  if (*lengths == 1 && *s == '(')
     s++;

-  if ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))
+  if (*lengths == 1 &&
+      ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
     s++;

-  for (; *s != '\0'; s++) {
-    if (isdigit (*s))
+  for (; *s != '\0'; s += *(lengths++)) {
+    if (unicharset.get_isdigit (s, *lengths))
       prev_digit = TRUE;
-    else if (prev_digit && ((*s == '.') || (*s == ',') || (*s == '-')))
-      prev_digit = FALSE;
     else if (prev_digit &&
-             (*(s + 1) == '\0') && ((*s == '%') || (*s == ')')))
+             (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
+      prev_digit = FALSE;
+    else if (prev_digit && *lengths == 1 &&
+             (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
       return TRUE;
     else if (prev_digit &&
-             (*s == '%') && (*(s + 1) == ')') && (*(s + 2) == '\0'))
+             *lengths == 1 && (*s == '%') &&
+             (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
+             (*(s + *lengths + *(lengths + 1)) == '\0'))
       return TRUE;
     else
       return FALSE;
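
count_alphas, count_alphanums and acceptable_number_string now take the lengths string as well, so classification happens once per character rather than once per byte. The shared iteration pattern, shown here with a caller-supplied predicate instead of the global unicharset (a sketch, not part of this commit):

    // Sketch only: walk a UTF-8 string using a parallel lengths array, calling
    // a predicate once per character; this is the loop shape used above.
    typedef bool (*UnicharPredicate)(const char* utf8, int byte_len);

    int count_matching(const char* s, const char* lengths, UnicharPredicate pred) {
      int count = 0;
      for (; *s != '\0'; s += *(lengths++)) {
        // The increment has not run yet, so *lengths is still the byte length
        // of the character that s currently points at.
        if (pred(s, *lengths))
          ++count;
      }
      return count;
    }
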
@@ -31,7 +31,9 @@
 #include "stderr.h"
 #include "notdll.h"
 #include "mainblk.h"
+#include "output.h"
 #include "globals.h"
+#include "blread.h"
 #include "tfacep.h"
 #include "callnet.h"

@@ -40,7 +42,10 @@
 #define API_CONFIG "configs/api_config"
 #define EXTERN

+EXTERN BOOL_VAR (tessedit_create_boxfile, FALSE, "Output text with boxes");
 EXTERN BOOL_VAR (tessedit_read_image, TRUE, "Ensure the image is read");
+EXTERN INT_VAR (tessedit_serial_unlv, 0,
+                "0->Whole page, 1->serial no adapt, 2->serial with adapt");
 EXTERN BOOL_VAR (tessedit_write_images, FALSE,
                  "Capture the image from the IPE");
 EXTERN BOOL_VAR (tessedit_debug_to_screen, FALSE, "Dont use debug file");
@@ -63,15 +68,30 @@ int main(int argc, char **argv) {

   if (argc < 3) {
     USAGE.error (argv[0], EXIT,
-      "%s imagename outputbase [configfile [[+|-]varfile]...]\n", argv[0]);
+      "%s imagename outputbase [-l lang] [configfile [[+|-]varfile]...]\n",
+      argv[0]);
+  }
+  // Find the required language.
+  const char* lang = "eng";
+  int arg = 3;
+  if (argc >= 5 && strcmp(argv[3], "-l") == 0) {
+    lang = argv[4];
+    arg = 5;
+  }
+  // Find the basename of the input file.
+  STRING infile(argv[1]);
+  const char* lastdot = strrchr(argv[1], '.');
+  if (lastdot != NULL) {
+    infile[lastdot - argv[1]] = '\0';
   }

-  if (argc == 3)
-    TessBaseAPI::InitWithLanguage(argv[0], argv[1], NULL,
-                                  NULL, false, 0, argv + 2);
+  if (argc == arg)
+    TessBaseAPI::InitWithLanguage(argv[0], infile.string(), lang,
+                                  NULL, false, 0, argv + arg);
   else
-    TessBaseAPI::InitWithLanguage(argv[0], argv[1], NULL,
-                                  argv[3], false, argc - 4, argv + 4);
+    TessBaseAPI::InitWithLanguage(argv[0], infile.string(), lang,
+                                  argv[arg], false,
+                                  argc - arg - 1, argv + arg + 1);

   tprintf ("Tesseract Open Source OCR Engine\n");

@@ -92,20 +112,70 @@ int main(int argc, char **argv) {
              argv[1]);
   }
 #endif
+  STRING text_out;
   int bytes_per_line = check_legal_image_size(image.get_xsize(),
                                               image.get_ysize(),
                                               image.get_bpp());
-  char* text = TessBaseAPI::TesseractRect(image.get_buffer(), image.get_bpp()/8,
-                                          bytes_per_line, 0, 0,
-                                          image.get_xsize(), image.get_ysize());
+  if (tessedit_serial_unlv == 0) {
+    TessBaseAPI::SetInputName(argv[1]);
+    char* text;
+    if (tessedit_create_boxfile)
+      text = TessBaseAPI::TesseractRectBoxes(image.get_buffer(),
+                                             image.get_bpp()/8,
+                                             bytes_per_line, 0, 0,
+                                             image.get_xsize(),
+                                             image.get_ysize(),
+                                             image.get_ysize());
+    else if (tessedit_write_unlv)
+      text = TessBaseAPI::TesseractRectUNLV(image.get_buffer(),
+                                            image.get_bpp()/8,
+                                            bytes_per_line, 0, 0,
+                                            image.get_xsize(),
+                                            image.get_ysize());
+    else
+      text = TessBaseAPI::TesseractRect(image.get_buffer(), image.get_bpp()/8,
+                                        bytes_per_line, 0, 0,
+                                        image.get_xsize(), image.get_ysize());
+    text_out = text;
+    delete [] text;
+  } else {
+    BLOCK_LIST blocks;
+    STRING filename = argv[1];
+    int len = filename.length();
+    if (len > 4 && filename[len - 4] == '.') {
+      filename[len - 4] = '\0';
+    }
+    if (!read_unlv_file(filename, image.get_xsize(), image.get_ysize(),
+                        &blocks)) {
+      fprintf(stderr, "Error: Must have a unlv zone file %s to read!\n",
+              filename.string());
+      return 1;
+    }
+    BLOCK_IT b_it = &blocks;
+    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+      BLOCK* block = b_it.data();
+      BOX box = block->bounding_box();
+      char* text = TessBaseAPI::TesseractRectUNLV(image.get_buffer(),
+                                                  image.get_bpp()/8,
+                                                  bytes_per_line,
+                                                  box.left(),
+                                                  image.get_ysize() - box.top(),
+                                                  box.width(),
+                                                  box.height());
+      text_out += text;
+      delete [] text;
+      if (tessedit_serial_unlv == 1)
+        TessBaseAPI::ClearAdaptiveClassifier();
+    }
+  }

   outfile = argv[2];
   outfile += ".txt";
   FILE* fp = fopen(outfile.string(), "w");
   if (fp != NULL) {
-    fwrite(text, 1, strlen(text), fp);
+    fwrite(text_out.string(), 1, text_out.length(), fp);
     fclose(fp);
   }
-  delete [] text;
   TessBaseAPI::End();

   return 0; //Normal exit
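
With tessedit_serial_unlv non-zero, main() above recognises the page zone by zone from the UNLV layout file, concatenating the per-zone text and optionally clearing the adaptive classifier between zones. A reduced sketch of that control flow, with hypothetical stand-ins for the API calls (Zone, recognize_rect and clear_adaptation are illustrative names, not Tesseract API):

    // Sketch only: serial, zone-by-zone recognition as in the UNLV branch above.
    #include <string>
    #include <vector>

    struct Zone { int left, top, width, height; };   // one zone rectangle

    std::string recognize_serially(const std::vector<Zone>& zones,
                                   std::string (*recognize_rect)(const Zone&),
                                   void (*clear_adaptation)(),
                                   bool clear_between_zones) {
      std::string text_out;
      for (size_t i = 0; i < zones.size(); ++i) {
        text_out += recognize_rect(zones[i]);  // one zone at a time
        if (clear_between_zones)
          clear_adaptation();                  // mirrors tessedit_serial_unlv == 1
      }
      return text_out;
    }
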
@@ -527,7 +527,9 @@ BOOL8 read_unlv_file( //print list of sides
   else {
     while (fscanf (pdfp, "%d %d %d %d %*s", &x, &y, &width, &height) >= 4) {
       //make rect block
-      block = new BLOCK (name.string (), TRUE, 0, 0, (INT16) x, (INT16) (ysize - 1 - y - height), (INT16) (x + width), (INT16) (ysize - 1 - y));
+      block = new BLOCK (name.string (), TRUE, 0, 0,
+                         (INT16) x, (INT16) (ysize - y - height),
+                         (INT16) (x + width), (INT16) (ysize - y));
       //on end of list
       block_it.add_to_end (block);
     }
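
The read_unlv_file change above converts the zone file's top-left-origin (x, y, width, height) rectangle into Tesseract's bottom-left-origin block corners, dropping the old off-by-one "- 1". The arithmetic in isolation (a sketch, not the commit's code):

    // Sketch only: UNLV zones measure y downward from the image top; blocks
    // use y upward from the image bottom, so flip with the image height.
    struct Corners { int left, bottom, right, top; };

    Corners uzn_to_block(int x, int y, int width, int height, int ysize) {
      Corners c;
      c.left   = x;
      c.right  = x + width;
      c.bottom = ysize - y - height;  // lower edge, measured from the bottom
      c.top    = ysize - y;           // upper edge, measured from the bottom
      return c;
    }
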
@@ -63,7 +63,7 @@ make_toggle_var (debug_8, 0, make_debug_8, 6, 8, toggle_debug_8, "Debug #8");
 make_toggle_var (display_ratings, 0, make_display_ratings,
                  6, 9, toggle_ratings, "Ratings display");

-make_toggle_var (display_text, 1, make_display_text,
+make_toggle_var (display_text, 0, make_display_text,
                  6, 10, toggle_text, "Display Text");

 make_toggle_var (show_bold, 1, make_show_bold,
tessdata/configs/makebox  (new file, 1 line)
@@ -0,0 +1 @@
+tessedit_create_boxfile 1

tessdata/configs/unlv  (new file, 3 lines)
@@ -0,0 +1,3 @@
+tessedit_write_unlv 1
+tessedit_write_output 0
+tessedit_write_txt_map 0
@@ -1,78 +1,2 @@
+# No content needed as all defaults are correct.
-#################################################
-# Adaptive Matcher Using PreAdapted Templates
-#################################################
-
-acts_fx 0x800
-acts_ocr 0x20
-
-RatingScale 30.0
-CertaintyScale 20.0
-
-#EnableMatcher 0
-#CurrentFx 2
-MinSlope 0.414213562
-MaxSlope 2.414213562
-#ExtremityMode 1
-NormMethod 1
-EnableAdaptiveMatcher 1
-
-NormAdjMidpoint 32.0
-NormAdjCurl 2.0
-
-MinNormScaleX 0.0
-MaxNormScaleX 0.325
-MinNormScaleY 0.0
-MaxNormScaleY 0.325
-
-BuiltInTemplatesFile tessdata/inttemp
-BuiltInCutoffsFile tessdata/pffmtable
-
-EnableLearning 0
-SaveAdaptedTemplates 0
-UsePreAdaptedTemplates 0
-ReliableConfigThreshold 2
-MinNumPermClasses 3
-
-#EnableStopper 1
-GoodAdaptiveMatch 0.125
-GreatAdaptiveMatch 0.0
-
-EnableIntFX 1
-EnableNewAdaptRules 1
-################################################################################
-#
-# File: marks/configs/knobs
-# Description: Control variables for 'marks' code
-# Author: Mark Seaman, OCR Technology
-# Created: Wed Feb 27 11:27:27 1991
-# Modified: Tue Jul 30 16:25:37 1991 (Mark Seaman) marks@hpgrlt
-# Language: Text
-# Package: N/A
-# Status: Experimental (Do Not Distribute)
-#
-# (c) Copyright 1991, Hewlett-Packard Company, all rights reserved.
-#
-################################################################################
-
-#hidden_edges 1
-
-save_doc_words 1
-doc_dict_enable 1
-ClassPrunerThreshold 229
-ClassPrunerMultiplier 15
-IntThetaFudge 128
-CPCutoffStrength 0.15
-EvidenceTableBits 9
-IntEvidenceTruncBits 14
-SEExponentialMultiplier 0
-SimilarityCenter 0.0075
-#################################################
-# Adaptive Matcher Using 2 Passes
-#################################################
-
-EnableLearning 1
-SaveAdaptedTemplates 0
-UsePreAdaptedTemplates 0
-
-#save_errors 0
-

tessdata/tessconfigs/batch.nochop  (new file, 2 lines)
@@ -0,0 +1,2 @@
+chop_enable 0
+enable_assoc 0
@@ -2,80 +2,6 @@
 # Adaptive Matcher Using PreAdapted Templates
 #################################################
-
-acts_fx 0x800
-acts_ocr 0x20
-
-RatingScale 30.0
-CertaintyScale 20.0
-
-#EnableMatcher 0
-#CurrentFx 2
-EnableAdaptiveMatcher 1
-
-NormAdjMidpoint 32.0
-NormAdjCurl 2.0
-
-MinNormScaleX 0.0
-MaxNormScaleX 0.325
-MinNormScaleY 0.0
-MaxNormScaleY 0.325
-
-BuiltInTemplatesFile tessdata/inttemp
-BuiltInCutoffsFile tessdata/pffmtable
-
-EnableLearning 0
-SaveAdaptedTemplates 0
-UsePreAdaptedTemplates 0
-ReliableConfigThreshold 2
-MinNumPermClasses 3
-
-#EnableStopper 1
-GoodAdaptiveMatch 0.125
-GreatAdaptiveMatch 0.0
-
-EnableIntFX 1
-EnableNewAdaptRules 1
 EnableAdaptiveDebugger 1
 MatchDebugFlags 6
 MatcherDebugLevel 1
-################################################################################
-#
-# File: marks/configs/knobs
-# Description: Control variables for 'marks' code
-# Author: Mark Seaman, OCR Technology
-# Created: Wed Feb 27 11:27:27 1991
-# Modified: Tue Jul 30 16:25:37 1991 (Mark Seaman) marks@hpgrlt
-# Language: Text
-# Package: N/A
-# Status: Experimental (Do Not Distribute)
-#
-# (c) Copyright 1991, Hewlett-Packard Company, all rights reserved.
-#
-################################################################################
-
-#hidden_edges 1
-
-save_doc_words 1
-doc_dict_enable 1
-ClassPrunerThreshold 229
-ClassPrunerMultiplier 15
-IntThetaFudge 128
-CPCutoffStrength 0.15
-EvidenceTableBits 9
-IntEvidenceTruncBits 14
-SEExponentialMultiplier 0
-SimilarityCenter 0.0075
-#################################################
-# Adaptive Matcher Using 2 Passes
-#################################################
-
-display_splits 0
-display_all_words 0
-display_all_blobs 0
-display_segmentations 0
-EnableLearning 1
-SaveAdaptedTemplates 0
-UsePreAdaptedTemplates 0
-
-#save_errors 0
-

tessdata/tessconfigs/msdemo  (new file, 13 lines)
@@ -0,0 +1,13 @@
+#################################################
+# Adaptive Matcher Using PreAdapted Templates
+#################################################
+
+EnableAdaptiveDebugger 1
+MatchDebugFlags 6
+MatcherDebugLevel 1
+
+display_splits 0
+display_all_words 1
+display_all_blobs 1
+display_segmentations 2
+display_ratings 1

tessdata/tessconfigs/nobatch  (new file, 2 lines)
@@ -0,0 +1,2 @@
+display_text 0
+
@@ -2,70 +2,6 @@
 # Adaptive Matcher Using PreAdapted Templates
 #################################################
-
-acts_fx 0x800
-acts_ocr 0x20
-
-RatingScale 30.0
-CertaintyScale 20.0
-
-#EnableMatcher 0
-#CurrentFx 2
-EnableAdaptiveMatcher 1
-
-NormAdjMidpoint 32.0
-NormAdjCurl 2.0
-
-MinNormScaleX 0.0
-MaxNormScaleX 0.325
-MinNormScaleY 0.0
-MaxNormScaleY 0.325
-
-BuiltInTemplatesFile tessdata/inttemp
-BuiltInCutoffsFile tessdata/pffmtable
-
-EnableLearning 0
-SaveAdaptedTemplates 0
-UsePreAdaptedTemplates 0
-ReliableConfigThreshold 2
-MinNumPermClasses 3
-
-#EnableStopper 1
-GoodAdaptiveMatch 0.125
-GreatAdaptiveMatch 0.0
-
-EnableIntFX 1
-EnableNewAdaptRules 1
-################################################################################
-#
-# File: marks/configs/knobs
-# Description: Control variables for 'marks' code
-# Author: Mark Seaman, OCR Technology
-# Created: Wed Feb 27 11:27:27 1991
-# Modified: Tue Jul 30 16:25:37 1991 (Mark Seaman) marks@hpgrlt
-# Language: Text
-# Package: N/A
-# Status: Experimental (Do Not Distribute)
-#
-# (c) Copyright 1991, Hewlett-Packard Company, all rights reserved.
-#
-################################################################################
-
-#hidden_edges 1
-
-save_doc_words 1
-doc_dict_enable 1
-ClassPrunerThreshold 229
-ClassPrunerMultiplier 15
-IntThetaFudge 128
-CPCutoffStrength 0.15
-EvidenceTableBits 9
-IntEvidenceTruncBits 14
-SEExponentialMultiplier 0
-SimilarityCenter 0.0075
-#################################################
-# Adaptive Matcher Using 2 Passes
-#################################################
-
 display_splits 0
 display_all_words 1
 display_all_blobs 1
185
testing/Makefile
Normal file
185
testing/Makefile
Normal file
@ -0,0 +1,185 @@
|
|||||||
|
# Makefile.in generated automatically by automake 1.4-p6 from Makefile.am
|
||||||
|
|
||||||
|
# Copyright (C) 1994, 1995-8, 1999, 2001 Free Software Foundation, Inc.
|
||||||
|
# This Makefile.in is free software; the Free Software Foundation
|
||||||
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
|
# with or without modifications, as long as this notice is preserved.
|
||||||
|
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
|
||||||
|
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||||
|
# PARTICULAR PURPOSE.
|
||||||
|
|
||||||
|
|
||||||
|
SHELL = /bin/sh
|
||||||
|
|
||||||
|
srcdir = .
|
||||||
|
top_srcdir = ..
|
||||||
|
|
||||||
|
prefix = /usr/local
|
||||||
|
exec_prefix = ${prefix}
|
||||||
|
|
||||||
|
bindir = ${exec_prefix}/bin
|
||||||
|
sbindir = ${exec_prefix}/sbin
|
||||||
|
libexecdir = ${exec_prefix}/libexec
|
||||||
|
datadir = ${prefix}/share
|
||||||
|
sysconfdir = ${prefix}/etc
|
||||||
|
sharedstatedir = ${prefix}/com
|
||||||
|
localstatedir = ${prefix}/var
|
||||||
|
libdir = ${exec_prefix}/lib
|
||||||
|
infodir = ${prefix}/info
|
||||||
|
mandir = ${prefix}/man
|
||||||
|
includedir = ${prefix}/include/tesseract
|
||||||
|
oldincludedir = /usr/include
|
||||||
|
|
||||||
|
DESTDIR =
|
||||||
|
|
||||||
|
pkgdatadir = $(datadir)/
|
||||||
|
pkglibdir = $(libdir)/
|
||||||
|
pkgincludedir = $(includedir)/
|
||||||
|
|
||||||
|
top_builddir = ..
|
||||||
|
|
||||||
|
ACLOCAL = aclocal-1.4
|
||||||
|
AUTOCONF = autoconf
|
||||||
|
AUTOMAKE = automake-1.4
|
||||||
|
AUTOHEADER = autoheader
|
||||||
|
|
||||||
|
INSTALL = /usr/bin/install -c
|
||||||
|
INSTALL_PROGRAM = ${INSTALL} $(AM_INSTALL_PROGRAM_FLAGS)
|
||||||
|
INSTALL_DATA = ${INSTALL} -m 644
|
||||||
|
INSTALL_SCRIPT = ${INSTALL}
|
||||||
|
transform = s,x,x,
|
||||||
|
|
||||||
|
NORMAL_INSTALL = :
|
||||||
|
PRE_INSTALL = :
|
||||||
|
POST_INSTALL = :
|
||||||
|
NORMAL_UNINSTALL = :
|
||||||
|
PRE_UNINSTALL = :
|
||||||
|
POST_UNINSTALL = :
|
||||||
|
host_alias =
|
||||||
|
host_triplet = x86_64-unknown-linux-gnu
|
||||||
|
CC = gcc
|
||||||
|
CXX = g++
|
||||||
|
HAVE_LIB = @HAVE_LIB@
|
||||||
|
LIB = @LIB@
|
||||||
|
LTLIB = @LTLIB@
|
||||||
|
MAINT = #
|
||||||
|
MAKEINFO = /home/rays/src/opensrc/tesseract-ocr/config/missing makeinfo
|
||||||
|
PACKAGE =
|
||||||
|
PACKAGE_DATE = 07/2007
|
||||||
|
PACKAGE_NAME = tesseract
|
||||||
|
PACKAGE_VERSION = 2.00
|
||||||
|
PACKAGE_YEAR = 2007
|
||||||
|
RANLIB = ranlib
|
||||||
|
VERSION =
|
||||||
|
|
||||||
|
EXTRA_DIST = README counttestset.sh reorgdata.sh runalltests.sh runtestset.sh reports/1995.bus.3B.sum reports/1995.doe3.3B.sum reports/1995.mag.3B.sum reports/1995.news.3B.sum
|
||||||
|
mkinstalldirs = $(SHELL) $(top_srcdir)/config/mkinstalldirs
|
||||||
|
CONFIG_HEADER = ../config_auto.h
|
||||||
|
CONFIG_CLEAN_FILES =
|
||||||
|
DIST_COMMON = README Makefile.am Makefile.in
|
||||||
|
|
||||||
|
|
||||||
|
DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST)
|
||||||
|
|
||||||
|
TAR = tar
|
||||||
|
GZIP_ENV = --best
|
||||||
|
all: all-redirect
|
||||||
|
.SUFFIXES:
|
||||||
|
$(srcdir)/Makefile.in: # Makefile.am $(top_srcdir)/configure.ac $(ACLOCAL_M4)
|
||||||
|
cd $(top_srcdir) && $(AUTOMAKE) --gnu testing/Makefile
|
||||||
|
|
||||||
|
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status $(BUILT_SOURCES)
|
||||||
|
cd $(top_builddir) \
|
||||||
|
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
|
||||||
|
|
||||||
|
tags: TAGS
|
||||||
|
TAGS:
|
||||||
|
|
||||||
|
|
||||||
|
distdir = $(top_builddir)/$(PACKAGE_NAME)-$(PACKAGE_VERSION)/$(subdir)
|
||||||
|
|
||||||
|
subdir = testing
|
||||||
|
|
||||||
|
distdir: $(DISTFILES)
|
||||||
|
here=`cd $(top_builddir) && pwd`; \
|
||||||
|
top_distdir=`cd $(top_distdir) && pwd`; \
|
||||||
|
distdir=`cd $(distdir) && pwd`; \
|
||||||
|
cd $(top_srcdir) \
|
||||||
|
&& $(AUTOMAKE) --include-deps --build-dir=$$here --srcdir-name=$(top_srcdir) --output-dir=$$top_distdir --gnu testing/Makefile
|
||||||
|
$(mkinstalldirs) $(distdir)/reports
|
||||||
|
@for file in $(DISTFILES); do \
|
||||||
|
d=$(srcdir); \
|
||||||
|
if test -d $$d/$$file; then \
|
||||||
|
cp -pr $$d/$$file $(distdir)/$$file; \
|
||||||
|
else \
|
||||||
|
test -f $(distdir)/$$file \
|
||||||
|
|| ln $$d/$$file $(distdir)/$$file 2> /dev/null \
|
||||||
|
|| cp -p $$d/$$file $(distdir)/$$file || :; \
|
||||||
|
fi; \
|
||||||
|
done
|
||||||
|
info-am:
|
||||||
|
info: info-am
|
||||||
|
dvi-am:
|
||||||
|
dvi: dvi-am
|
||||||
|
check-am: all-am
|
||||||
|
check: check-am
|
||||||
|
installcheck-am:
|
||||||
|
installcheck: installcheck-am
|
||||||
|
install-exec-am:
|
||||||
|
install-exec: install-exec-am
|
||||||
|
|
||||||
|
install-data-am:
|
||||||
|
install-data: install-data-am
|
||||||
|
|
||||||
|
install-am: all-am
|
||||||
|
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
|
||||||
|
install: install-am
|
||||||
|
uninstall-am:
|
||||||
|
uninstall: uninstall-am
|
||||||
|
all-am: Makefile
|
||||||
|
all-redirect: all-am
|
||||||
|
install-strip:
|
||||||
|
$(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install
|
||||||
|
installdirs:
|
||||||
|
|
||||||
|
|
||||||
|
mostlyclean-generic:
|
||||||
|
|
||||||
|
clean-generic:
|
||||||
|
|
||||||
|
distclean-generic:
|
||||||
|
-rm -f Makefile $(CONFIG_CLEAN_FILES)
|
||||||
|
-rm -f config.cache config.log stamp-h stamp-h[0-9]*
|
||||||
|
|
||||||
|
maintainer-clean-generic:
|
||||||
|
mostlyclean-am: mostlyclean-generic
|
||||||
|
|
||||||
|
mostlyclean: mostlyclean-am
|
||||||
|
|
||||||
|
clean-am: clean-generic mostlyclean-am
|
||||||
|
|
||||||
|
clean: clean-am
|
||||||
|
|
||||||
|
distclean-am: distclean-generic clean-am
|
||||||
|
|
||||||
|
distclean: distclean-am
|
||||||
|
|
||||||
|
maintainer-clean-am: maintainer-clean-generic distclean-am
|
||||||
|
@echo "This command is intended for maintainers to use;"
|
||||||
|
@echo "it deletes files that may require special tools to rebuild."
|
||||||
|
|
||||||
|
maintainer-clean: maintainer-clean-am
|
||||||
|
|
||||||
|
.PHONY: tags distdir info-am info dvi-am dvi check check-am \
|
||||||
|
installcheck-am installcheck install-exec-am install-exec \
|
||||||
|
install-data-am install-data install-am install uninstall-am uninstall \
|
||||||
|
all-redirect all-am all installdirs mostlyclean-generic \
|
||||||
|
distclean-generic clean-generic maintainer-clean-generic clean \
|
||||||
|
mostlyclean distclean maintainer-clean
|
||||||
|
|
||||||
|
|
||||||
|
# Tell versions [3.59,3.63) of GNU make to not export all variables.
|
||||||
|
# Otherwise a system limit (for SysV at least) may be exceeded.
|
||||||
|
.NOEXPORT:
|
2
testing/Makefile.am
Normal file
2
testing/Makefile.am
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
|
||||||
|
EXTRA_DIST = README counttestset.sh reorgdata.sh runalltests.sh runtestset.sh reports/1995.bus.3B.sum reports/1995.doe3.3B.sum reports/1995.mag.3B.sum reports/1995.news.3B.sum
|
185
testing/Makefile.in
Normal file
185
testing/Makefile.in
Normal file
@ -0,0 +1,185 @@
|
|||||||
|
# Makefile.in generated automatically by automake 1.4-p6 from Makefile.am
|
||||||
|
|
||||||
|
# Copyright (C) 1994, 1995-8, 1999, 2001 Free Software Foundation, Inc.
|
||||||
|
# This Makefile.in is free software; the Free Software Foundation
|
||||||
|
# gives unlimited permission to copy and/or distribute it,
|
||||||
|
# with or without modifications, as long as this notice is preserved.
|
||||||
|
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
|
||||||
|
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||||
|
# PARTICULAR PURPOSE.
|
||||||
|
|
||||||
|
|
||||||
|
SHELL = @SHELL@
|
||||||
|
|
||||||
|
srcdir = @srcdir@
|
||||||
|
top_srcdir = @top_srcdir@
|
||||||
|
VPATH = @srcdir@
|
||||||
|
prefix = @prefix@
|
||||||
|
exec_prefix = @exec_prefix@
|
||||||
|
|
||||||
|
bindir = @bindir@
|
||||||
|
sbindir = @sbindir@
|
||||||
|
libexecdir = @libexecdir@
|
||||||
|
datadir = @datadir@
|
||||||
|
sysconfdir = @sysconfdir@
|
||||||
|
sharedstatedir = @sharedstatedir@
|
||||||
|
localstatedir = @localstatedir@
|
||||||
|
libdir = @libdir@
|
||||||
|
infodir = @infodir@
|
||||||
|
mandir = @mandir@
|
||||||
|
includedir = @includedir@
|
||||||
|
oldincludedir = /usr/include
|
||||||
|
|
||||||
|
DESTDIR =
|
||||||
|
|
||||||
|
pkgdatadir = $(datadir)/@PACKAGE@
|
||||||
|
pkglibdir = $(libdir)/@PACKAGE@
|
||||||
|
pkgincludedir = $(includedir)/@PACKAGE@
|
||||||
|
|
||||||
|
top_builddir = ..
|
||||||
|
|
||||||
|
ACLOCAL = @ACLOCAL@
|
||||||
|
AUTOCONF = @AUTOCONF@
|
||||||
|
AUTOMAKE = @AUTOMAKE@
|
||||||
|
AUTOHEADER = @AUTOHEADER@
|
||||||
|
|
||||||
|
INSTALL = @INSTALL@
|
||||||
|
INSTALL_PROGRAM = @INSTALL_PROGRAM@ $(AM_INSTALL_PROGRAM_FLAGS)
|
||||||
|
INSTALL_DATA = @INSTALL_DATA@
|
||||||
|
INSTALL_SCRIPT = @INSTALL_SCRIPT@
|
||||||
|
transform = @program_transform_name@
|
||||||
|
|
||||||
|
NORMAL_INSTALL = :
|
||||||
|
PRE_INSTALL = :
|
||||||
|
POST_INSTALL = :
|
||||||
|
NORMAL_UNINSTALL = :
|
||||||
|
PRE_UNINSTALL = :
|
||||||
|
POST_UNINSTALL = :
|
||||||
|
host_alias = @host_alias@
|
||||||
|
host_triplet = @host@
|
||||||
|
CC = @CC@
|
||||||
|
CXX = @CXX@
|
||||||
|
HAVE_LIB = @HAVE_LIB@
|
||||||
|
LIB = @LIB@
|
||||||
|
LTLIB = @LTLIB@
|
||||||
|
MAINT = @MAINT@
|
||||||
|
MAKEINFO = @MAKEINFO@
|
||||||
|
PACKAGE = @PACKAGE@
|
||||||
|
PACKAGE_DATE = @PACKAGE_DATE@
|
||||||
|
PACKAGE_NAME = @PACKAGE_NAME@
|
||||||
|
PACKAGE_VERSION = @PACKAGE_VERSION@
|
||||||
|
PACKAGE_YEAR = @PACKAGE_YEAR@
|
||||||
|
RANLIB = @RANLIB@
|
||||||
|
VERSION = @VERSION@
|
||||||
|
|
||||||
|
EXTRA_DIST = README counttestset.sh reorgdata.sh runalltests.sh runtestset.sh reports/1995.bus.3B.sum reports/1995.doe3.3B.sum reports/1995.mag.3B.sum reports/1995.news.3B.sum
|
||||||
|
mkinstalldirs = $(SHELL) $(top_srcdir)/config/mkinstalldirs
|
||||||
|
CONFIG_HEADER = ../config_auto.h
|
||||||
|
CONFIG_CLEAN_FILES =
|
||||||
|
DIST_COMMON = README Makefile.am Makefile.in
|
||||||
|
|
||||||
|
|
||||||
|
DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST)
|
||||||
|
|
||||||
|
TAR = tar
|
||||||
|
GZIP_ENV = --best
|
||||||
|
all: all-redirect
|
||||||
|
.SUFFIXES:
|
||||||
|
$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/configure.ac $(ACLOCAL_M4)
|
||||||
|
cd $(top_srcdir) && $(AUTOMAKE) --gnu testing/Makefile
|
||||||
|
|
||||||
|
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status $(BUILT_SOURCES)
|
||||||
|
cd $(top_builddir) \
|
||||||
|
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
|
||||||
|
|
||||||
|
tags: TAGS
|
||||||
|
TAGS:
|
||||||
|
|
||||||
|
|
||||||
|
distdir = $(top_builddir)/$(PACKAGE_NAME)-$(PACKAGE_VERSION)/$(subdir)
|
||||||
|
|
||||||
|
subdir = testing
|
||||||
|
|
||||||
|
distdir: $(DISTFILES)
|
||||||
|
here=`cd $(top_builddir) && pwd`; \
|
||||||
|
top_distdir=`cd $(top_distdir) && pwd`; \
|
||||||
|
distdir=`cd $(distdir) && pwd`; \
|
||||||
|
cd $(top_srcdir) \
|
||||||
|
&& $(AUTOMAKE) --include-deps --build-dir=$$here --srcdir-name=$(top_srcdir) --output-dir=$$top_distdir --gnu testing/Makefile
|
||||||
|
$(mkinstalldirs) $(distdir)/reports
|
||||||
|
@for file in $(DISTFILES); do \
|
||||||
|
d=$(srcdir); \
|
||||||
|
if test -d $$d/$$file; then \
|
||||||
|
cp -pr $$d/$$file $(distdir)/$$file; \
|
||||||
|
else \
|
||||||
|
test -f $(distdir)/$$file \
|
||||||
|
|| ln $$d/$$file $(distdir)/$$file 2> /dev/null \
|
||||||
|
|| cp -p $$d/$$file $(distdir)/$$file || :; \
|
||||||
|
fi; \
|
||||||
|
done
|
||||||
|
info-am:
|
||||||
|
info: info-am
|
||||||
|
dvi-am:
|
||||||
|
dvi: dvi-am
|
||||||
|
check-am: all-am
|
||||||
|
check: check-am
|
||||||
|
installcheck-am:
|
||||||
|
installcheck: installcheck-am
|
||||||
|
install-exec-am:
|
||||||
|
install-exec: install-exec-am
|
||||||
|
|
||||||
|
install-data-am:
|
||||||
|
install-data: install-data-am
|
||||||
|
|
||||||
|
install-am: all-am
|
||||||
|
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
|
||||||
|
install: install-am
|
||||||
|
uninstall-am:
|
||||||
|
uninstall: uninstall-am
|
||||||
|
all-am: Makefile
|
||||||
|
all-redirect: all-am
|
||||||
|
install-strip:
|
||||||
|
$(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install
|
||||||
|
installdirs:
|
||||||
|
|
||||||
|
|
||||||
|
mostlyclean-generic:
|
||||||
|
|
||||||
|
clean-generic:
|
||||||
|
|
||||||
|
distclean-generic:
|
||||||
|
-rm -f Makefile $(CONFIG_CLEAN_FILES)
|
||||||
|
-rm -f config.cache config.log stamp-h stamp-h[0-9]*
|
||||||
|
|
||||||
|
maintainer-clean-generic:
|
||||||
|
mostlyclean-am: mostlyclean-generic
|
||||||
|
|
||||||
|
mostlyclean: mostlyclean-am
|
||||||
|
|
||||||
|
clean-am: clean-generic mostlyclean-am
|
||||||
|
|
||||||
|
clean: clean-am
|
||||||
|
|
||||||
|
distclean-am: distclean-generic clean-am
|
||||||
|
|
||||||
|
distclean: distclean-am
|
||||||
|
|
||||||
|
maintainer-clean-am: maintainer-clean-generic distclean-am
|
||||||
|
@echo "This command is intended for maintainers to use;"
|
||||||
|
@echo "it deletes files that may require special tools to rebuild."
|
||||||
|
|
||||||
|
maintainer-clean: maintainer-clean-am
|
||||||
|
|
||||||
|
.PHONY: tags distdir info-am info dvi-am dvi check check-am \
|
||||||
|
installcheck-am installcheck install-exec-am install-exec \
|
||||||
|
install-data-am install-data install-am install uninstall-am uninstall \
|
||||||
|
all-redirect all-am all installdirs mostlyclean-generic \
|
||||||
|
distclean-generic clean-generic maintainer-clean-generic clean \
|
||||||
|
mostlyclean distclean maintainer-clean
|
||||||
|
|
||||||
|
|
||||||
|
# Tell versions [3.59,3.63) of GNU make to not export all variables.
|
||||||
|
# Otherwise a system limit (for SysV at least) may be exceeded.
|
||||||
|
.NOEXPORT:
|
testing/README  (new file, 43 lines)
@@ -0,0 +1,43 @@
+How to run UNLV tests.
+
+The scripts in this directory make it possible to duplicate the tests
+published in the Fourth Annual Test of OCR Accuracy.
+See http://www.isri.unlv.edu/downloads/AT-1995.pdf
+but first you have to get the tools and data from UNLV:
+
+Step 1: to download the images goto
+http://www.isri.unlv.edu/ISRI/OCRtk
+and get 3b.tgz, Bb.tgz, Mb.tgz and Nb.tgz.
+
+Step 2: extract the files. It doesn't really matter where
+in your filesystem you put them, but they must go under a common
+root so you have directories 3, B, M and N in, for example,
+/users/me/ISRI-OCRtk.
+
+Step 3: Reorg the files
+The lack of tif extensions on the images is inconvenient, so there
+is a script to reorganize the data to match the rest of the test
+scripts.
+cd to /users/me/ISRI-OCRtk or wherever 3, B, M and N ended up and run
+/blah/blah/tesseract-ocr/testing/reorgdata.sh 3B
+This makes directories doe3.3B, bus.3B, mag.3B and news.3B.
+You can now get rid of 3, B, M, and N unless you want to get some of the
+other scanning resolutions out of them.
+
+Step 4: Download the ISRI toolkit from:
+http://www.isri.unlv.edu/downloads/ftk-1.0.tgz
+
+Step 5: If they work for you, use the binaries directly from the bin
+directory and put them in tesseract-ocr/testing/unlv
+otherwise build the tools for yourself and put them there.
+
+Step 6: cd back to your main tesseract-ocr dir and Build tesseract.
+
+Step 7: run testing/runalltests.sh with the root data dir and testname:
+testing/runalltests.sh /users/me/ISRI-OCRtk tess2.0
+and go to the gym, have lunch etc.
+
+Step 8: There should be a file
+testing/reports/tess2.0.summary that contains the final summarized accuracy
+report and comparison with the 1995 results.
+
testing/counttestset.sh  (new executable file, 61 lines)
@@ -0,0 +1,61 @@
+#!/bin/bash
+# File:        counttestset.sh
+# Description: Script to count the errors on a single UNLV set.
+# Author:      Ray Smith
+# Created:     Wed Jun 13 11:58:01 PDT 2007
+#
+# (C) Copyright 2007, Google Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ $# -ne 1 ]
+then
+  echo "Usage:$0 pagesfile"
+  exit 1
+fi
+if [ ! -d ccmain ]
+then
+  echo "Run $0 from the tesseract-ocr root directory!"
+  exit 1
+fi
+if [ ! -r testing/unlv/accuracy ]
+then
+  echo "Please download the UNLV accuracy tools (and build) to testing/unlv"
+  exit 1
+fi
+pages=$1
+
+imdir=${pages%/pages}
+setname=${imdir##*/}
+resdir=testing/results/$setname
+mkdir -p testing/reports
+echo "Counting on set $setname in directory $imdir to $resdir"
+accfiles=""
+wafiles=""
+while read page dir
+do
+  if [ "$dir" ]
+  then
+    srcdir="$imdir/$dir"
+  else
+    srcdir="$imdir"
+  fi
+#  echo "$srcdir/$page.tif"
+  # Count character errors.
+  testing/unlv/accuracy $srcdir/$page.txt $resdir/$page.txt $resdir/$page.acc
+  accfiles="$accfiles $resdir/$page.acc"
+  # Count word errors.
+  testing/unlv/wordacc $srcdir/$page.txt $resdir/$page.txt $resdir/$page.wa
+  wafiles="$wafiles $resdir/$page.wa"
+done <$pages
+testing/unlv/accsum $accfiles >testing/reports/$setname.characc
+testing/unlv/wordaccsum $wafiles >testing/reports/$setname.wordacc
+
+
44
testing/reorgdata.sh
Executable file
44
testing/reorgdata.sh
Executable file
@ -0,0 +1,44 @@
#!/bin/bash

if [ $# -ne 1 ]
then
  echo "Usage:$0 scantype"
  echo "UNLV data comes in several scan types:"
  echo "3B=300 dpi binary"
  echo "3A=adaptive thresholded 300 dpi"
  echo "3G=300 dpi grey"
  echo "4B=400dpi binary"
  echo "2B=200dpi binary"
  echo "For now we only use 3B"
  exit 1
fi
ext=$1

# There are several test sets without meaningful names, so rename
# them with something a bit more meaningful.
# Each s is oldname/newname.
for s in 3/doe3 B/bus M/mag N/news L/legal R/rep S/spn Z/zset
do
  old=${s%/*}
  # If this set was downloaded then process it.
  if [ -r "$old/PAGES" ]
  then
    new=${s#*/}.$ext
    mkdir -p $new
    echo "Set $old -> $new"
    # The pages file had - instead of _ so fix it and add the extension.
    for page in `cat $old/PAGES`
    do
      echo "${page%-*}_${page#*-}.$ext"
    done >$new/pages
    for f in `cat $new/pages`
    do
      # Put a tif extension on the tif files.
      cp $old/${old}_B/$f $new/$f.tif
      # Put a uzn extension on the zone files.
      cp $old/${old}_B/${f}Z $new/$f.uzn
      # Cat all the truth files together and put them into a single txt file.
      cat $old/${old}_GT/${f%.$ext}.Z* >$new/$f.txt
    done
  fi
done
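
For example, run from the directory that holds the downloaded UNLV sets
(illustrative paths):

  cd /users/me/ISRI-OCRtk
  /path/to/tesseract-ocr/testing/reorgdata.sh 3B

This turns each downloaded set (3, B, M, N, ...) into a directory such as
bus.3B containing a pages list plus per-page .tif, .uzn and .txt files.
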
testing/reports/1995.bus.3B.sum (new file, 1 line)
@ -0,0 +1 @@
1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00%

testing/reports/1995.doe3.3B.sum (new file, 1 line)
@ -0,0 +1 @@
1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00%

testing/reports/1995.mag.3B.sum (new file, 1 line)
@ -0,0 +1 @@
1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00%

testing/reports/1995.news.3B.sum (new file, 1 line)
@ -0,0 +1 @@
1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00%
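
These baseline lines use the same column layout as the new per-set summaries.
runalltests.sh reads them back with cut, whose default field delimiter is the
tab character, so the stored files are assumed to be tab-separated:

  cut -f3 testing/reports/1995.bus.3B.sum   # 1995 character errors (5959)
  cut -f6 testing/reports/1995.bus.3B.sum   # 1995 word errors (1631)
  cut -f9 testing/reports/1995.bus.3B.sum   # 1995 non-stopword errors (1293)
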
testing/runalltests.sh (new executable file, 110 lines)
@ -0,0 +1,110 @@
#!/bin/bash
# File:        runalltests.sh
# Description: Script to run a set of UNLV test sets.
# Author:      Ray Smith
# Created:     Thu Jun 14 08:21:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

if [ $# -ne 2 ]
then
  echo "Usage:$0 unlv-data-dir version-id"
  exit 1
fi
if [ ! -d ccmain ]
then
  echo "Run $0 from the tesseract-ocr root directory!"
  exit 1
fi
if [ ! -r ccmain/tesseract -a ! -r tesseract.exe ]
then
  echo "Please build tesseract before running $0"
  exit 1
fi
if [ ! -r testing/unlv/accuracy -a ! -r testing/unlv/accuracy.exe ]
then
  echo "Please download the UNLV accuracy tools (and build) to testing/unlv"
  exit 1
fi

# deltapc new old calculates the % change from old to new.
deltapc() {
  awk ' BEGIN {
    printf("%.2f", 100.0*('$1'-'$2')/'$2');
  }'
}

imdir="$1"
vid="$2"
bindir=${0%/*}
if [ "$bindir" = "$0" ]
then
  bindir="./"
fi
rdir=testing/reports
testsets="bus.3B doe3.3B mag.3B news.3B"

totalerrs=0
totalwerrs=0
totalnswerrs=0
totalolderrs=0
totaloldwerrs=0
totaloldnswerrs=0
for set in $testsets
do
  if [ -r $imdir/$set/pages ]
  then
    # Run tesseract on all the pages.
    $bindir/runtestset.sh $imdir/$set/pages
    # Count the errors on all the pages.
    $bindir/counttestset.sh $imdir/$set/pages
    # Get the old character, word and nonstop word errors.
    olderrs=`cat testing/reports/1995.$set.sum | cut -f3`
    oldwerrs=`cat testing/reports/1995.$set.sum | cut -f6`
    oldnswerrs=`cat testing/reports/1995.$set.sum | cut -f9`
    # Get the new character, word and nonstop word errors and accuracy.
    cherrs=`head -4 testing/reports/$set.characc |tail -1 |cut -c1-9 |
      tr -d '[:blank:]'`
    chacc=`head -5 testing/reports/$set.characc |tail -1 |cut -c1-9 |
      tr -d '[:blank:]'`
    wderrs=`head -4 testing/reports/$set.wordacc |tail -1 |cut -c1-9 |
      tr -d '[:blank:]'`
    wdacc=`head -5 testing/reports/$set.wordacc |tail -1 |cut -c1-9 |
      tr -d '[:blank:]'`
    nswderrs=`grep Total testing/reports/$set.wordacc |head -2 |tail -1 |
      cut -c10-17 |tr -d '[:blank:]'`
    nswdacc=`grep Total testing/reports/$set.wordacc |head -2 |tail -1 |
      cut -c19-26 |tr -d '[:blank:]'`
    # Compute the percent change.
    chdelta=`deltapc $cherrs $olderrs`
    wdelta=`deltapc $wderrs $oldwerrs`
    nswdelta=`deltapc $nswderrs $oldnswerrs`
    sumfile=$rdir/$vid.$set.sum
    echo "$vid $set $cherrs $chacc $chdelta% $wderrs $wdacc\
 $wdelta% $nswderrs $nswdacc $nswdelta%" >$sumfile
    # Sum totals over all the testsets.
    let totalerrs=totalerrs+cherrs
    let totalwerrs=totalwerrs+wderrs
    let totalnswerrs=totalnswerrs+nswderrs
    let totalolderrs=totalolderrs+olderrs
    let totaloldwerrs=totaloldwerrs+oldwerrs
    let totaloldnswerrs=totaloldnswerrs+oldnswerrs
  fi
done
# Compute the grand total percent change.
chdelta=`deltapc $totalerrs $totalolderrs`
wdelta=`deltapc $totalwerrs $totaloldwerrs`
nswdelta=`deltapc $totalnswerrs $totaloldnswerrs`
tfile=$rdir/$vid.total.sum
echo "$vid Total $totalerrs - $chdelta% $totalwerrs\
 - $wdelta% $totalnswerrs - $nswdelta%" >$tfile
cat $rdir/1995.*.sum $rdir/$vid.*.sum >$rdir/$vid.summary
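
A note on the arithmetic: deltapc prints 100*(new-old)/old, so a call such as
(illustrative numbers)

  deltapc 97 100    # prints -3.00

means 3% fewer errors than the 1995 baseline; negative deltas are
improvements.
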
testing/runtestset.sh (new executable file, 61 lines)
@ -0,0 +1,61 @@
#!/bin/bash
# File:        runtestset.sh
# Description: Script to run tesseract on a single UNLV set.
# Author:      Ray Smith
# Created:     Wed Jun 13 10:13:01 PDT 2007
#
# (C) Copyright 2007, Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

if [ $# -ne 1 ]
then
  echo "Usage:$0 pagesfile"
  exit 1
fi
if [ ! -d ccmain ]
then
  echo "Run $0 from the tesseract-ocr root directory!"
  exit 1
fi
if [ ! -r ccmain/tesseract ]
then
  if [ ! -r tesseract.exe ]
  then
    echo "Please build tesseract before running $0"
    exit 1
  else
    tess="./tesseract.exe"
  fi
else
  tess="ccmain/tesseract"
  export TESSDATA_PREFIX=$PWD/
fi

pages=$1

imdir=${pages%/pages}
setname=${imdir##*/}
resdir=testing/results/$setname
echo "Testing on set $setname in directory $imdir to $resdir"
mkdir -p $resdir
while read page dir
do
  # A pages file may be a list of files with subdirs, or just a plain list
  # of files, so accommodate both.
  if [ "$dir" ]
  then
    srcdir="$imdir/$dir"
  else
    srcdir="$imdir"
  fi
# echo "$srcdir/$page.tif"
  $tess $srcdir/$page.tif $resdir/$page nobatch unlv
done <$pages
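
For example (illustrative path):

  testing/runtestset.sh /users/me/ISRI-OCRtk/bus.3B/pages

This runs tesseract with the unlv config on every page in the set and leaves
the per-page output in testing/results/bus.3B/, where counttestset.sh expects
to find the .txt files it compares against the ground truth.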