From 627368df42e0318ba0e6a716f830a92cb49998d0 Mon Sep 17 00:00:00 2001 From: theraysmith Date: Wed, 18 Jul 2007 01:11:18 +0000 Subject: [PATCH] API/output changes to produce unlv-style latin-1 output and test scripts git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@86 d0cd1f9f-072b-0410-8dd7-cf729c803f20 --- ccmain/applybox.cpp | 290 ++++++++++++++++------------- ccmain/baseapi.cpp | 293 ++++++++++++++++++++++++++++-- ccmain/baseapi.h | 29 ++- ccmain/output.cpp | 224 +++++++++++++++-------- ccmain/tesseractmain.cpp | 92 ++++++++-- ccstruct/blread.cpp | 4 +- cutil/tordvars.cpp | 2 +- tessdata/configs/makebox | 1 + tessdata/configs/unlv | 3 + tessdata/tessconfigs/batch | 78 +------- tessdata/tessconfigs/batch.nochop | 2 + tessdata/tessconfigs/matdemo | 74 -------- tessdata/tessconfigs/msdemo | 13 ++ tessdata/tessconfigs/nobatch | 2 + tessdata/tessconfigs/segdemo | 64 ------- testing/Makefile | 185 +++++++++++++++++++ testing/Makefile.am | 2 + testing/Makefile.in | 185 +++++++++++++++++++ testing/README | 43 +++++ testing/counttestset.sh | 61 +++++++ testing/reorgdata.sh | 44 +++++ testing/reports/1995.bus.3B.sum | 1 + testing/reports/1995.doe3.3B.sum | 1 + testing/reports/1995.mag.3B.sum | 1 + testing/reports/1995.news.3B.sum | 1 + testing/runalltests.sh | 110 +++++++++++ testing/runtestset.sh | 61 +++++++ 27 files changed, 1424 insertions(+), 442 deletions(-) create mode 100644 tessdata/configs/makebox create mode 100644 tessdata/configs/unlv create mode 100644 tessdata/tessconfigs/batch.nochop create mode 100644 tessdata/tessconfigs/msdemo create mode 100644 tessdata/tessconfigs/nobatch create mode 100644 testing/Makefile create mode 100644 testing/Makefile.am create mode 100644 testing/Makefile.in create mode 100644 testing/README create mode 100755 testing/counttestset.sh create mode 100755 testing/reorgdata.sh create mode 100644 testing/reports/1995.bus.3B.sum create mode 100644 testing/reports/1995.doe3.3B.sum create mode 100644 
testing/reports/1995.mag.3B.sum create mode 100644 testing/reports/1995.news.3B.sum create mode 100755 testing/runalltests.sh create mode 100755 testing/runtestset.sh diff --git a/ccmain/applybox.cpp b/ccmain/applybox.cpp index 41b482259..888acf4fb 100644 --- a/ccmain/applybox.cpp +++ b/ccmain/applybox.cpp @@ -24,20 +24,22 @@ what measures we are interested in. /* #define SECURE_NAMES done in secnames.h when necessary*/ #include "mfcpch.h" -#include "applybox.h" -#include -#include +#include "applybox.h" +#include +#include #ifdef __UNIX__ -#include -#include +#include +#include #endif -#include "mainblk.h" -#include "genblob.h" -#include "fixxht.h" -#include "control.h" -#include "tessbox.h" -#include "globals.h" -#include "secname.h" +#include "mainblk.h" +#include "genblob.h" +#include "fixxht.h" +#include "control.h" +#include "tessbox.h" +#include "globals.h" +#include "secname.h" +#include "unichar.h" +#include "matchdefs.h" #define SECURE_NAMES #ifndef SECURE_NAMES @@ -47,10 +49,13 @@ what measures we are interested in. #define EXTERN EXTERN BOOL_VAR (applybox_rebalance, TRUE, "Drop dead"); EXTERN INT_VAR (applybox_debug, 0, "Debug level"); -EXTERN STRING_VAR (applybox_test_exclusions, "|", +EXTERN STRING_VAR (applybox_test_exclusions, "", "Chars ignored for testing"); EXTERN double_VAR (applybox_error_band, 0.15, "Err band as fract of xht"); +// The unicharset used during box training +static UNICHARSET unicharset_boxes; + /************************************************************************* * The code re-assigns outlines to form words each with ONE labelled blob. * Noise is left in UNLABELLED words. 
The chars on the page are checked crudely @@ -89,7 +94,7 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks INT16 boxfile_lineno = 0; INT16 boxfile_charno = 0; BOX box; //boxfile box - char ch[2]; //correct ch from boxfile + UNICHAR_ID uch_id; //correct ch from boxfile ROW *row; ROW *prev_row = NULL; INT16 prev_box_right = MAX_INT16; @@ -100,15 +105,20 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks INT16 labels_ok; INT16 rows_ok; INT16 bad_blobs; - INT16 tgt_char_counts[128]; //No. of box samples + INT16 tgt_char_counts[MAX_NUM_CLASSES]; //No. of box samples // INT16 labelled_char_counts[128]; //No. of unique labelled samples INT16 i; INT16 rebalance_count = 0; - char min_char; + UNICHAR_ID min_uch_id; INT16 min_samples; INT16 final_labelled_blob_count; - for (i = 0; i < 128; i++) + // Clean the unichar set + unicharset_boxes.clear(); + // Space character needed to represent NIL classification + unicharset_boxes.unichar_insert(" "); + + for (i = 0; i < MAX_NUM_CLASSES; i++) tgt_char_counts[i] = 0; FILE* box_file; @@ -120,11 +130,10 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks filename.string(), errno); } - ch[1] = '\0'; clear_any_old_text(block_list); - while (read_next_box (box_file, &box, &ch[0])) { + while (read_next_box (box_file, &box, &uch_id)) { box_count++; - tgt_char_counts[ch[0]]++; + tgt_char_counts[uch_id]++; row = find_row_of_box (block_list, box, block_id, row_id); if (box.left () < prev_box_right) { boxfile_lineno++; @@ -135,14 +144,16 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks if (row == NULL) { box_failures++; - report_failed_box (boxfile_lineno, boxfile_charno, box, ch, + report_failed_box (boxfile_lineno, boxfile_charno, box, + unicharset_boxes.id_to_unichar(uch_id), "FAILURE! 
box overlaps no blobs or blobs in multiple rows"); } else { if ((box.left () >= prev_box_right) && (row != prev_row)) - report_failed_box (boxfile_lineno, boxfile_charno, box, ch, + report_failed_box (boxfile_lineno, boxfile_charno, box, + unicharset_boxes.id_to_unichar(uch_id), "WARNING! false row break"); - box_failures += resegment_box (row, box, ch, block_id, row_id, + box_failures += resegment_box (row, box, uch_id, block_id, row_id, boxfile_lineno, boxfile_charno); prev_row = row; } @@ -154,7 +165,7 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks bad_blobs, tgt_char_counts, rebalance_count, - min_char, + &min_uch_id, min_samples, final_labelled_blob_count); tprintf ("APPLY_BOXES:\n"); @@ -163,7 +174,8 @@ void apply_boxes(BLOCK_LIST *block_list //real blocks labels_ok, rows_ok); tprintf (" Box failures detected: %6d\n", box_failures); tprintf (" Duped blobs for rebalance:%6d\n", rebalance_count); - tprintf (" \"%c\" has fewest samples:%6d\n", min_char, min_samples); + tprintf (" \"%s\" has fewest samples:%6d\n", + unicharset_boxes.id_to_unichar(min_uch_id), min_samples); tprintf (" Total unlabelled words: %6d\n", bad_blobs); tprintf (" Final labelled words: %6d\n", @@ -194,7 +206,7 @@ void clear_any_old_text( //remove correct text BOOL8 read_next_box(FILE* box_file, // BOX *box, - char *ch) { + UNICHAR_ID *uch_id) { char buff[256]; //boxfile read buffer char *buffptr = buff; STRING box_filename; @@ -204,23 +216,38 @@ BOOL8 read_next_box(FILE* box_file, // INT32 x_max; INT32 y_max; INT32 count = 0; + char uch[256]; while (!feof (box_file)) { fgets (buff, sizeof (buff) - 1, box_file); line++; + buffptr = buff; + const unsigned char *ubuf = reinterpret_cast(buffptr); + if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf) + buffptr += 3; // Skip unicode file designation. 
/* Check for blank lines in box file */ - for (buffptr = buff; isspace (*buffptr); buffptr++) - ; + while (isspace (*buffptr)) + buffptr++; if (*buffptr != '\0') { count = - sscanf (buff, - "%c " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " " - INT32FORMAT, ch, &x_min, &y_min, &x_max, &y_max); + sscanf (buffptr, + "%s " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " " + INT32FORMAT, uch, &x_min, &y_min, &x_max, &y_max); if (count != 5) { tprintf ("Box file format error on line %i ignored\n", line); } else { + if (!unicharset_boxes.contains_unichar(uch)) + { + unicharset_boxes.unichar_insert(uch); + if (unicharset_boxes.size() > MAX_NUM_CLASSES) { + tprintf("Error: Size of unicharset of boxes is \ +greater than MAX_NUM_CLASSES\n"); + exit(1); + } + } + *uch_id = unicharset_boxes.unichar_to_id(uch); *box = BOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max)); return TRUE; //read a box ok } @@ -314,7 +341,7 @@ ROW *find_row_of_box( // INT16 resegment_box( // ROW *row, BOX box, - char *ch, + UNICHAR_ID uch_id, INT16 block_id, INT16 row_id, INT16 boxfile_lineno, @@ -358,7 +385,7 @@ INT16 resegment_box( // if (applybox_debug > 4) report_failed_box (boxfile_lineno, boxfile_charno, - box, ch, + box, unicharset_boxes.id_to_unichar(uch_id), "FAILURE! box overlaps blob in labelled word"); } if (applybox_debug > 4) @@ -375,7 +402,7 @@ INT16 resegment_box( // if (new_word == NULL) { /* Make a new word with a single blob */ new_word = word->shallow_copy (); - new_word->set_text (ch); + new_word->set_text (unicharset_boxes.id_to_unichar(uch_id)); if (polyg) new_blob = new PBLOB; else @@ -414,63 +441,75 @@ INT16 resegment_box( // word_x_centre = (new_word_box.left () + new_word_box.right ()) / 2.0f; baseline = row->base_line (word_x_centre); - if (STRING (chs_caps_ht).contains (ch[0]) && - (new_word_box.top () < - baseline + (1 + applybox_error_band) * row->x_height ())) { - report_failed_box (boxfile_lineno, boxfile_charno, box, ch, - "FAILURE! 
caps-ht char didn't ascend"); - new_word->set_text (""); - return 1; - } - if (STRING (chs_odd_top).contains (ch[0]) && - (new_word_box.top () < - baseline + (1 - applybox_error_band) * row->x_height ())) { - report_failed_box (boxfile_lineno, boxfile_charno, box, ch, - "FAILURE! Odd top char below xht"); - new_word->set_text (""); - return 1; - } - if (STRING (chs_x_ht).contains (ch[0]) && - ((new_word_box.top () > - baseline + (1 + applybox_error_band) * row->x_height ()) || - (new_word_box.top () < - baseline + (1 - applybox_error_band) * row->x_height ()))) { - report_failed_box (boxfile_lineno, boxfile_charno, box, ch, - "FAILURE! x-ht char didn't have top near xht"); - new_word->set_text (""); - return 1; - } - if (STRING (chs_non_ambig_bl).contains (ch[0]) && - ((new_word_box.bottom () < - baseline - applybox_error_band * row->x_height ()) || - (new_word_box.bottom () > - baseline + applybox_error_band * row->x_height ()))) { - report_failed_box (boxfile_lineno, boxfile_charno, box, ch, - "FAILURE! non ambig BL char didnt have bottom near baseline"); - new_word->set_text (""); - return 1; - } - if (STRING (chs_odd_bot).contains (ch[0]) && - (new_word_box.bottom () > - baseline + applybox_error_band * row->x_height ())) { - report_failed_box (boxfile_lineno, boxfile_charno, box, ch, - "FAILURE! Odd bottom char above baseline"); - new_word->set_text (""); - return 1; - } - if (STRING (chs_desc).contains (ch[0]) && - (new_word_box.bottom () > - baseline - applybox_error_band * row->x_height ())) { - report_failed_box (boxfile_lineno, boxfile_charno, box, ch, +#if 0 + if (strlen(unicharset_boxes.id_to_unichar(uch_id)) == 1) { + if (STRING (chs_caps_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) && + (new_word_box.top () < + baseline + (1 + applybox_error_band) * row->x_height ())) { + report_failed_box (boxfile_lineno, boxfile_charno, box, + unicharset_boxes.id_to_unichar(uch_id), + "FAILURE! 
caps-ht char didn't ascend"); + new_word->set_text (""); + return 1; + } + if (STRING (chs_odd_top).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) && + (new_word_box.top () < + baseline + (1 - applybox_error_band) * row->x_height ())) { + report_failed_box (boxfile_lineno, boxfile_charno, box, + unicharset_boxes.id_to_unichar(uch_id), + "FAILURE! Odd top char below xht"); + new_word->set_text (""); + return 1; + } + if (STRING (chs_x_ht).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) && + ((new_word_box.top () > + baseline + (1 + applybox_error_band) * row->x_height ()) || + (new_word_box.top () < + baseline + (1 - applybox_error_band) * row->x_height ()))) { + report_failed_box (boxfile_lineno, boxfile_charno, box, + unicharset_boxes.id_to_unichar(uch_id), + "FAILURE! x-ht char didn't have top near xht"); + new_word->set_text (""); + return 1; + } + if (STRING (chs_non_ambig_bl).contains + (unicharset_boxes.id_to_unichar(uch_id)[0]) && + ((new_word_box.bottom () < + baseline - applybox_error_band * row->x_height ()) || + (new_word_box.bottom () > + baseline + applybox_error_band * row->x_height ()))) { + report_failed_box (boxfile_lineno, boxfile_charno, box, + unicharset_boxes.id_to_unichar(uch_id), + "FAILURE! non ambig BL char didnt have bottom near baseline"); + new_word->set_text (""); + return 1; + } + if (STRING (chs_odd_bot).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) && + (new_word_box.bottom () > + baseline + applybox_error_band * row->x_height ())) { + report_failed_box (boxfile_lineno, boxfile_charno, box, + unicharset_boxes.id_to_unichar(uch_id), + "FAILURE! Odd bottom char above baseline"); + new_word->set_text (""); + return 1; + } + if (STRING (chs_desc).contains (unicharset_boxes.id_to_unichar(uch_id)[0]) && + (new_word_box.bottom () > + baseline - applybox_error_band * row->x_height ())) { + report_failed_box (boxfile_lineno, boxfile_charno, box, + unicharset_boxes.id_to_unichar(uch_id), "FAILURE! 
Descender doesn't descend"); - new_word->set_text (""); - return 1; + new_word->set_text (""); + return 1; + } } +#endif return 0; } else { - report_failed_box (boxfile_lineno, boxfile_charno, box, ch, - "FAILURE! Couldn't find any blobs"); + report_failed_box (boxfile_lineno, boxfile_charno, box, + unicharset_boxes.id_to_unichar(uch_id), + "FAILURE! Couldn't find any blobs"); return 1; } } @@ -492,7 +531,7 @@ void tidy_up( // INT16 &unlabelled_words, INT16 *tgt_char_counts, INT16 &rebalance_count, - char &min_char, + UNICHAR_ID *min_uch_id, INT16 &min_samples, INT16 &final_labelled_blob_count) { BLOCK_IT block_it(block_list); @@ -507,16 +546,16 @@ void tidy_up( // BOOL8 row_ok; BOOL8 rebalance_needed = FALSE; //No. of unique labelled samples - INT16 labelled_char_counts[128]; + INT16 labelled_char_counts[MAX_NUM_CLASSES]; INT16 i; - char ch; - char prev_ch = '\0'; + UNICHAR_ID uch_id; + UNICHAR_ID prev_uch_id = -1; BOOL8 at_dupe_of_prev_word; ROW *prev_row = NULL; INT16 left; INT16 prev_left = -1; - for (i = 0; i < 128; i++) + for (i = 0; i < MAX_NUM_CLASSES; i++) labelled_char_counts[i] = 0; ok_char_count = 0; @@ -556,7 +595,7 @@ void tidy_up( // block_idx, row_idx, all_row_idx); ok_char_count++; - labelled_char_counts[*word->text ()]++; + labelled_char_counts[unicharset_boxes.unichar_to_id(word->text ())]++; row_ok = TRUE; } } @@ -571,24 +610,24 @@ void tidy_up( // } min_samples = 9999; - for (i = 0; i < 128; i++) { + for (i = 0; i < unicharset_boxes.size(); i++) { if (tgt_char_counts[i] > labelled_char_counts[i]) { if (labelled_char_counts[i] <= 1) { tprintf - ("APPLY_BOXES: FATALITY - %d labelled samples of \"%c\" - target is %d\n", - labelled_char_counts[i], (char) i, tgt_char_counts[i]); + ("APPLY_BOXES: FATALITY - %d labelled samples of \"%s\" - target is %d\n", + labelled_char_counts[i], unicharset_boxes.id_to_unichar(i), tgt_char_counts[i]); } else { rebalance_needed = TRUE; if (applybox_debug > 0) tprintf - ("APPLY_BOXES: REBALANCE REQD \"%c\" - target 
of %d from %d labelled samples\n", - (char) i, tgt_char_counts[i], labelled_char_counts[i]); + ("APPLY_BOXES: REBALANCE REQD \"%s\" - target of %d from %d labelled samples\n", + unicharset_boxes.id_to_unichar(i), tgt_char_counts[i], labelled_char_counts[i]); } } if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) { min_samples = labelled_char_counts[i]; - min_char = (char) i; + *min_uch_id = i; } } @@ -605,33 +644,36 @@ void tidy_up( // !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); left = word->bounding_box ().left (); - ch = *word->text (); + if (*word->text () != '\0') + uch_id = unicharset_boxes.unichar_to_id(word->text ()); + else + uch_id = -1; at_dupe_of_prev_word = ((row == prev_row) && (left = prev_left) && - (ch == prev_ch)); - if ((ch != '\0') && - (labelled_char_counts[ch] > 1) && - (tgt_char_counts[ch] > labelled_char_counts[ch]) && + (uch_id == prev_uch_id)); + if ((uch_id != -1) && + (labelled_char_counts[uch_id] > 1) && + (tgt_char_counts[uch_id] > labelled_char_counts[uch_id]) && (!at_dupe_of_prev_word)) { /* Duplicate the word to rebalance the labelled samples */ if (applybox_debug > 9) { - tprintf ("Duping \"%c\" from ", ch); + tprintf ("Duping \"%s\" from ", unicharset_boxes.id_to_unichar(uch_id)); word->bounding_box ().print (); } duplicate_word = new WERD; *duplicate_word = *word; word_it.add_after_then_move (duplicate_word); rebalance_count++; - labelled_char_counts[ch]++; + labelled_char_counts[uch_id]++; } prev_row = row; prev_left = left; - prev_ch = ch; + prev_uch_id = uch_id; } } } rebalance_needed = FALSE; - for (i = 0; i < 128; i++) { + for (i = 0; i < unicharset_boxes.size(); i++) { if ((tgt_char_counts[i] > labelled_char_counts[i]) && (labelled_char_counts[i] > 1)) { rebalance_needed = TRUE; @@ -653,7 +695,7 @@ void tidy_up( // for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); - if ((strlen (word->text ()) == 1) && + if ((strlen 
(word->text ()) > 0) && (word->gblob_list ()->length () == 1)) final_labelled_blob_count++; } @@ -665,7 +707,7 @@ void tidy_up( // void report_failed_box(INT16 boxfile_lineno, INT16 boxfile_charno, BOX box, - char *box_ch, + const char *box_ch, const char *err_msg) { if (applybox_debug > 4) tprintf ("APPLY_BOXES: boxfile %1d/%1d/%s ((%1d,%1d),(%1d,%1d)): %s\n", @@ -687,10 +729,9 @@ void apply_box_training(BLOCK_LIST *block_list) { PBLOB_IT blob_it; DENORM denorm; INT16 count = 0; - char ch[2]; - - ch[1] = '\0'; + char unichar[UNICHAR_LEN + 1]; + unichar[UNICHAR_LEN] = '\0'; tprintf ("Generating training data\n"); for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { @@ -701,23 +742,22 @@ void apply_box_training(BLOCK_LIST *block_list) { for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); - if ((strlen (word->text ()) == 1) && + if ((strlen (word->text ()) > 0) && (word->gblob_list ()->length () == 1)) { - /* Here is a word with a single char label and a single blob so train on it */ + /* Here is a word with a single unichar label and a single blob so train on it */ bln_word = make_bln_copy (word, row, row->x_height (), &denorm); blob_it.set_to_list (bln_word->blob_list ()); - ch[0] = *word->text (); + strncpy(unichar, word->text (), UNICHAR_LEN); tess_training_tester (blob_it.data (), //single blob &denorm, TRUE, //correct - ch, //correct ASCII char - 1, //ASCII length + unichar, //correct character + strlen(unichar), //character length NULL); copy_outword = *(bln_word); copy_outword.baseline_denormalise (&denorm); blob_it.set_to_list (copy_outword.blob_list ()); - ch[0] = *word->text (); delete bln_word; count++; } @@ -793,7 +833,7 @@ void apply_box_testing(BLOCK_LIST *block_list) { choice list, outword blob lists and best_choice string are the same length. A TESS screw up is indicated by a blank filled or 0 length string. 
*/ - if ((best_choice->string ().length () == 0) || + if ((best_choice->lengths ().length () == 0) || (strspn (best_choice->string ().string (), " ") == best_choice->string ().length ())) { rej_count++; @@ -804,22 +844,22 @@ void apply_box_testing(BLOCK_LIST *block_list) { #endif } else { - if ((best_choice->string ().length () != + if ((best_choice->lengths ().length () != outword->blob_list ()->length ()) || - (best_choice->string ().length () != + (best_choice->lengths ().length () != blob_choices.length ())) { tprintf ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n", best_choice->string ().string (), - best_choice->string ().length (), + best_choice->lengths ().length (), outword->blob_list ()->length (), blob_choices.length ()); } - ASSERT_HOST (best_choice->string ().length () == + ASSERT_HOST (best_choice->lengths ().length () == outword->blob_list ()->length ()); - ASSERT_HOST (best_choice->string ().length () == + ASSERT_HOST (best_choice->lengths ().length () == blob_choices.length ()); - fix_quotes ((char *) best_choice->string ().string (), + fix_quotes (best_choice, //turn to double outword, &blob_choices); if (strcmp (best_choice->string ().string (), ch) != 0) { diff --git a/ccmain/baseapi.cpp b/ccmain/baseapi.cpp index 86934533a..80cb53f58 100644 --- a/ccmain/baseapi.cpp +++ b/ccmain/baseapi.cpp @@ -27,6 +27,7 @@ #include "applybox.h" #include "pgedit.h" #include "varabled.h" +#include "output.h" #include "adaptmatch.h" BOOL_VAR(tessedit_resegment_from_boxes, FALSE, @@ -37,6 +38,8 @@ BOOL_VAR(tessedit_train_from_boxes, FALSE, // Minimum sensible image size to be worth running tesseract. const int kMinRectSize = 10; +static STRING input_file = "noname.tif"; + // Start tesseract. // The datapath must be the name of the data directory or some other file // in which the data directory resides (for instance argv[0].) 
@@ -70,6 +73,12 @@ int TessBaseAPI::InitWithLanguage(const char* datapath, const char* outputbase, return result; } +// Set the name of the input file. Needed only for training and +// loading a UNLV zone file. +void TessBaseAPI::SetInputName(const char* name) { + input_file = name; +} + // Recognize a rectangle from an image and return the result as a string. // May be called many times for a single Init. // Currently has no error checking. @@ -96,6 +105,52 @@ char* TessBaseAPI::TesseractRect(const unsigned char* imagedata, return RecognizeToString(); } +// As TesseractRect but produces a box file as output. +char* TessBaseAPI::TesseractRectBoxes(const unsigned char* imagedata, + int bytes_per_pixel, + int bytes_per_line, + int left, int top, + int width, int height, + int imageheight) { + if (width < kMinRectSize || height < kMinRectSize) + return NULL; // Nothing worth doing. + + // Copy/Threshold the image to the tesseract global page_image. + CopyImageToTesseract(imagedata, bytes_per_pixel, bytes_per_line, + left, top, width, height); + + BLOCK_LIST block_list; + + FindLines(&block_list); + + // Now run the main recognition. + PAGE_RES* page_res = Recognize(&block_list, NULL); + + return TesseractToBoxText(page_res, left, imageheight - (top + height)); +} + +char* TessBaseAPI::TesseractRectUNLV(const unsigned char* imagedata, + int bytes_per_pixel, + int bytes_per_line, + int left, int top, + int width, int height) { + if (width < kMinRectSize || height < kMinRectSize) + return NULL; // Nothing worth doing. + + // Copy/Threshold the image to the tesseract global page_image. + CopyImageToTesseract(imagedata, bytes_per_pixel, bytes_per_line, + left, top, width, height); + + BLOCK_LIST block_list; + + FindLines(&block_list); + + // Now run the main recognition. + PAGE_RES* page_res = Recognize(&block_list, NULL); + + return TesseractToUNLV(page_res); +} + // Call between pages or documents etc to free up memory and forget // adaptive data. 
void TessBaseAPI::ClearAdaptiveClassifier() { @@ -326,7 +381,7 @@ void TessBaseAPI::CopyBinaryRect(const unsigned char* imagedata, image.capture(const_cast(imagedata), bytes_per_line*8, top + height, 1); page_image.create(width, height, 1); - copy_sub_image(&image, left, top, width, height, &page_image, 0, 0, false); + copy_sub_image(&image, left, 0, width, height, &page_image, 0, 0, false); } // Low-level function to recognize the current global image to a string. @@ -343,7 +398,6 @@ char* TessBaseAPI::RecognizeToString() { // Find lines from the image making the BLOCK_LIST. void TessBaseAPI::FindLines(BLOCK_LIST* block_list) { - STRING input_file = "noname.tif"; // The following call creates a full-page block and then runs connected // component analysis and text line creation. pgeditor_read_file(input_file, block_list); @@ -369,21 +423,32 @@ PAGE_RES* TessBaseAPI::Recognize(BLOCK_LIST* block_list, ETEXT_DESC* monitor) { return page_res; } +// Return the maximum length that the output text string might occupy. +int TessBaseAPI::TextLength(PAGE_RES* page_res) { + PAGE_RES_IT page_res_it(page_res); + int total_length = 2; + // Iterate over the data structures to extract the recognition result. + for (page_res_it.restart_page(); page_res_it.word () != NULL; + page_res_it.forward()) { + WERD_RES *word = page_res_it.word(); + WERD_CHOICE* choice = word->best_choice; + if (choice != NULL) { + total_length += choice->string().length() + 1; + for (int i = 0; i < word->reject_map.length(); ++i) { + if (word->reject_map[i].rejected()) + ++total_length; + } + } + } + return total_length; +} + // Make a text string from the internal data structures. // The input page_res is deleted. char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) { if (page_res != NULL) { - int total_length = 2; + int total_length = TextLength(page_res); PAGE_RES_IT page_res_it(page_res); - // Iterate over the data structures to extract the recognition result. 
- for (page_res_it.restart_page(); page_res_it.word () != NULL; - page_res_it.forward()) { - WERD_RES *word = page_res_it.word(); - WERD_CHOICE* choice = word->best_choice; - if (choice != NULL) { - total_length += choice->string().length() + 1; - } - } char* result = new char[total_length]; char* ptr = result; for (page_res_it.restart_page(); page_res_it.word () != NULL; @@ -406,3 +471,207 @@ char* TessBaseAPI::TesseractToText(PAGE_RES* page_res) { } return NULL; } + +static int ConvertWordToBoxText(WERD_RES *word, + ROW_RES* row, + int left, + int bottom, + char* word_str) { + // Copy the output word and denormalize it back to image coords. + WERD copy_outword; + copy_outword = *(word->outword); + copy_outword.baseline_denormalise(&word->denorm); + PBLOB_IT blob_it; + blob_it.set_to_list(copy_outword.blob_list()); + int length = copy_outword.blob_list()->length(); + int output_size = 0; + + if (length > 0) { + for (int index = 0, offset = 0; index < length; + offset += word->best_choice->lengths()[index++], blob_it.forward()) { + PBLOB* blob = blob_it.data(); + BOX blob_box = blob->bounding_box(); + if (word->tess_failed || + blob_box.left() < 0 || + blob_box.right() > page_image.get_xsize() || + blob_box.bottom() < 0 || + blob_box.top() > page_image.get_ysize()) { + // Bounding boxes can be illegal when tess fails on a word. + blob_box = word->word->bounding_box(); // Use original word as backup. + tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n", + blob_box.left(), blob_box.bottom(), + blob_box.right(), blob_box.top()); + } + + // A single classification unit can be composed of several UTF-8 + // characters. Append each of them to the result. + for (int sub = 0; sub < word->best_choice->lengths()[index]; ++sub) { + char ch = word->best_choice->string()[offset + sub]; + // Tesseract uses space for recognition failure. Fix to a reject + // character, '~' so we don't create illegal box files. 
+ if (ch == ' ') + ch = '~'; + word_str[output_size++] = ch; + } + sprintf(word_str + output_size, " %d %d %d %d\n", + blob_box.left() + left, blob_box.bottom() + bottom, + blob_box.right() + left, blob_box.top() + bottom); + output_size += strlen(word_str + output_size); + } + } + return output_size; +} + +// Multiplier for textlength assumes 4 numbers @ 5 digits and a space +// plus the newline and the orginial character = 4*(5+1)+2 +const int kMaxCharsPerChar = 26; + +// Make a text string from the internal data structures. +// The input page_res is deleted. +// The text string takes the form of a box file as needed for training. +char* TessBaseAPI::TesseractToBoxText(PAGE_RES* page_res, + int left, int bottom) { + if (page_res != NULL) { + int total_length = TextLength(page_res) * kMaxCharsPerChar; + PAGE_RES_IT page_res_it(page_res); + char* result = new char[total_length]; + char* ptr = result; + for (page_res_it.restart_page(); page_res_it.word () != NULL; + page_res_it.forward()) { + WERD_RES *word = page_res_it.word(); + ptr += ConvertWordToBoxText(word,page_res_it.row(),left, bottom, ptr); + } + *ptr = '\0'; + delete page_res; + return result; + } + return NULL; +} + +// Make a text string from the internal data structures. +// The input page_res is deleted. The text string is converted +// to UNLV-format: Latin-1 with specific reject and suspect codes. +const char kUnrecognized = '~'; +// Conversion table for non-latin characters. +// Maps characters out of the latin set into the latin set. +// TODO(rays) incorporate this translation into unicharset. +const int kUniChs[] = { + 0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0 +}; +// Latin chars corresponding to the unicode chars above. 
+const int kLatinChs[] = { + 0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0 +}; + +char* TessBaseAPI::TesseractToUNLV(PAGE_RES* page_res) { + bool tilde_crunch_written = false; + bool last_char_was_newline = true; + bool last_char_was_tilde = false; + + if (page_res != NULL) { + int total_length = TextLength(page_res); + PAGE_RES_IT page_res_it(page_res); + char* result = new char[total_length]; + char* ptr = result; + for (page_res_it.restart_page(); page_res_it.word () != NULL; + page_res_it.forward()) { + WERD_RES *word = page_res_it.word(); + // Process the current word. + if (word->unlv_crunch_mode != CR_NONE) { + if (word->unlv_crunch_mode != CR_DELETE && + (!tilde_crunch_written || + (word->unlv_crunch_mode == CR_KEEP_SPACE && + word->word->space () > 0 && + !word->word->flag (W_FUZZY_NON) && + !word->word->flag (W_FUZZY_SP)))) { + if (!word->word->flag (W_BOL) && + word->word->space () > 0 && + !word->word->flag (W_FUZZY_NON) && + !word->word->flag (W_FUZZY_SP)) { + /* Write a space to separate from preceeding good text */ + *ptr++ = ' '; + last_char_was_tilde = false; + } + if (!last_char_was_tilde) { + // Write a reject char. + last_char_was_tilde = true; + *ptr++ = kUnrecognized; + tilde_crunch_written = true; + last_char_was_newline = false; + } + } + } else { + // NORMAL PROCESSING of non tilde crunched words. 
+ tilde_crunch_written = false; + + if (last_char_was_tilde && + word->word->space () == 0 && + (word->best_choice->string ()[0] == ' ')) { + /* Prevent adjacent tilde across words - we know that adjacent tildes within + words have been removed */ + char* p = (char *) word->best_choice->string().string (); + strcpy (p, p + 1); //shuffle up + p = (char *) word->best_choice->lengths().string (); + strcpy (p, p + 1); //shuffle up + word->reject_map.remove_pos (0); + PBLOB_IT blob_it = word->outword->blob_list (); + delete blob_it.extract (); //get rid of reject blob + } + + if (word->word->flag(W_REP_CHAR) && tessedit_consistent_reps) + ensure_rep_chars_are_consistent(word); + + set_unlv_suspects(word); + const char* wordstr = word->best_choice->string().string(); + if (wordstr[0] != 0) { + if (!last_char_was_newline) + *ptr++ = ' '; + else + last_char_was_newline = false; + int offset = 0; + const STRING& lengths = word->best_choice->lengths(); + int length = lengths.length(); + for (int i = 0; i < length; offset += lengths[i++]) { + if (wordstr[offset] == ' ' || + wordstr[offset] == '~' || + wordstr[offset] == '|') { + *ptr++ = kUnrecognized; + last_char_was_tilde = true; + } else { + if (word->reject_map[i].rejected()) + *ptr++ = '^'; + UNICHAR ch(wordstr + offset, lengths[i]); + int uni_ch = ch.first_uni(); + for (int j = 0; kUniChs[j] != 0; ++j) { + if (kUniChs[j] == uni_ch) { + uni_ch = kLatinChs[j]; + break; + } + } + if (uni_ch <= 0xff) { + *ptr++ = static_cast(uni_ch); + last_char_was_tilde = false; + } else { + *ptr++ = kUnrecognized; + last_char_was_tilde = true; + } + } + } + } + } + if (word->word->flag(W_EOL) && !last_char_was_newline) { + /* Add a new line output */ + *ptr++ = '\n'; + tilde_crunch_written = false; + last_char_was_newline = true; + last_char_was_tilde = false; + } + } + *ptr++ = '\n'; + *ptr = '\0'; + delete page_res; + return result; + } + return NULL; +} + diff --git a/ccmain/baseapi.h b/ccmain/baseapi.h index cdb8b251b..d33f9dff0 
100644 --- a/ccmain/baseapi.h +++ b/ccmain/baseapi.h @@ -20,8 +20,6 @@ #ifndef THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__ #define THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__ -#include - class PAGE_RES; class BLOCK_LIST; @@ -56,6 +54,10 @@ class TessBaseAPI { const char* language, const char* configfile, bool numeric_mode, int argc, char* argv[]); + // Set the name of the input file. Needed only for training and + // reading a UNLV zone file. + static void SetInputName(const char* name); + // Recognize a rectangle from an image and return the result as a string. // May be called many times for a single Init. // Currently has no error checking. @@ -71,6 +73,19 @@ class TessBaseAPI { int bytes_per_pixel, int bytes_per_line, int left, int top, int width, int height); + // As TesseractRect but produces a box file as output. + // Image height is needed as well as rect height, since output y-coords + // will be relative to the bottom of the image. + static char* TesseractRectBoxes(const unsigned char* imagedata, + int bytes_per_pixel, + int bytes_per_line, + int left, int top, int width, int height, + int imageheight); + // As TesseractRect but produces UNLV-style output. + static char* TesseractRectUNLV(const unsigned char* imagedata, + int bytes_per_pixel, + int bytes_per_line, + int left, int top, int width, int height); // Call between pages or documents etc to free up memory and forget // adaptive data. @@ -153,8 +168,18 @@ class TessBaseAPI { static PAGE_RES* Recognize(BLOCK_LIST* block_list, struct ETEXT_STRUCT* monitor); + // Return the maximum length that the output text string might occupy. + static int TextLength(PAGE_RES* page_res); // Convert (and free) the internal data structures into a text string. static char* TesseractToText(PAGE_RES* page_res); + // Make a text string from the internal data structures. + // The input page_res is deleted. + // The text string takes the form of a box file as needed for training. 
+ static char* TesseractToBoxText(PAGE_RES* page_res, int left, int bottom); + // Make a text string from the internal data structures. + // The input page_res is deleted. The text string is converted + // to UNLV-format: Latin-1 with specific reject and suspect codes. + static char* TesseractToUNLV(PAGE_RES* page_res); }; #endif // THIRD_PARTY_TESSERACT_CCMAIN_BASEAPI_H__ diff --git a/ccmain/output.cpp b/ccmain/output.cpp index ed2f8323f..3a703a0de 100644 --- a/ccmain/output.cpp +++ b/ccmain/output.cpp @@ -35,6 +35,7 @@ #include "docqual.h" #include "output.h" #include "bestfirst.h" +#include "globals.h" #define EXTERN @@ -55,12 +56,12 @@ EXTERN BOOL_EVAR (tessedit_write_block_separators, FALSE, "Write block separators in output"); EXTERN BOOL_VAR (tessedit_write_raw_output, FALSE, "Write raw stuff to name.raw"); -EXTERN BOOL_EVAR (tessedit_write_output, TRUE, "Write text to name.txt"); +EXTERN BOOL_EVAR (tessedit_write_output, FALSE, "Write text to name.txt"); EXTERN BOOL_EVAR (tessedit_write_ratings, FALSE, "Return ratings in IPEOCRAPI data"); -EXTERN BOOL_EVAR (tessedit_write_txt_map, TRUE, +EXTERN BOOL_EVAR (tessedit_write_txt_map, FALSE, "Write .txt to .etx map file"); -EXTERN BOOL_EVAR (tessedit_write_rep_codes, TRUE, +EXTERN BOOL_EVAR (tessedit_write_rep_codes, FALSE, "Write repetition char code"); EXTERN BOOL_EVAR (tessedit_write_unlv, FALSE, "Write .unlv output file"); EXTERN STRING_EVAR (unrecognised_char, "|", @@ -106,7 +107,6 @@ INT32 pixels_to_pts( //convert coords return (INT32) (pts + 0.5); //round it } - void output_pass( //Tess output pass //send to api PAGE_RES_IT &page_res_it, BOOL8 write_to_shm, @@ -119,8 +119,7 @@ void output_pass( //Tess output pass //send to api if (tessedit_write_txt_map) txt_mapfile = open_outfile (".map"); - if (tessedit_write_unlv) - unlv_file = open_outfile (".unlv"); + page_res_it.restart_page (); block_of_last_word = NULL; while (page_res_it.word () != NULL) { @@ -189,7 +188,6 @@ void output_pass( //Tess output pass 
//send to api } } - /************************************************************************* * write_results() * @@ -211,9 +209,10 @@ void write_results( //output a word ) { //word to do WERD_RES *word = page_res_it.word (); - WERD_CHOICE *ep_choice; //ep format +// WERD_CHOICE *ep_choice; //ep format STRING repetition_code; const STRING *wordstr; + STRING wordstr_lengths; const char *text; int i; char unrecognised = STRING (unrecognised_char)[0]; @@ -312,15 +311,12 @@ void write_results( //output a word if (tessedit_write_output && !NO_BLOCK) fprintf (textfile, "%s", txt_chs); - if (tessedit_write_unlv) - fprintf (unlv_file, "%s", txt_chs); - if (tessedit_write_txt_map) fprintf (txt_mapfile, "%s", map_chs); //terminate string ep_chars[ep_chars_index] = '\0'; - word->ep_choice = new WERD_CHOICE (ep_chars, 0, 0, NO_PERM); + word->ep_choice = new WERD_CHOICE (ep_chars, NULL, 0, 0, NO_PERM); if (force_eol) empty_block = TRUE; @@ -345,6 +341,8 @@ void write_results( //output a word words have been removed */ ptr = (char *) word->best_choice->string ().string (); strcpy (ptr, ptr + 1); //shuffle up + ptr = (char *) word->best_choice->lengths ().string (); + strcpy (ptr, ptr + 1); //shuffle up word->reject_map.remove_pos (0); blob_it = word->outword->blob_list (); delete blob_it.extract (); //get rid of reject blob @@ -354,8 +352,10 @@ void write_results( //output a word last_char_was_tilde = FALSE; else { if (word->reject_map.length () > 0) { - if (word->best_choice->string ()[word->reject_map.length () - 1] == - ' ') + for (i = 0, ptr = (char *) word->best_choice->string().string(); + i < word->reject_map.length () - 1; ++i) + ptr += word->best_choice->lengths()[i]; + if (*ptr == ' ') last_char_was_tilde = TRUE; else last_char_was_tilde = FALSE; @@ -365,7 +365,7 @@ void write_results( //output a word /* else it is unchanged as there are no output chars */ } - ptr = (char *) word->best_choice->string ().string (); + ptr = (char *) word->best_choice->lengths ().string 
(); ASSERT_HOST (strlen (ptr) == word->reject_map.length ()); if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps) @@ -379,21 +379,26 @@ void write_results( //output a word dict_word (word->best_choice->string ().string ())); } +#if 0 if (tessedit_write_unlv) { write_unlv_text(word); } +#endif if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) { repetition_code = "|^~R"; - repetition_code += get_rep_char (word); + wordstr_lengths = "\001\001\001\001"; + repetition_code += unicharset.id_to_unichar(get_rep_char (word)); + wordstr_lengths += strlen(unicharset.id_to_unichar(get_rep_char (word))); wordstr = &repetition_code; } else { wordstr = &(word->best_choice->string ()); + wordstr_lengths = word->best_choice->lengths (); if (tessedit_zero_rejection) { /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ text = wordstr->string (); - for (i = 0; text[i] != '\0'; i++) { + for (i = 0; *text != '\0'; text += word->best_choice->lengths()[i++]) { if (word->reject_map[i].rejected ()) word->reject_map[i].setrej_minimal_rej_accept (); } @@ -401,8 +406,8 @@ void write_results( //output a word if (tessedit_minimal_rejection) { /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ text = wordstr->string (); - for (i = 0; text[i] != '\0'; i++) { - if ((text[i] != ' ') && word->reject_map[i].rejected ()) + for (i = 0; *text != '\0'; text += word->best_choice->lengths()[i++]) { + if ((*text != ' ') && word->reject_map[i].rejected ()) word->reject_map[i].setrej_minimal_rej_accept (); } } @@ -410,8 +415,9 @@ void write_results( //output a word if (write_to_shm) write_shm_text (word, page_res_it.block ()->block, - page_res_it.row (), *wordstr); + page_res_it.row (), *wordstr, wordstr_lengths); +#if 0 if (tessedit_write_output) write_cooked_text (word->word, *wordstr, TRUE, FALSE, textfile); @@ -424,12 +430,12 @@ void write_results( //output a word ep_choice = make_epaper_choice (word, newline_type); word->ep_choice = ep_choice; 
+#endif - character_count += word->best_choice->string ().length (); + character_count += word->best_choice->lengths ().length (); word_count++; } - /********************************************************************** * make_epaper_choice * @@ -437,6 +443,7 @@ void write_results( //output a word * determine whether each blob should be rejected. **********************************************************************/ +#if 0 WERD_CHOICE *make_epaper_choice( //convert one word WERD_RES *word, //word to do char newline_type //type of newline @@ -482,7 +489,8 @@ WERD_CHOICE *make_epaper_choice( //convert one word if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) { strcpy (word_string + index, "|^~R"); index += 4; - word_string[index++] = get_rep_char (word); + strcpy(word_string + index, unicharset.id_to_unichar(get_rep_char (word))); + index += strlen(unicharset.id_to_unichar(get_rep_char (word))); } else { if (!blob_it.empty ()) @@ -537,7 +545,7 @@ WERD_CHOICE *make_epaper_choice( //convert one word ASSERT_HOST (strlen (word_string) == index); return new WERD_CHOICE (word_string, 0, 0, NO_PERM); } - +#endif /********************************************************************** * make_reject @@ -653,6 +661,7 @@ char determine_newline_type( //test line ends * to the given file. 
**********************************************************************/ +#if 0 void write_cooked_text( //write output WERD *word, //word to do const STRING &text, //text to write @@ -749,6 +758,7 @@ void write_cooked_text( //write output if (status != 0) WRITEFAILED.error ("write_cooked_text", EXIT, "Fflush Errno: %d", errno); } +#endif /********************************************************************** @@ -761,7 +771,8 @@ void write_shm_text( //write output WERD_RES *word, //word to do BLOCK *block, //block it is from ROW_RES *row, //row it is from - const STRING &text //text to write + const STRING &text, //text to write + const STRING &text_lengths ) { INT32 index; //char counter INT32 index2; //char counter @@ -777,6 +788,8 @@ void write_shm_text( //write output WERD copy_outword; // copy to denorm UINT32 rating; //of char BOOL8 lineend; //end of line + int offset; + int offset2; //point size ptsize = pixels_to_pts ((INT32) (row->row->x_height () + row->row->ascenders () - row->row->descenders ()), 300); @@ -786,13 +799,14 @@ void write_shm_text( //write output copy_outword = *(word->outword); copy_outword.baseline_denormalise (&word->denorm); blob_it.set_to_list (copy_outword.blob_list ()); - length = text.length (); + length = text_lengths.length (); if (length > 0) { blanks = word->word->space (); if (blanks == 0 && tessedit_word_for_word && !word->word->flag (W_BOL)) blanks = 1; - for (index = 0; index < length; index++, blob_it.forward ()) { + for (index = 0, offset = 0; index < length; + offset += text_lengths[index++], blob_it.forward ()) { blob = blob_it.data (); blob_box = blob->bounding_box (); @@ -804,7 +818,7 @@ void write_shm_text( //write output if (tessedit_write_ratings) rating = (UINT32) (-word->best_choice->certainty () / 0.035); else if (tessedit_zero_rejection) - rating = text[index] == ' ' ? 100 : 0; + rating = text[offset] == ' ' ? 100 : 0; else rating = word->reject_map[index].accepted ()? 
0 : 100; if (rating > 255) @@ -819,22 +833,41 @@ void write_shm_text( //write output lineend = word->word->flag (W_EOL) && index == length - 1; if (word->word->flag (W_EOL) && tessedit_zero_rejection - && index < length - 1 && text[index + 1] == ' ') { - for (index2 = index + 1; index2 < length && text[index2] == ' '; - index2++); + && index < length - 1 && text[index + text_lengths[index]] == ' ') { + for (index2 = index + 1, offset2 = offset + text_lengths[index]; + index2 < length && text[offset2] == ' '; + offset2 += text_lengths[index2++]); if (index2 == length) lineend = TRUE; } - if (!tessedit_zero_rejection || text[index] != ' ' + if (!tessedit_zero_rejection || text[offset] != ' ' || tessedit_word_for_word) { //confidence - ocr_append_char (text[index] == ' ' ? unrecognised : text[index], blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font, (UINT8) rating, - ptsize, //point size - blanks, enhancement, //enhancement - OCR_CDIR_LEFT_RIGHT, - OCR_LDIR_DOWN_RIGHT, - lineend ? OCR_NL_NEWLINE : OCR_NL_NONE); + if (text[offset] == ' ') { + ocr_append_char (unrecognised, + blob_box.left (), blob_box.right (), + page_image.get_ysize () - 1 - blob_box.top (), + page_image.get_ysize () - 1 - blob_box.bottom (), + font, (UINT8) rating, + ptsize, //point size + blanks, enhancement, //enhancement + OCR_CDIR_LEFT_RIGHT, + OCR_LDIR_DOWN_RIGHT, + lineend ? OCR_NL_NEWLINE : OCR_NL_NONE); + } else { + for (int suboffset = 0; suboffset < text_lengths[index]; ++suboffset) + ocr_append_char (text[offset + suboffset], + blob_box.left (), blob_box.right (), + page_image.get_ysize () - 1 - blob_box.top (), + page_image.get_ysize () - 1 - blob_box.bottom (), + font, (UINT8) rating, + ptsize, //point size + blanks, enhancement, //enhancement + OCR_CDIR_LEFT_RIGHT, + OCR_LDIR_DOWN_RIGHT, + lineend ? 
OCR_NL_NEWLINE : OCR_NL_NONE); + } blanks = 0; } @@ -863,13 +896,17 @@ void write_shm_text( //write output lineend = word->word->flag (W_EOL); //font index - ocr_append_char (unrecognised, blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font, - rating, //confidence - ptsize, //point size - blanks, enhancement, //enhancement - OCR_CDIR_LEFT_RIGHT, - OCR_LDIR_DOWN_RIGHT, - lineend ? OCR_NL_NEWLINE : OCR_NL_NONE); + ocr_append_char (unrecognised, + blob_box.left (), blob_box.right (), + page_image.get_ysize () - 1 - blob_box.top (), + page_image.get_ysize () - 1 - blob_box.bottom (), + font, + rating, //confidence + ptsize, //point size + blanks, enhancement, //enhancement + OCR_CDIR_LEFT_RIGHT, + OCR_LDIR_DOWN_RIGHT, + lineend ? OCR_NL_NEWLINE : OCR_NL_NONE); } } @@ -888,6 +925,7 @@ void write_shm_text( //write output * newdiff needs etx files! **********************************************************************/ +#if 0 void write_map( //output a map file FILE *mapfile, //mapfile to write to WERD_RES *word) { @@ -937,6 +975,7 @@ void write_map( //output a map file if (status != 0) WRITEFAILED.error ("write_map", EXIT, "fflush Errno: %d", errno); } +#endif /************************************************************************* @@ -957,6 +996,7 @@ FILE *open_outfile( //open .map & .unlv file } +#if 0 void write_unlv_text(WERD_RES *word) { const char *wordstr; @@ -1015,6 +1055,7 @@ void write_unlv_text(WERD_RES *word) { if (status != 0) WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno); } +#endif /************************************************************************* @@ -1022,21 +1063,24 @@ void write_unlv_text(WERD_RES *word) { * Return the first accepted character from the repetition string. 
This is the * character which is repeated - as determined earlier by fix_rep_char() *************************************************************************/ -char get_rep_char( // what char is repeated? - WERD_RES *word) { +UNICHAR_ID get_rep_char(WERD_RES *word) { // what char is repeated? int i; + int offset; - for (i = 0; + for (i = 0, offset = 0; ((i < word->reject_map.length ()) && - (word->reject_map[i].rejected ())); i++); + (word->reject_map[i].rejected ())); + offset += word->best_choice->lengths()[i++]); if (i < word->reject_map.length ()) - return word->best_choice->string ()[i]; + return unicharset.unichar_to_id(word->best_choice->string().string() + + offset, + word->best_choice->lengths()[i]); else - return STRING (unrecognised_char)[0]; + return unicharset.unichar_to_id(unrecognised_char.string()); } - void ensure_rep_chars_are_consistent(WERD_RES *word) { +#if 0 char rep_char = get_rep_char (word); char *ptr; @@ -1045,8 +1089,24 @@ void ensure_rep_chars_are_consistent(WERD_RES *word) { if (*ptr != rep_char) *ptr = rep_char; } -} +#endif +#if 0 + UNICHAR_ID rep_char = get_rep_char (word); //TODO(tkielbus) Reactivate + int i; + char *ptr; + STRING consistent_string; + STRING consistent_string_lengths; + + ptr = (char *) word->best_choice->string ().string (); + for (i = 0; *ptr != '\0'; ptr += word->best_choice->lengths()[i++]) { + consistent_string += unicharset.id_to_unichar(rep_char); + consistent_string_lengths += strlen(unicharset.id_to_unichar(rep_char)); + } + word->best_choice->string() = consistent_string; + word->best_choice->lengths() = consistent_string_lengths; +#endif +} /************************************************************************* * SUSPECT LEVELS @@ -1062,7 +1122,9 @@ void ensure_rep_chars_are_consistent(WERD_RES *word) { void set_unlv_suspects(WERD_RES *word) { int len = word->reject_map.length (); int i; + int offset; const char *ptr; + const char *lengths = word->best_choice->lengths ().string (); float rating_per_ch; 
ptr = word->best_choice->string ().string (); @@ -1080,10 +1142,12 @@ void set_unlv_suspects(WERD_RES *word) { /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/ - if (safe_dict_word (ptr) && (count_alphas (ptr) > suspect_short_words)) { + if (safe_dict_word (ptr) && (count_alphas (ptr, lengths) > + suspect_short_words)) { /* Unreject alphas in dictionary words */ - for (i = 0; i < len; i++) { - if (word->reject_map[i].rejected () && isalpha (ptr[i])) + for (i = 0, offset = 0; i < len; offset += lengths[i++]) { + if (word->reject_map[i].rejected () && + unicharset.get_isalpha (ptr + offset, lengths[i])) word->reject_map[i].setrej_minimal_rej_accept (); } } @@ -1095,8 +1159,8 @@ void set_unlv_suspects(WERD_RES *word) { if ((word->tess_accepted) || (rating_per_ch < suspect_accept_rating)) { /* Unreject any Tess Acceptable word - but NOT tess reject chs*/ - for (i = 0; i < len; i++) { - if (word->reject_map[i].rejected () && (ptr[i] != ' ')) + for (i = 0, offset = 0; i < len; offset += lengths[i++]) { + if (word->reject_map[i].rejected () && (ptr[offset] != ' ')) word->reject_map[i].setrej_minimal_rej_accept (); } } @@ -1130,9 +1194,11 @@ void set_unlv_suspects(WERD_RES *word) { } } - if ((acceptable_word_string (word->best_choice->string ().string ()) + if ((acceptable_word_string (word->best_choice->string ().string (), + word->best_choice->lengths ().string ()) != AC_UNACCEPTABLE) || - acceptable_number_string (word->best_choice->string ().string ())) { + acceptable_number_string (word->best_choice->string ().string (), + word->best_choice->lengths ().string ())) { if (word->reject_map.length () > suspect_short_words) { for (i = 0; i < len; i++) { if (word->reject_map[i].rejected () && @@ -1149,11 +1215,12 @@ void set_unlv_suspects(WERD_RES *word) { INT16 count_alphas( //how many alphas - const char *s) { + const char *s, + const char *lengths) { int count = 0; - for (; *s != '\0'; s++) { - if (isalpha (*s)) + for (; *s != '\0'; s += *(lengths++)) { + if 
(unicharset.get_isalpha(s, *lengths)) count++; } return count; @@ -1161,36 +1228,43 @@ INT16 count_alphas( //how many alphas INT16 count_alphanums( //how many alphanums - const char *s) { + const char *s, + const char *lengths) { int count = 0; - for (; *s != '\0'; s++) { - if (isalnum (*s)) + for (; *s != '\0'; s += *(lengths++)) { + if (unicharset.get_isalpha(s, *lengths) || + unicharset.get_isdigit(s, *lengths)) count++; } return count; } -BOOL8 acceptable_number_string(const char *s) { +BOOL8 acceptable_number_string(const char *s, + const char *lengths) { BOOL8 prev_digit = FALSE; - if (*s == '(') + if (*lengths == 1 && *s == '(') s++; - if ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')) + if (*lengths == 1 && + ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) s++; - for (; *s != '\0'; s++) { - if (isdigit (*s)) + for (; *s != '\0'; s += *(lengths++)) { + if (unicharset.get_isdigit (s, *lengths)) prev_digit = TRUE; - else if (prev_digit && ((*s == '.') || (*s == ',') || (*s == '-'))) - prev_digit = FALSE; else if (prev_digit && - (*(s + 1) == '\0') && ((*s == '%') || (*s == ')'))) + (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-')))) + prev_digit = FALSE; + else if (prev_digit && *lengths == 1 && + (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')'))) return TRUE; else if (prev_digit && - (*s == '%') && (*(s + 1) == ')') && (*(s + 2) == '\0')) + *lengths == 1 && (*s == '%') && + (*(lengths + 1) == 1 && *(s + *lengths) == ')') && + (*(s + *lengths + *(lengths + 1)) == '\0')) return TRUE; else return FALSE; diff --git a/ccmain/tesseractmain.cpp b/ccmain/tesseractmain.cpp index 865f2df7e..8c051ff67 100644 --- a/ccmain/tesseractmain.cpp +++ b/ccmain/tesseractmain.cpp @@ -31,7 +31,9 @@ #include "stderr.h" #include "notdll.h" #include "mainblk.h" +#include "output.h" #include "globals.h" +#include "blread.h" #include "tfacep.h" #include "callnet.h" @@ -40,7 +42,10 @@ #define API_CONFIG "configs/api_config" #define EXTERN 
+EXTERN BOOL_VAR (tessedit_create_boxfile, FALSE, "Output text with boxes"); EXTERN BOOL_VAR (tessedit_read_image, TRUE, "Ensure the image is read"); +EXTERN INT_VAR (tessedit_serial_unlv, 0, + "0->Whole page, 1->serial no adapt, 2->serial with adapt"); EXTERN BOOL_VAR (tessedit_write_images, FALSE, "Capture the image from the IPE"); EXTERN BOOL_VAR (tessedit_debug_to_screen, FALSE, "Dont use debug file"); @@ -63,15 +68,30 @@ int main(int argc, char **argv) { if (argc < 3) { USAGE.error (argv[0], EXIT, - "%s imagename outputbase [configfile [[+|-]varfile]...]\n", argv[0]); + "%s imagename outputbase [-l lang] [configfile [[+|-]varfile]...]\n", + argv[0]); + } + // Find the required language. + const char* lang = "eng"; + int arg = 3; + if (argc >= 5 && strcmp(argv[3], "-l") == 0) { + lang = argv[4]; + arg = 5; + } + // Find the basename of the input file. + STRING infile(argv[1]); + const char* lastdot = strrchr(argv[1], '.'); + if (lastdot != NULL) { + infile[lastdot - argv[1]] = '\0'; } - if (argc == 3) - TessBaseAPI::InitWithLanguage(argv[0], argv[1], NULL, - NULL, false, 0, argv + 2); + if (argc == arg) + TessBaseAPI::InitWithLanguage(argv[0], infile.string(), lang, + NULL, false, 0, argv + arg); else - TessBaseAPI::InitWithLanguage(argv[0], argv[1], NULL, - argv[3], false, argc - 4, argv + 4); + TessBaseAPI::InitWithLanguage(argv[0], infile.string(), lang, + argv[arg], false, + argc - arg - 1, argv + arg + 1); tprintf ("Tesseract Open Source OCR Engine\n"); @@ -92,20 +112,70 @@ int main(int argc, char **argv) { argv[1]); } #endif + STRING text_out; int bytes_per_line = check_legal_image_size(image.get_xsize(), image.get_ysize(), image.get_bpp()); - char* text = TessBaseAPI::TesseractRect(image.get_buffer(), image.get_bpp()/8, - bytes_per_line, 0, 0, - image.get_xsize(), image.get_ysize()); + if (tessedit_serial_unlv == 0) { + TessBaseAPI::SetInputName(argv[1]); + char* text; + if (tessedit_create_boxfile) + text = 
TessBaseAPI::TesseractRectBoxes(image.get_buffer(), + image.get_bpp()/8, + bytes_per_line, 0, 0, + image.get_xsize(), + image.get_ysize(), + image.get_ysize()); + else if (tessedit_write_unlv) + text = TessBaseAPI::TesseractRectUNLV(image.get_buffer(), + image.get_bpp()/8, + bytes_per_line, 0, 0, + image.get_xsize(), + image.get_ysize()); + else + text = TessBaseAPI::TesseractRect(image.get_buffer(), image.get_bpp()/8, + bytes_per_line, 0, 0, + image.get_xsize(), image.get_ysize()); + text_out = text; + delete [] text; + } else { + BLOCK_LIST blocks; + STRING filename = argv[1]; + int len = filename.length(); + if (len > 4 && filename[len - 4] == '.') { + filename[len - 4] = '\0'; + } + if (!read_unlv_file(filename, image.get_xsize(), image.get_ysize(), + &blocks)) { + fprintf(stderr, "Error: Must have a unlv zone file %s to read!\n", + filename.string()); + return 1; + } + BLOCK_IT b_it = &blocks; + for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { + BLOCK* block = b_it.data(); + BOX box = block->bounding_box(); + char* text = TessBaseAPI::TesseractRectUNLV(image.get_buffer(), + image.get_bpp()/8, + bytes_per_line, + box.left(), + image.get_ysize() - box.top(), + box.width(), + box.height()); + text_out += text; + delete [] text; + if (tessedit_serial_unlv == 1) + TessBaseAPI::ClearAdaptiveClassifier(); + } + } + outfile = argv[2]; outfile += ".txt"; FILE* fp = fopen(outfile.string(), "w"); if (fp != NULL) { - fwrite(text, 1, strlen(text), fp); + fwrite(text_out.string(), 1, text_out.length(), fp); fclose(fp); } - delete [] text; TessBaseAPI::End(); return 0; //Normal exit diff --git a/ccstruct/blread.cpp b/ccstruct/blread.cpp index 93a3412a1..915490cf4 100644 --- a/ccstruct/blread.cpp +++ b/ccstruct/blread.cpp @@ -527,7 +527,9 @@ BOOL8 read_unlv_file( //print list of sides else { while (fscanf (pdfp, "%d %d %d %d %*s", &x, &y, &width, &height) >= 4) { //make rect block - block = new BLOCK (name.string (), TRUE, 0, 0, (INT16) x, (INT16) (ysize - 1 
- y - height), (INT16) (x + width), (INT16) (ysize - 1 - y)); + block = new BLOCK (name.string (), TRUE, 0, 0, + (INT16) x, (INT16) (ysize - y - height), + (INT16) (x + width), (INT16) (ysize - y)); //on end of list block_it.add_to_end (block); } diff --git a/cutil/tordvars.cpp b/cutil/tordvars.cpp index 7ef1bbc95..204f765f8 100644 --- a/cutil/tordvars.cpp +++ b/cutil/tordvars.cpp @@ -63,7 +63,7 @@ make_toggle_var (debug_8, 0, make_debug_8, 6, 8, toggle_debug_8, "Debug #8"); make_toggle_var (display_ratings, 0, make_display_ratings, 6, 9, toggle_ratings, "Ratings display"); -make_toggle_var (display_text, 1, make_display_text, +make_toggle_var (display_text, 0, make_display_text, 6, 10, toggle_text, "Display Text"); make_toggle_var (show_bold, 1, make_show_bold, diff --git a/tessdata/configs/makebox b/tessdata/configs/makebox new file mode 100644 index 000000000..3d90ac26f --- /dev/null +++ b/tessdata/configs/makebox @@ -0,0 +1 @@ +tessedit_create_boxfile 1 diff --git a/tessdata/configs/unlv b/tessdata/configs/unlv new file mode 100644 index 000000000..537ad77a1 --- /dev/null +++ b/tessdata/configs/unlv @@ -0,0 +1,3 @@ +tessedit_write_unlv 1 +tessedit_write_output 0 +tessedit_write_txt_map 0 diff --git a/tessdata/tessconfigs/batch b/tessdata/tessconfigs/batch index f0c729c18..619b64675 100644 --- a/tessdata/tessconfigs/batch +++ b/tessdata/tessconfigs/batch @@ -1,78 +1,2 @@ -################################################# -# Adaptive Matcher Using PreAdapted Templates -################################################# - -acts_fx 0x800 -acts_ocr 0x20 - -RatingScale 30.0 -CertaintyScale 20.0 - -#EnableMatcher 0 -#CurrentFx 2 -MinSlope 0.414213562 -MaxSlope 2.414213562 -#ExtremityMode 1 -NormMethod 1 -EnableAdaptiveMatcher 1 - -NormAdjMidpoint 32.0 -NormAdjCurl 2.0 - -MinNormScaleX 0.0 -MaxNormScaleX 0.325 -MinNormScaleY 0.0 -MaxNormScaleY 0.325 - -BuiltInTemplatesFile tessdata/inttemp -BuiltInCutoffsFile tessdata/pffmtable - -EnableLearning 0 -SaveAdaptedTemplates 
0 -UsePreAdaptedTemplates 0 -ReliableConfigThreshold 2 -MinNumPermClasses 3 - -#EnableStopper 1 -GoodAdaptiveMatch 0.125 -GreatAdaptiveMatch 0.0 - -EnableIntFX 1 -EnableNewAdaptRules 1 -################################################################################ -# -# File: marks/configs/knobs -# Description: Control variables for 'marks' code -# Author: Mark Seaman, OCR Technology -# Created: Wed Feb 27 11:27:27 1991 -# Modified: Tue Jul 30 16:25:37 1991 (Mark Seaman) marks@hpgrlt -# Language: Text -# Package: N/A -# Status: Experimental (Do Not Distribute) -# -# (c) Copyright 1991, Hewlett-Packard Company, all rights reserved. -# -################################################################################ - -#hidden_edges 1 - -save_doc_words 1 -doc_dict_enable 1 -ClassPrunerThreshold 229 -ClassPrunerMultiplier 15 -IntThetaFudge 128 -CPCutoffStrength 0.15 -EvidenceTableBits 9 -IntEvidenceTruncBits 14 -SEExponentialMultiplier 0 -SimilarityCenter 0.0075 -################################################# -# Adaptive Matcher Using 2 Passes -################################################# - -EnableLearning 1 -SaveAdaptedTemplates 0 -UsePreAdaptedTemplates 0 - -#save_errors 0 +# No content needed as all defaults are correct. 
diff --git a/tessdata/tessconfigs/batch.nochop b/tessdata/tessconfigs/batch.nochop new file mode 100644 index 000000000..93ae70046 --- /dev/null +++ b/tessdata/tessconfigs/batch.nochop @@ -0,0 +1,2 @@ +chop_enable 0 +enable_assoc 0 diff --git a/tessdata/tessconfigs/matdemo b/tessdata/tessconfigs/matdemo index c1440a9e4..f3ad41d34 100755 --- a/tessdata/tessconfigs/matdemo +++ b/tessdata/tessconfigs/matdemo @@ -2,80 +2,6 @@ # Adaptive Matcher Using PreAdapted Templates ################################################# -acts_fx 0x800 -acts_ocr 0x20 - -RatingScale 30.0 -CertaintyScale 20.0 - -#EnableMatcher 0 -#CurrentFx 2 -EnableAdaptiveMatcher 1 - -NormAdjMidpoint 32.0 -NormAdjCurl 2.0 - -MinNormScaleX 0.0 -MaxNormScaleX 0.325 -MinNormScaleY 0.0 -MaxNormScaleY 0.325 - -BuiltInTemplatesFile tessdata/inttemp -BuiltInCutoffsFile tessdata/pffmtable - -EnableLearning 0 -SaveAdaptedTemplates 0 -UsePreAdaptedTemplates 0 -ReliableConfigThreshold 2 -MinNumPermClasses 3 - -#EnableStopper 1 -GoodAdaptiveMatch 0.125 -GreatAdaptiveMatch 0.0 - -EnableIntFX 1 -EnableNewAdaptRules 1 EnableAdaptiveDebugger 1 MatchDebugFlags 6 MatcherDebugLevel 1 -################################################################################ -# -# File: marks/configs/knobs -# Description: Control variables for 'marks' code -# Author: Mark Seaman, OCR Technology -# Created: Wed Feb 27 11:27:27 1991 -# Modified: Tue Jul 30 16:25:37 1991 (Mark Seaman) marks@hpgrlt -# Language: Text -# Package: N/A -# Status: Experimental (Do Not Distribute) -# -# (c) Copyright 1991, Hewlett-Packard Company, all rights reserved. 
-# -################################################################################ - -#hidden_edges 1 - -save_doc_words 1 -doc_dict_enable 1 -ClassPrunerThreshold 229 -ClassPrunerMultiplier 15 -IntThetaFudge 128 -CPCutoffStrength 0.15 -EvidenceTableBits 9 -IntEvidenceTruncBits 14 -SEExponentialMultiplier 0 -SimilarityCenter 0.0075 -################################################# -# Adaptive Matcher Using 2 Passes -################################################# - -display_splits 0 -display_all_words 0 -display_all_blobs 0 -display_segmentations 0 -EnableLearning 1 -SaveAdaptedTemplates 0 -UsePreAdaptedTemplates 0 - -#save_errors 0 - diff --git a/tessdata/tessconfigs/msdemo b/tessdata/tessconfigs/msdemo new file mode 100644 index 000000000..9f312feac --- /dev/null +++ b/tessdata/tessconfigs/msdemo @@ -0,0 +1,13 @@ +################################################# +# Adaptive Matcher Using PreAdapted Templates +################################################# + +EnableAdaptiveDebugger 1 +MatchDebugFlags 6 +MatcherDebugLevel 1 + +display_splits 0 +display_all_words 1 +display_all_blobs 1 +display_segmentations 2 +display_ratings 1 diff --git a/tessdata/tessconfigs/nobatch b/tessdata/tessconfigs/nobatch new file mode 100644 index 000000000..b042c2701 --- /dev/null +++ b/tessdata/tessconfigs/nobatch @@ -0,0 +1,2 @@ +display_text 0 + diff --git a/tessdata/tessconfigs/segdemo b/tessdata/tessconfigs/segdemo index 244386ebd..d1487bb3e 100755 --- a/tessdata/tessconfigs/segdemo +++ b/tessdata/tessconfigs/segdemo @@ -2,70 +2,6 @@ # Adaptive Matcher Using PreAdapted Templates ################################################# -acts_fx 0x800 -acts_ocr 0x20 - -RatingScale 30.0 -CertaintyScale 20.0 - -#EnableMatcher 0 -#CurrentFx 2 -EnableAdaptiveMatcher 1 - -NormAdjMidpoint 32.0 -NormAdjCurl 2.0 - -MinNormScaleX 0.0 -MaxNormScaleX 0.325 -MinNormScaleY 0.0 -MaxNormScaleY 0.325 - -BuiltInTemplatesFile tessdata/inttemp -BuiltInCutoffsFile tessdata/pffmtable - -EnableLearning 
0 -SaveAdaptedTemplates 0 -UsePreAdaptedTemplates 0 -ReliableConfigThreshold 2 -MinNumPermClasses 3 - -#EnableStopper 1 -GoodAdaptiveMatch 0.125 -GreatAdaptiveMatch 0.0 - -EnableIntFX 1 -EnableNewAdaptRules 1 -################################################################################ -# -# File: marks/configs/knobs -# Description: Control variables for 'marks' code -# Author: Mark Seaman, OCR Technology -# Created: Wed Feb 27 11:27:27 1991 -# Modified: Tue Jul 30 16:25:37 1991 (Mark Seaman) marks@hpgrlt -# Language: Text -# Package: N/A -# Status: Experimental (Do Not Distribute) -# -# (c) Copyright 1991, Hewlett-Packard Company, all rights reserved. -# -################################################################################ - -#hidden_edges 1 - -save_doc_words 1 -doc_dict_enable 1 -ClassPrunerThreshold 229 -ClassPrunerMultiplier 15 -IntThetaFudge 128 -CPCutoffStrength 0.15 -EvidenceTableBits 9 -IntEvidenceTruncBits 14 -SEExponentialMultiplier 0 -SimilarityCenter 0.0075 -################################################# -# Adaptive Matcher Using 2 Passes -################################################# - display_splits 0 display_all_words 1 display_all_blobs 1 diff --git a/testing/Makefile b/testing/Makefile new file mode 100644 index 000000000..b1e1132fc --- /dev/null +++ b/testing/Makefile @@ -0,0 +1,185 @@ +# Makefile.in generated automatically by automake 1.4-p6 from Makefile.am + +# Copyright (C) 1994, 1995-8, 1999, 2001 Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + + +SHELL = /bin/sh + +srcdir = . +top_srcdir = .. 
+ +prefix = /usr/local +exec_prefix = ${prefix} + +bindir = ${exec_prefix}/bin +sbindir = ${exec_prefix}/sbin +libexecdir = ${exec_prefix}/libexec +datadir = ${prefix}/share +sysconfdir = ${prefix}/etc +sharedstatedir = ${prefix}/com +localstatedir = ${prefix}/var +libdir = ${exec_prefix}/lib +infodir = ${prefix}/info +mandir = ${prefix}/man +includedir = ${prefix}/include/tesseract +oldincludedir = /usr/include + +DESTDIR = + +pkgdatadir = $(datadir)/ +pkglibdir = $(libdir)/ +pkgincludedir = $(includedir)/ + +top_builddir = .. + +ACLOCAL = aclocal-1.4 +AUTOCONF = autoconf +AUTOMAKE = automake-1.4 +AUTOHEADER = autoheader + +INSTALL = /usr/bin/install -c +INSTALL_PROGRAM = ${INSTALL} $(AM_INSTALL_PROGRAM_FLAGS) +INSTALL_DATA = ${INSTALL} -m 644 +INSTALL_SCRIPT = ${INSTALL} +transform = s,x,x, + +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +host_alias = +host_triplet = x86_64-unknown-linux-gnu +CC = gcc +CXX = g++ +HAVE_LIB = @HAVE_LIB@ +LIB = @LIB@ +LTLIB = @LTLIB@ +MAINT = # +MAKEINFO = /home/rays/src/opensrc/tesseract-ocr/config/missing makeinfo +PACKAGE = +PACKAGE_DATE = 07/2007 +PACKAGE_NAME = tesseract +PACKAGE_VERSION = 2.00 +PACKAGE_YEAR = 2007 +RANLIB = ranlib +VERSION = + +EXTRA_DIST = README counttestset.sh reorgdata.sh runalltests.sh runtestset.sh reports/1995.bus.3B.sum reports/1995.doe3.3B.sum reports/1995.mag.3B.sum reports/1995.news.3B.sum +mkinstalldirs = $(SHELL) $(top_srcdir)/config/mkinstalldirs +CONFIG_HEADER = ../config_auto.h +CONFIG_CLEAN_FILES = +DIST_COMMON = README Makefile.am Makefile.in + + +DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST) + +TAR = tar +GZIP_ENV = --best +all: all-redirect +.SUFFIXES: +$(srcdir)/Makefile.in: # Makefile.am $(top_srcdir)/configure.ac $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOMAKE) --gnu testing/Makefile + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status $(BUILT_SOURCES) + cd $(top_builddir) \ + && 
CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +tags: TAGS +TAGS: + + +distdir = $(top_builddir)/$(PACKAGE_NAME)-$(PACKAGE_VERSION)/$(subdir) + +subdir = testing + +distdir: $(DISTFILES) + here=`cd $(top_builddir) && pwd`; \ + top_distdir=`cd $(top_distdir) && pwd`; \ + distdir=`cd $(distdir) && pwd`; \ + cd $(top_srcdir) \ + && $(AUTOMAKE) --include-deps --build-dir=$$here --srcdir-name=$(top_srcdir) --output-dir=$$top_distdir --gnu testing/Makefile + $(mkinstalldirs) $(distdir)/reports + @for file in $(DISTFILES); do \ + d=$(srcdir); \ + if test -d $$d/$$file; then \ + cp -pr $$d/$$file $(distdir)/$$file; \ + else \ + test -f $(distdir)/$$file \ + || ln $$d/$$file $(distdir)/$$file 2> /dev/null \ + || cp -p $$d/$$file $(distdir)/$$file || :; \ + fi; \ + done +info-am: +info: info-am +dvi-am: +dvi: dvi-am +check-am: all-am +check: check-am +installcheck-am: +installcheck: installcheck-am +install-exec-am: +install-exec: install-exec-am + +install-data-am: +install-data: install-data-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am +install: install-am +uninstall-am: +uninstall: uninstall-am +all-am: Makefile +all-redirect: all-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install +installdirs: + + +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f Makefile $(CONFIG_CLEAN_FILES) + -rm -f config.cache config.log stamp-h stamp-h[0-9]* + +maintainer-clean-generic: +mostlyclean-am: mostlyclean-generic + +mostlyclean: mostlyclean-am + +clean-am: clean-generic mostlyclean-am + +clean: clean-am + +distclean-am: distclean-generic clean-am + +distclean: distclean-am + +maintainer-clean-am: maintainer-clean-generic distclean-am + @echo "This command is intended for maintainers to use;" + @echo "it deletes files that may require special tools to rebuild." 
+ +maintainer-clean: maintainer-clean-am + +.PHONY: tags distdir info-am info dvi-am dvi check check-am \ +installcheck-am installcheck install-exec-am install-exec \ +install-data-am install-data install-am install uninstall-am uninstall \ +all-redirect all-am all installdirs mostlyclean-generic \ +distclean-generic clean-generic maintainer-clean-generic clean \ +mostlyclean distclean maintainer-clean + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/testing/Makefile.am b/testing/Makefile.am new file mode 100644 index 000000000..d7254b532 --- /dev/null +++ b/testing/Makefile.am @@ -0,0 +1,2 @@ + +EXTRA_DIST = README counttestset.sh reorgdata.sh runalltests.sh runtestset.sh reports/1995.bus.3B.sum reports/1995.doe3.3B.sum reports/1995.mag.3B.sum reports/1995.news.3B.sum diff --git a/testing/Makefile.in b/testing/Makefile.in new file mode 100644 index 000000000..061d682aa --- /dev/null +++ b/testing/Makefile.in @@ -0,0 +1,185 @@ +# Makefile.in generated automatically by automake 1.4-p6 from Makefile.am + +# Copyright (C) 1994, 1995-8, 1999, 2001 Free Software Foundation, Inc. +# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. 
+ + +SHELL = @SHELL@ + +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ +prefix = @prefix@ +exec_prefix = @exec_prefix@ + +bindir = @bindir@ +sbindir = @sbindir@ +libexecdir = @libexecdir@ +datadir = @datadir@ +sysconfdir = @sysconfdir@ +sharedstatedir = @sharedstatedir@ +localstatedir = @localstatedir@ +libdir = @libdir@ +infodir = @infodir@ +mandir = @mandir@ +includedir = @includedir@ +oldincludedir = /usr/include + +DESTDIR = + +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ + +top_builddir = .. + +ACLOCAL = @ACLOCAL@ +AUTOCONF = @AUTOCONF@ +AUTOMAKE = @AUTOMAKE@ +AUTOHEADER = @AUTOHEADER@ + +INSTALL = @INSTALL@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ $(AM_INSTALL_PROGRAM_FLAGS) +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_SCRIPT = @INSTALL_SCRIPT@ +transform = @program_transform_name@ + +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +host_alias = @host_alias@ +host_triplet = @host@ +CC = @CC@ +CXX = @CXX@ +HAVE_LIB = @HAVE_LIB@ +LIB = @LIB@ +LTLIB = @LTLIB@ +MAINT = @MAINT@ +MAKEINFO = @MAKEINFO@ +PACKAGE = @PACKAGE@ +PACKAGE_DATE = @PACKAGE_DATE@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PACKAGE_YEAR = @PACKAGE_YEAR@ +RANLIB = @RANLIB@ +VERSION = @VERSION@ + +EXTRA_DIST = README counttestset.sh reorgdata.sh runalltests.sh runtestset.sh reports/1995.bus.3B.sum reports/1995.doe3.3B.sum reports/1995.mag.3B.sum reports/1995.news.3B.sum +mkinstalldirs = $(SHELL) $(top_srcdir)/config/mkinstalldirs +CONFIG_HEADER = ../config_auto.h +CONFIG_CLEAN_FILES = +DIST_COMMON = README Makefile.am Makefile.in + + +DISTFILES = $(DIST_COMMON) $(SOURCES) $(HEADERS) $(TEXINFOS) $(EXTRA_DIST) + +TAR = tar +GZIP_ENV = --best +all: all-redirect +.SUFFIXES: +$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am $(top_srcdir)/configure.ac $(ACLOCAL_M4) + cd $(top_srcdir) && $(AUTOMAKE) --gnu testing/Makefile + +Makefile: 
$(srcdir)/Makefile.in $(top_builddir)/config.status $(BUILT_SOURCES) + cd $(top_builddir) \ + && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status + +tags: TAGS +TAGS: + + +distdir = $(top_builddir)/$(PACKAGE_NAME)-$(PACKAGE_VERSION)/$(subdir) + +subdir = testing + +distdir: $(DISTFILES) + here=`cd $(top_builddir) && pwd`; \ + top_distdir=`cd $(top_distdir) && pwd`; \ + distdir=`cd $(distdir) && pwd`; \ + cd $(top_srcdir) \ + && $(AUTOMAKE) --include-deps --build-dir=$$here --srcdir-name=$(top_srcdir) --output-dir=$$top_distdir --gnu testing/Makefile + $(mkinstalldirs) $(distdir)/reports + @for file in $(DISTFILES); do \ + d=$(srcdir); \ + if test -d $$d/$$file; then \ + cp -pr $$d/$$file $(distdir)/$$file; \ + else \ + test -f $(distdir)/$$file \ + || ln $$d/$$file $(distdir)/$$file 2> /dev/null \ + || cp -p $$d/$$file $(distdir)/$$file || :; \ + fi; \ + done +info-am: +info: info-am +dvi-am: +dvi: dvi-am +check-am: all-am +check: check-am +installcheck-am: +installcheck: installcheck-am +install-exec-am: +install-exec: install-exec-am + +install-data-am: +install-data: install-data-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am +install: install-am +uninstall-am: +uninstall: uninstall-am +all-am: Makefile +all-redirect: all-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) AM_INSTALL_PROGRAM_FLAGS=-s install +installdirs: + + +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -rm -f Makefile $(CONFIG_CLEAN_FILES) + -rm -f config.cache config.log stamp-h stamp-h[0-9]* + +maintainer-clean-generic: +mostlyclean-am: mostlyclean-generic + +mostlyclean: mostlyclean-am + +clean-am: clean-generic mostlyclean-am + +clean: clean-am + +distclean-am: distclean-generic clean-am + +distclean: distclean-am + +maintainer-clean-am: maintainer-clean-generic distclean-am + @echo "This command is intended for maintainers to use;" + @echo "it deletes files that may require special tools to rebuild." 
+ +maintainer-clean: maintainer-clean-am + +.PHONY: tags distdir info-am info dvi-am dvi check check-am \ +installcheck-am installcheck install-exec-am install-exec \ +install-data-am install-data install-am install uninstall-am uninstall \ +all-redirect all-am all installdirs mostlyclean-generic \ +distclean-generic clean-generic maintainer-clean-generic clean \ +mostlyclean distclean maintainer-clean + + +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. +.NOEXPORT: diff --git a/testing/README b/testing/README new file mode 100644 index 000000000..48024a908 --- /dev/null +++ b/testing/README @@ -0,0 +1,43 @@ +How to run UNLV tests. + +The scripts in this directory make it possible to duplicate the tests +published in the Fourth Annual Test of OCR Accuracy. +See http://www.isri.unlv.edu/downloads/AT-1995.pdf +but first you have to get the tools and data from UNLV: + +Step 1: To download the images, go to +http://www.isri.unlv.edu/ISRI/OCRtk +and get 3b.tgz, Bb.tgz, Mb.tgz and Nb.tgz. + +Step 2: Extract the files. It doesn't really matter where +in your filesystem you put them, but they must go under a common +root so you have directories 3, B, M and N in, for example, +/users/me/ISRI-OCRtk. + +Step 3: Reorganize the files. +The lack of tif extensions on the images is inconvenient, so there +is a script to reorganize the data to match the rest of the test +scripts. +cd to /users/me/ISRI-OCRtk or wherever 3, B, M and N ended up and run +/blah/blah/tesseract-ocr/testing/reorgdata.sh 3B +This makes directories doe3.3B, bus.3B, mag.3B and news.3B. +You can now get rid of 3, B, M, and N unless you want to get some of the +other scanning resolutions out of them. 
+
+Step 4: Download the ISRI toolkit from:
+http://www.isri.unlv.edu/downloads/ftk-1.0.tgz
+
+Step 5: If they work for you, use the binaries directly from the bin
+directory and put them in tesseract-ocr/testing/unlv
+otherwise build the tools for yourself and put them there.
+
+Step 6: cd back to your main tesseract-ocr dir and build tesseract.
+
+Step 7: run testing/runalltests.sh with the root data dir and testname:
+testing/runalltests.sh /users/me/ISRI-OCRtk tess2.0
+and go to the gym, have lunch etc.
+
+Step 8: There should be a file
+testing/reports/tess2.0.summary that contains the final summarized accuracy
+report and comparison with the 1995 results.
+
diff --git a/testing/counttestset.sh b/testing/counttestset.sh
new file mode 100755
index 000000000..408a93c17
--- /dev/null
+++ b/testing/counttestset.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# File: counttestset.sh
+# Description: Script to count the errors on a single UNLV set.
+# Author: Ray Smith
+# Created: Wed Jun 13 11:58:01 PDT 2007
+#
+# (C) Copyright 2007, Google Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ $# -ne 1 ]
+then
+  echo "Usage:$0 pagesfile"
+  exit 1
+fi
+if [ ! -d ccmain ]
+then
+  echo "Run $0 from the tesseract-ocr root directory!"
+  exit 1
+fi
+if [ ! -r testing/unlv/accuracy ] && [ ! -r testing/unlv/accuracy.exe ]
+then
+  echo "Please download the UNLV accuracy tools (and build) to testing/unlv"
+  exit 1
+fi
+pages=$1
+
+imdir=${pages%/pages}
+setname=${imdir##*/}
+resdir=testing/results/$setname
+mkdir -p testing/reports
+echo "Counting on set $setname in directory $imdir to $resdir"
+accfiles=()
+wafiles=()
+while read -r page dir
+do
+  if [ "$dir" ]
+  then
+    srcdir="$imdir/$dir"
+  else
+    srcdir="$imdir"
+  fi
+  # $srcdir/$page.txt is presumably the truth text, $resdir/$page.txt the OCR output.
+  # Count character errors.
+  testing/unlv/accuracy "$srcdir/$page.txt" "$resdir/$page.txt" "$resdir/$page.acc"
+  accfiles+=("$resdir/$page.acc")
+  # Count word errors.
+  testing/unlv/wordacc "$srcdir/$page.txt" "$resdir/$page.txt" "$resdir/$page.wa"
+  wafiles+=("$resdir/$page.wa")
+done <"$pages"
+testing/unlv/accsum "${accfiles[@]}" >"testing/reports/$setname.characc"
+testing/unlv/wordaccsum "${wafiles[@]}" >"testing/reports/$setname.wordacc"
+
+
diff --git a/testing/reorgdata.sh b/testing/reorgdata.sh
new file mode 100755
index 000000000..141de4a6f
--- /dev/null
+++ b/testing/reorgdata.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Copies each downloaded UNLV set in the current directory to <newname>.<scantype>.
+if [ $# -ne 1 ]
+then
+  echo "Usage:$0 scantype"
+  echo "UNLV data comes in several scan types:"
+  echo "3B=300 dpi binary"
+  echo "3A=adaptive thresholded 300 dpi"
+  echo "3G=300 dpi grey"
+  echo "4B=400dpi binary"
+  echo "2B=200dpi binary"
+  echo "For now we only use 3B"
+  exit 1
+fi
+ext=$1
+
+#There are several test sets without meaningful names, so rename
+#them with something a bit more meaningful.
+#Each s is oldname/newname
+for s in 3/doe3 B/bus M/mag N/news L/legal R/rep S/spn Z/zset
+do
+  old=${s%/*}
+  #if this set was downloaded then process it.
+  if [ -r "$old/PAGES" ]
+  then
+    new=${s#*/}.$ext
+    mkdir -p "$new"
+    echo "Set $old -> $new"
+    #The pages file had - instead of _ so fix it and add the extension.
+    for page in $(cat "$old/PAGES")
+    do
+      echo "${page%-*}_${page#*-}.$ext"
+    done >"$new/pages"
+    for f in $(cat "$new/pages")
+    do
+      #Put a tif extension on the tif files.
+      cp "$old/${old}_B/$f" "$new/$f.tif"
+      #Put a uzn extension on the zone files.
+      cp "$old/${old}_B/${f}Z" "$new/$f.uzn"
+      #Cat all the truth files together and put into a single txt file.
+      cat "$old/${old}_GT/${f%.$ext}".Z* >"$new/$f.txt"
+    done
+  fi
+done
diff --git a/testing/reports/1995.bus.3B.sum b/testing/reports/1995.bus.3B.sum
new file mode 100644
index 000000000..00eb97a86
--- /dev/null
+++ b/testing/reports/1995.bus.3B.sum
@@ -0,0 +1 @@
+1995 bus.3B 5959 98.14% 0.00% 1631 96.83% 0.00% 1293 95.73% 0.00%
diff --git a/testing/reports/1995.doe3.3B.sum b/testing/reports/1995.doe3.3B.sum
new file mode 100644
index 000000000..7eb753aee
--- /dev/null
+++ b/testing/reports/1995.doe3.3B.sum
@@ -0,0 +1 @@
+1995 doe3.3B 36349 97.52% 0.00% 7826 96.34% 0.00% 7042 94.87% 0.00%
diff --git a/testing/reports/1995.mag.3B.sum b/testing/reports/1995.mag.3B.sum
new file mode 100644
index 000000000..e718c5433
--- /dev/null
+++ b/testing/reports/1995.mag.3B.sum
@@ -0,0 +1 @@
+1995 mag.3B 15043 97.74% 0.00% 4566 96.01% 0.00% 3379 94.99% 0.00%
diff --git a/testing/reports/1995.news.3B.sum b/testing/reports/1995.news.3B.sum
new file mode 100644
index 000000000..bd0b7c68d
--- /dev/null
+++ b/testing/reports/1995.news.3B.sum
@@ -0,0 +1 @@
+1995 news.3B 6432 98.69% 0.00% 1946 97.68% 0.00% 1502 96.94% 0.00%
diff --git a/testing/runalltests.sh b/testing/runalltests.sh
new file mode 100755
index 000000000..6a3fdc1fd
--- /dev/null
+++ b/testing/runalltests.sh
@@ -0,0 +1,110 @@
+#!/bin/bash
+# File: runalltests.sh
+# Description: Script to run a set of UNLV test sets.
+# Author: Ray Smith
+# Created: Thu Jun 14 08:21:01 PDT 2007
+#
+# (C) Copyright 2007, Google Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ $# -ne 2 ]
+then
+  echo "Usage:$0 unlv-data-dir version-id"
+  exit 1
+fi
+if [ ! -d ccmain ]
+then
+  echo "Run $0 from the tesseract-ocr root directory!"
+  exit 1
+fi
+if [ ! -r ccmain/tesseract ] && [ ! -r tesseract.exe ]
+then
+  echo "Please build tesseract before running $0"
+  exit 1
+fi
+if [ ! -r testing/unlv/accuracy ] && [ ! -r testing/unlv/accuracy.exe ]
+then
+  echo "Please download the UNLV accuracy tools (and build) to testing/unlv"
+  exit 1
+fi
+
+#deltapc new old prints the %change from old to new (0.00 if old is 0 or empty)
+deltapc() {
+awk -v new="$1" -v old="$2" 'BEGIN {
+printf("%.2f", (old+0 == 0) ? 0 : 100.0*(new-old)/old);
+}'
+}
+
+imdir="$1"
+vid="$2"
+bindir=${0%/*}
+if [ "$bindir" = "$0" ]
+then
+  bindir="./"
+fi
+rdir=testing/reports
+testsets="bus.3B doe3.3B mag.3B news.3B"
+
+totalerrs=0
+totalwerrs=0
+totalnswerrs=0
+totalolderrs=0
+totaloldwerrs=0
+totaloldnswerrs=0
+for set in $testsets
+do
+  if [ -r "$imdir/$set/pages" ]
+  then
+    # Run tesseract on all the pages.
+    "$bindir/runtestset.sh" "$imdir/$set/pages"
+    # Count the errors on all the pages.
+    "$bindir/counttestset.sh" "$imdir/$set/pages"
+    # Get the old character word and nonstop word errors (fields 3, 6 and 9).
+    olderrs=$(awk '{print $3}' "testing/reports/1995.$set.sum")
+    oldwerrs=$(awk '{print $6}' "testing/reports/1995.$set.sum")
+    oldnswerrs=$(awk '{print $9}' "testing/reports/1995.$set.sum")
+    # Get the new character word and nonstop word errors and accuracy.
+    cherrs=$(head -4 "testing/reports/$set.characc" |tail -1 |cut -c1-9 |
+            tr -d '[:blank:]')
+    chacc=$(head -5 "testing/reports/$set.characc" |tail -1 |cut -c1-9 |
+            tr -d '[:blank:]')
+    wderrs=$(head -4 "testing/reports/$set.wordacc" |tail -1 |cut -c1-9 |
+            tr -d '[:blank:]')
+    wdacc=$(head -5 "testing/reports/$set.wordacc" |tail -1 |cut -c1-9 |
+            tr -d '[:blank:]')
+    nswderrs=$(grep Total "testing/reports/$set.wordacc" |head -2 |tail -1 |
+            cut -c10-17 |tr -d '[:blank:]')
+    nswdacc=$(grep Total "testing/reports/$set.wordacc" |head -2 |tail -1 |
+            cut -c19-26 |tr -d '[:blank:]')
+    # Compute the percent change.
+    chdelta=$(deltapc "$cherrs" "$olderrs")
+    wdelta=$(deltapc "$wderrs" "$oldwerrs")
+    nswdelta=$(deltapc "$nswderrs" "$oldnswerrs")
+    sumfile=$rdir/$vid.$set.sum
+    echo "$vid $set $cherrs $chacc $chdelta% $wderrs $wdacc\
+ $wdelta% $nswderrs $nswdacc $nswdelta%" >"$sumfile"
+    # Sum totals over all the testsets; a missing count is treated as 0.
+    totalerrs=$((totalerrs + ${cherrs:-0}))
+    totalwerrs=$((totalwerrs + ${wderrs:-0}))
+    totalnswerrs=$((totalnswerrs + ${nswderrs:-0}))
+    totalolderrs=$((totalolderrs + ${olderrs:-0}))
+    totaloldwerrs=$((totaloldwerrs + ${oldwerrs:-0}))
+    totaloldnswerrs=$((totaloldnswerrs + ${oldnswerrs:-0}))
+  fi
+done
+# Compute grand total percent change.
+chdelta=$(deltapc "$totalerrs" "$totalolderrs")
+wdelta=$(deltapc "$totalwerrs" "$totaloldwerrs")
+nswdelta=$(deltapc "$totalnswerrs" "$totaloldnswerrs")
+tfile=$rdir/$vid.total.sum
+echo "$vid Total $totalerrs - $chdelta% $totalwerrs\
+ - $wdelta% $totalnswerrs - $nswdelta%" >"$tfile"
+cat "$rdir"/1995.*.sum "$rdir/$vid".*.sum >"$rdir/$vid.summary"
diff --git a/testing/runtestset.sh b/testing/runtestset.sh
new file mode 100755
index 000000000..b44d51c9e
--- /dev/null
+++ b/testing/runtestset.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# File: runtestset.sh
+# Description: Script to run tesseract on a single UNLV set.
+# Author: Ray Smith
+# Created: Wed Jun 13 10:13:01 PDT 2007
+#
+# (C) Copyright 2007, Google Inc.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ $# -ne 1 ]
+then
+  echo "Usage:$0 pagesfile"
+  exit 1
+fi
+if [ ! -d ccmain ]
+then
+  echo "Run $0 from the tesseract-ocr root directory!"
+  exit 1
+fi
+if [ ! -r ccmain/tesseract ]
+then
+  if [ ! -r tesseract.exe ]
+  then
+    echo "Please build tesseract before running $0"
+    exit 1
+  else
+    tess="./tesseract.exe"
+  fi
+else
+  tess="ccmain/tesseract"
+  export TESSDATA_PREFIX="$PWD/"
+fi
+
+pages=$1
+
+imdir=${pages%/pages}
+setname=${imdir##*/}
+resdir=testing/results/$setname
+echo "Testing on set $setname in directory $imdir to $resdir"
+mkdir -p "$resdir"
+while read -r page dir
+do
+  # A pages file may be a list of files with subdirs or maybe just
+  # a plain list of files so accommodate both.
+  if [ "$dir" ]
+  then
+    srcdir="$imdir/$dir"
+  else
+    srcdir="$imdir"
+  fi
+  # Input image is $srcdir/$page.tif; the output base name is $resdir/$page.
+  "$tess" "$srcdir/$page.tif" "$resdir/$page" nobatch unlv
+done <"$pages"