/**********************************************************************
 * File:        tfacepp.cpp  (Formerly tface++.c)
 * Description: C++ side of the C/C++ Tess/Editor interface.
 * Author:      Ray Smith
 * Created:     Thu Apr 23 15:39:23 BST 1992
 *
 * (C) Copyright 1992, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include "mfcpch.h"
#ifdef __UNIX__
#include <assert.h>  // for assert() in recog_word
#endif
#include "errcode.h"
#include "ratngs.h"
#include "reject.h"
#include "werd.h"
#include "tfacep.h"
#include "tstruct.h"
#include "tfacepp.h"
#include "tessvars.h"
#include "globals.h"
#include "reject.h"
#include "tesseractclass.h"

#define EXTERN

EXTERN BOOL_VAR (tessedit_override_permuter, TRUE, "According to dict_word");

// Words with more blobs than this are split in two by
// split_and_recog_word before being recognised.
#define MAX_UNDIVIDED_LENGTH 24

namespace tesseract {

/**********************************************************************
 * recog_word
 *
 * Recognise one word and return its best choice.
 *
 * A word with an empty blob list gets empty best/raw choices and a
 * poly copy of itself as outword; any other word is handed to
 * recog_word_recursive. Afterwards the best choice length is asserted
 * to equal both the number of blobs in outword and the number of
 * entries in blob_choices, the reject blobs of word are deep-copied
 * into outword, and, if tessedit_override_permuter is set, the permuter
 * of a non-dictionary choice is replaced by the result of dict_word()
 * when that reports a dawg permuter and alpha_count() is positive.
 *
 * word          word to recognise
 * denorm        de-normaliser
 * matcher       matcher function
 * tester        tester function
 * trainer       trainer function
 * testing       true if answer driven
 * raw_choice    (out) raw result
 * blob_choices  (in/out) list of blob choice lists
 * outword       (out) bln word output
 **********************************************************************/
WERD_CHOICE *Tesseract::recog_word(
    WERD *word,
    DENORM *denorm,
    POLY_MATCHER matcher,
    POLY_TESTER tester,
    POLY_TESTER trainer,
    BOOL8 testing,
    WERD_CHOICE *&raw_choice,
    BLOB_CHOICE_LIST_CLIST *blob_choices,
    WERD *&outword
    ) {
  WERD_CHOICE *word_choice;
  uinT8 perm_type;
  uinT8 real_dict_perm_type;

  if (word->blob_list()->empty()) {
    // Nothing to recognise: fabricate empty results.
    word_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f,
                                  TOP_CHOICE_PERM, unicharset);
    raw_choice = new WERD_CHOICE("", NULL, 10.0f, -1.0f,
                                 TOP_CHOICE_PERM, unicharset);
    outword = word->poly_copy(denorm->row()->x_height());
  } else {
    word_choice = recog_word_recursive(word, denorm, matcher, tester,
                                       trainer, testing, raw_choice,
                                       blob_choices, outword);
  }

  // Report the details before the assertions below abort the run.
  if ((word_choice->length() != outword->blob_list()->length()) ||
      (word_choice->length() != blob_choices->length())) {
    tprintf
      ("recog_word ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",
       word_choice->debug_string(unicharset).string(),
       word_choice->length(), outword->blob_list()->length(),
       blob_choices->length());
  }
  ASSERT_HOST(word_choice->length() == outword->blob_list()->length());
  ASSERT_HOST(word_choice->length() == blob_choices->length());

  /* Copy any reject blobs into the outword */
  outword->rej_blob_list()->deep_copy(word->rej_blob_list(),
                                      &PBLOB::deep_copy);

  if (tessedit_override_permuter) {
    /* Override the permuter type if a straight dictionary check disagrees. */
    perm_type = word_choice->permuter();
    if ((perm_type != SYSTEM_DAWG_PERM) &&
        (perm_type != FREQ_DAWG_PERM) &&
        (perm_type != USER_DAWG_PERM)) {
      real_dict_perm_type = dict_word(*word_choice);
      if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
           (real_dict_perm_type == FREQ_DAWG_PERM) ||
           (real_dict_perm_type == USER_DAWG_PERM)) &&
          (alpha_count(word_choice->unichar_string().string(),
                       word_choice->unichar_lengths().string()) > 0)) {
        word_choice->set_permuter(real_dict_perm_type);  // use dict perm
      }
    }
    if (tessedit_rejection_debug &&
        perm_type != word_choice->permuter()) {
      tprintf("Permuter Type Flipped from %d to %d\n",
              perm_type, word_choice->permuter());
    }
  }
  assert((word_choice == NULL) == (raw_choice == NULL));
  return word_choice;
}


/**********************************************************************
 * recog_word_recursive
 *
 * Install the matcher/tester/trainer, denorm and word into the tess_*
 * globals, then either
 *  - hand the word to split_and_recog_word if it has more than
 *    MAX_UNDIVIDED_LENGTH blobs, or
 *  - convert the word to tess form, run cc_recog on it and convert the
 *    output back to editor form.
 *
 * In the second case the blob choice lists produced by cc_recog are
 * appended to blob_choices, raw_choice and the returned best choice are
 * padded with spaces up to the number of blobs in outword, and the best
 * choice is marked bad if it is longer than the word or if the number
 * of appended choice lists does not match the number of blobs (in which
 * case dummy lists are added or surplus lists deleted).
 *
 * Parameters are as for recog_word.
 **********************************************************************/
WERD_CHOICE *Tesseract::recog_word_recursive(
    WERD *word,                            // word to do
    DENORM *denorm,                        // de-normaliser
    POLY_MATCHER matcher,                  // matcher function
    POLY_TESTER tester,                    // tester function
    POLY_TESTER trainer,                   // trainer function
    BOOL8 testing,                         // true if answer driven
    WERD_CHOICE *&raw_choice,              // raw result
    BLOB_CHOICE_LIST_CLIST *blob_choices,  // list of blob lists
    WERD *&outword                         // bln word output
    ) {
  inT32 initial_blob_choice_len;
  inT32 word_length;                      // no of blobs
  BLOB_CHOICE_LIST_VECTOR *tess_ratings;  // tess results
  TWERD *tessword;                        // tess format
  BLOB_CHOICE_LIST_C_IT blob_choices_it;  // iterator

  tess_matcher = matcher;                 // install matcher
  tess_tester = testing ? tester : NULL;
  tess_trainer = testing ? trainer : NULL;
  tess_denorm = denorm;
  tess_word = word;
  //      blob_matchers[1]=call_matcher;

  if (word->blob_list()->length() > MAX_UNDIVIDED_LENGTH) {
    // Too many blobs to do in one go: recognise it as two halves.
    return split_and_recog_word(word, denorm, matcher, tester, trainer,
                                testing, raw_choice, blob_choices, outword);
  }

  UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
  WERD_CHOICE *best_choice = new WERD_CHOICE();
  raw_choice = new WERD_CHOICE();
  // blob_choices may already hold lists from an earlier part of a split
  // word, so all length checks below are relative to this starting size.
  initial_blob_choice_len = blob_choices->length();
  tessword = make_tess_word(word, NULL);
  tess_ratings = cc_recog(tessword, best_choice, raw_choice,
                          testing && tester != NULL,
                          testing && trainer != NULL,
                          word->flag(W_EOL));

  outword = make_ed_word(tessword, word);  // convert word
  if (outword == NULL) {
    outword = word->poly_copy(denorm->row()->x_height());
  }
  delete_word(tessword);                   // get rid of it
  word_length = outword->blob_list()->length();  // no of blobs

  // Put BLOB_CHOICE_LISTs from tess_ratings into blob_choices.
  blob_choices_it.set_to_list(blob_choices);
  for (int i = 0; i < tess_ratings->length(); ++i) {
    blob_choices_it.add_to_end(tess_ratings->get(i));
  }
  delete tess_ratings;

  // Pad raw_choice with spaces if needed.
  if (raw_choice->length() < word_length) {
    while (raw_choice->length() < word_length) {
      raw_choice->append_unichar_id(space_id, 1, 0.0,
                                    raw_choice->certainty());
    }
    raw_choice->populate_unichars(unicharset);
  }

  // Do sanity checks and minor fixes on best_choice.
  if (best_choice->length() > word_length) {
    tprintf("recog_word: Discarded long string \"%s\""
            " (%d characters vs %d blobs)\n",
            best_choice->unichar_string().string (),
            best_choice->length(), word_length);
    best_choice->make_bad();  // should never happen
    tprintf("Word is at (%g,%g)\n",
            denorm->origin(),
            denorm->y(word->bounding_box().bottom(), 0.0));
  }

  if (blob_choices->length() - initial_blob_choice_len != word_length) {
    best_choice->make_bad();  // force rejection
    // NOTE(review): this prints the total list length, whereas the
    // condition above compares the length added by this call.
    tprintf ("recog_word: Choices list len:%d; blob lists len:%d\n",
             blob_choices->length(), word_length);
    blob_choices_it.set_to_list(blob_choices);  // list of lists
    while (blob_choices->length() - initial_blob_choice_len < word_length) {
      blob_choices_it.add_to_end(new BLOB_CHOICE_LIST());  // add a fake one
      tprintf("recog_word: Added dummy choice list\n");
    }
    while (blob_choices->length() - initial_blob_choice_len > word_length) {
      blob_choices_it.move_to_last();  // should never happen
      delete blob_choices_it.extract();
      tprintf("recog_word: Deleted choice list\n");
    }
  }

  // Pad best_choice with spaces if needed.
  if (best_choice->length() < word_length) {
    while (best_choice->length() < word_length) {
      best_choice->append_unichar_id(space_id, 1, 0.0,
                                     best_choice->certainty());
    }
    best_choice->populate_unichars(unicharset);
  }

  return best_choice;
}


/**********************************************************************
 * split_and_recog_word
 *
 * Split the word in two at the largest gap between the bounding boxes
 * of consecutive blobs (left edge of the next blob minus right edge of
 * the current one), recognise each part with recog_word_recursive and
 * combine the results: the two best choices and the two raw choices are
 * joined with operator+=, and the second output word is joined onto
 * the first. blob_choices receives the choice lists of both parts.
 *
 * Parameters are as for recog_word.
 **********************************************************************/
WERD_CHOICE *Tesseract::split_and_recog_word(
    WERD *word,                            // word to do
    DENORM *denorm,                        // de-normaliser
    POLY_MATCHER matcher,                  // matcher function
    POLY_TESTER tester,                    // tester function
    POLY_TESTER trainer,                   // trainer function
    BOOL8 testing,                         // true if answer driven
    WERD_CHOICE *&raw_choice,              // raw result
    BLOB_CHOICE_LIST_CLIST *blob_choices,  // list of blob lists
    WERD *&outword                         // bln word output
    ) {
  //   inT32                                                      outword1_len;
  //   inT32                                                      outword2_len;
  WERD *first_word;              // poly copy of word
  WERD *second_word;             // fabricated word
  WERD *outword2;                // 2nd output word
  PBLOB *blob;
  WERD_CHOICE *result;           // return value
  WERD_CHOICE *result2;          // output of 2nd word
  WERD_CHOICE *raw_choice2;      // raw version of 2nd
  float gap;                     // blob gap
  float bestgap;                 // biggest gap
  PBLOB_LIST new_blobs;          // list of gathered blobs
  PBLOB_IT blob_it;              // iterator
  PBLOB_IT new_blob_it = &new_blobs;

  first_word = word->poly_copy(denorm->row()->x_height());
  blob_it.set_to_list(first_word->blob_list());
  bestgap = -MAX_INT32;
  while (!blob_it.at_last()) {
    blob = blob_it.data();
    // gap to next
    gap = blob_it.data_relative(1)->bounding_box().left() -
          blob->bounding_box().right();
    blob_it.forward();
    if (gap > bestgap) {
      bestgap = gap;             // find biggest
      new_blob_it = blob_it;     // save position (blob after the gap)
    }
  }
  // take 2nd half: from the saved position through the last blob
  new_blobs.assign_to_sublist(&new_blob_it, &blob_it);
  // make it a word
  second_word = new WERD(&new_blobs, 1, NULL);
  ASSERT_HOST(word->blob_list()->length() ==
              first_word->blob_list()->length() +
              second_word->blob_list()->length());

  result = recog_word_recursive(first_word, denorm, matcher,
                                tester, trainer, testing, raw_choice,
                                blob_choices, outword);
  delete first_word;             // done that one
  result2 = recog_word_recursive(second_word, denorm, matcher,
                                 tester, trainer, testing, raw_choice2,
                                 blob_choices, outword2);
  delete second_word;            // done that too
  *result += *result2;           // combine ratings
  delete result2;
  *raw_choice += *raw_choice2;
  delete raw_choice2;            // finished with it
  //   outword1_len= outword->blob_list()->length();
  //   outword2_len= outword2->blob_list()->length();
  outword->join_on(outword2);    // join words
  delete outword2;
  //   if ( outword->blob_list()->length() != outword1_len + outword2_len )
  //      tprintf( "Split&Recog: part1len=%d; part2len=%d; combinedlen=%d\n",
  //                                outword1_len, outword2_len, outword->blob_list()->length() );
  //   ASSERT_HOST( outword->blob_list()->length() == outword1_len + outword2_len );
  return result;
}

}  // namespace tesseract


/**********************************************************************
 * call_tester
 *
 * Called from Tess with a blob in tess form.
 * Convert the blob to editor form.
 * Call the tester setup by the segmenter in tess_tester.
 **********************************************************************/
#if 0  // dead code
void call_tester(                     //call a tester
                 const STRING& filename,
                 TBLOB *tessblob,     //blob to test
                 BOOL8 correct_blob,  //true if good
                 char *text,          //source text
                 inT32 count,         //chars in text
                 LIST result          //output of matcher
                ) {
  PBLOB *blob;                   //converted blob
  BLOB_CHOICE_LIST ratings;      //matcher result

  blob = make_ed_blob (tessblob);//convert blob
  if (blob == NULL)
    return;
                                 //make it right type
  convert_choice_list(result, ratings);
  if (tess_tester != NULL)
    (*tess_tester) (filename, blob, tess_denorm, correct_blob, text, count, &ratings);
  delete blob;                   //don't need that now
}
#endif

/**********************************************************************
 * call_train_tester
 *
 * Called from Tess with a blob in tess form.
 * Convert the blob to editor form.
 * Call the trainer setup by the segmenter in tess_trainer.
 **********************************************************************/
#if 0  // dead code
void call_train_tester(                     //call a tester
                       const STRING& filename,
                       TBLOB *tessblob,     //blob to test
                       BOOL8 correct_blob,  //true if good
                       char *text,          //source text
                       inT32 count,         //chars in text
                       LIST result          //output of matcher
                      ) {
  PBLOB *blob;                   //converted blob
  BLOB_CHOICE_LIST ratings;      //matcher result

  blob = make_ed_blob (tessblob);//convert blob
  if (blob == NULL)
    return;
                                 //make it right type
  convert_choice_list(result, ratings);
  if (tess_trainer != NULL)
    (*tess_trainer) (filename, blob, tess_denorm, correct_blob, text, count, &ratings);
  delete blob;                   //don't need that now
}
#endif