/////////////////////////////////////////////////////////////////////// // File: blamer.cpp // Description: Module allowing precise error causes to be allocated. // Author: Rike Antonova // Refactored: Ray Smith // Created: Mon Feb 04 14:37:01 PST 2013 // // (C) Copyright 2013, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // /////////////////////////////////////////////////////////////////////// #include "blamer.h" #include "blobs.h" #include "matrix.h" #include "normalis.h" #include "pageres.h" // Names for each value of IncorrectResultReason enum. Keep in sync. const char kBlameCorrect[] = "corr"; const char kBlameClassifier[] = "cl"; const char kBlameChopper[] = "chop"; const char kBlameClassLMTradeoff[] = "cl/LM"; const char kBlamePageLayout[] = "pglt"; const char kBlameSegsearchHeur[] = "ss_heur"; const char kBlameSegsearchPP[] = "ss_pp"; const char kBlameClassOldLMTradeoff[] = "cl/old_LM"; const char kBlameAdaption[] = "adapt"; const char kBlameNoTruthSplit[] = "no_tr_spl"; const char kBlameNoTruth[] = "no_tr"; const char kBlameUnknown[] = "unkn"; const char * const kIncorrectResultReasonNames[] = { kBlameCorrect, kBlameClassifier, kBlameChopper, kBlameClassLMTradeoff, kBlamePageLayout, kBlameSegsearchHeur, kBlameSegsearchPP, kBlameClassOldLMTradeoff, kBlameAdaption, kBlameNoTruthSplit, kBlameNoTruth, kBlameUnknown }; const char *BlamerBundle::IncorrectReasonName(IncorrectResultReason irr) { return kIncorrectResultReasonNames[irr]; } const char *BlamerBundle::IncorrectReason() const { return kIncorrectResultReasonNames[incorrect_result_reason_]; } // Functions to setup the blamer. // Whole word string, whole word bounding box. void BlamerBundle::SetWordTruth(const UNICHARSET& unicharset, const char* truth_str, const TBOX& word_box) { truth_word_.InsertBox(0, word_box); truth_has_char_boxes_ = false; // Encode the string as UNICHAR_IDs. GenericVector encoding; GenericVector lengths; unicharset.encode_string(truth_str, false, &encoding, &lengths, NULL); int total_length = 0; for (int i = 0; i < encoding.size(); total_length += lengths[i++]) { STRING uch(truth_str + total_length); uch.truncate_at(lengths[i] - total_length); UNICHAR_ID id = encoding[i]; if (id != INVALID_UNICHAR_ID) uch = unicharset.get_normed_unichar(id); truth_text_.push_back(uch); } } // Single "character" string, "character" bounding box. // May be called multiple times to indicate the characters in a word. void BlamerBundle::SetSymbolTruth(const UNICHARSET& unicharset, const char* char_str, const TBOX& char_box) { STRING symbol_str(char_str); UNICHAR_ID id = unicharset.unichar_to_id(char_str); if (id != INVALID_UNICHAR_ID) { STRING normed_uch(unicharset.get_normed_unichar(id)); if (normed_uch.length() > 0) symbol_str = normed_uch; } int length = truth_word_.length(); truth_text_.push_back(symbol_str); truth_word_.InsertBox(length, char_box); if (length == 0) truth_has_char_boxes_ = true; else if (truth_word_.BlobBox(length - 1) == char_box) truth_has_char_boxes_ = false; } // Marks that there is something wrong with the truth text, like it contains // reject characters. void BlamerBundle::SetRejectedTruth() { incorrect_result_reason_ = IRR_NO_TRUTH; truth_has_char_boxes_ = false; } // Returns true if the provided word_choice is correct. bool BlamerBundle::ChoiceIsCorrect(const WERD_CHOICE* word_choice) const { if (word_choice == NULL) return false; const UNICHARSET* uni_set = word_choice->unicharset(); STRING normed_choice_str; for (int i = 0; i < word_choice->length(); ++i) { normed_choice_str += uni_set->get_normed_unichar(word_choice->unichar_id(i)); } STRING truth_str = TruthString(); return truth_str == normed_choice_str; } void BlamerBundle::FillDebugString(const STRING &msg, const WERD_CHOICE *choice, STRING *debug) { (*debug) += "Truth "; for (int i = 0; i < this->truth_text_.length(); ++i) { (*debug) += this->truth_text_[i]; } if (!this->truth_has_char_boxes_) (*debug) += " (no char boxes)"; if (choice != NULL) { (*debug) += " Choice "; STRING choice_str; choice->string_and_lengths(&choice_str, NULL); (*debug) += choice_str; } if (msg.length() > 0) { (*debug) += "\n"; (*debug) += msg; } (*debug) += "\n"; } // Sets up the norm_truth_word from truth_word using the given DENORM. void BlamerBundle::SetupNormTruthWord(const DENORM& denorm) { // TODO(rays) Is this the last use of denorm in WERD_RES and can it go? norm_box_tolerance_ = kBlamerBoxTolerance * denorm.x_scale(); TPOINT topleft; TPOINT botright; TPOINT norm_topleft; TPOINT norm_botright; for (int b = 0; b < truth_word_.length(); ++b) { const TBOX &box = truth_word_.BlobBox(b); topleft.x = box.left(); topleft.y = box.top(); botright.x = box.right(); botright.y = box.bottom(); denorm.NormTransform(NULL, topleft, &norm_topleft); denorm.NormTransform(NULL, botright, &norm_botright); TBOX norm_box(norm_topleft.x, norm_botright.y, norm_botright.x, norm_topleft.y); norm_truth_word_.InsertBox(b, norm_box); } } // Splits *this into two pieces in bundle1 and bundle2 (preallocated, empty // bundles) where the right edge/ of the left-hand word is word1_right, // and the left edge of the right-hand word is word2_left. void BlamerBundle::SplitBundle(int word1_right, int word2_left, bool debug, BlamerBundle* bundle1, BlamerBundle* bundle2) const { STRING debug_str; // Find truth boxes that correspond to the split in the blobs. int b; int begin2_truth_index = -1; if (incorrect_result_reason_ != IRR_NO_TRUTH && truth_has_char_boxes_) { debug_str = "Looking for truth split at"; debug_str.add_str_int(" end1_x ", word1_right); debug_str.add_str_int(" begin2_x ", word2_left); debug_str += "\nnorm_truth_word boxes:\n"; if (norm_truth_word_.length() > 1) { norm_truth_word_.BlobBox(0).print_to_str(&debug_str); for (b = 1; b < norm_truth_word_.length(); ++b) { norm_truth_word_.BlobBox(b).print_to_str(&debug_str); if ((abs(word1_right - norm_truth_word_.BlobBox(b - 1).right()) < norm_box_tolerance_) && (abs(word2_left - norm_truth_word_.BlobBox(b).left()) < norm_box_tolerance_)) { begin2_truth_index = b; debug_str += "Split found"; break; } } debug_str += '\n'; } } // Populate truth information in word and word2 with the first and second // part of the original truth. if (begin2_truth_index > 0) { bundle1->truth_has_char_boxes_ = true; bundle1->norm_box_tolerance_ = norm_box_tolerance_; bundle2->truth_has_char_boxes_ = true; bundle2->norm_box_tolerance_ = norm_box_tolerance_; BlamerBundle *curr_bb = bundle1; for (b = 0; b < norm_truth_word_.length(); ++b) { if (b == begin2_truth_index) curr_bb = bundle2; curr_bb->norm_truth_word_.InsertBox(b, norm_truth_word_.BlobBox(b)); curr_bb->truth_word_.InsertBox(b, truth_word_.BlobBox(b)); curr_bb->truth_text_.push_back(truth_text_[b]); } } else if (incorrect_result_reason_ == IRR_NO_TRUTH) { bundle1->incorrect_result_reason_ = IRR_NO_TRUTH; bundle2->incorrect_result_reason_ = IRR_NO_TRUTH; } else { debug_str += "Truth split not found"; debug_str += truth_has_char_boxes_ ? "\n" : " (no truth char boxes)\n"; bundle1->SetBlame(IRR_NO_TRUTH_SPLIT, debug_str, NULL, debug); bundle2->SetBlame(IRR_NO_TRUTH_SPLIT, debug_str, NULL, debug); } } // "Joins" the blames from bundle1 and bundle2 into *this. void BlamerBundle::JoinBlames(const BlamerBundle& bundle1, const BlamerBundle& bundle2, bool debug) { STRING debug_str; IncorrectResultReason irr = incorrect_result_reason_; if (irr != IRR_NO_TRUTH_SPLIT) debug_str = ""; if (bundle1.incorrect_result_reason_ != IRR_CORRECT && bundle1.incorrect_result_reason_ != IRR_NO_TRUTH && bundle1.incorrect_result_reason_ != IRR_NO_TRUTH_SPLIT) { debug_str += "Blame from part 1: "; debug_str += bundle1.debug_; irr = bundle1.incorrect_result_reason_; } if (bundle2.incorrect_result_reason_ != IRR_CORRECT && bundle2.incorrect_result_reason_ != IRR_NO_TRUTH && bundle2.incorrect_result_reason_ != IRR_NO_TRUTH_SPLIT) { debug_str += "Blame from part 2: "; debug_str += bundle2.debug_; if (irr == IRR_CORRECT) { irr = bundle2.incorrect_result_reason_; } else if (irr != bundle2.incorrect_result_reason_) { irr = IRR_UNKNOWN; } } incorrect_result_reason_ = irr; if (irr != IRR_CORRECT && irr != IRR_NO_TRUTH) { SetBlame(irr, debug_str, NULL, debug); } } // If a blob with the same bounding box as one of the truth character // bounding boxes is not classified as the corresponding truth character // blames character classifier for incorrect answer. void BlamerBundle::BlameClassifier(const UNICHARSET& unicharset, const TBOX& blob_box, const BLOB_CHOICE_LIST& choices, bool debug) { if (!truth_has_char_boxes_ || incorrect_result_reason_ != IRR_CORRECT) return; // Nothing to do here. for (int b = 0; b < norm_truth_word_.length(); ++b) { const TBOX &truth_box = norm_truth_word_.BlobBox(b); // Note that we are more strict on the bounding box boundaries here // than in other places (chopper, segmentation search), since we do // not have the ability to check the previous and next bounding box. if (blob_box.x_almost_equal(truth_box, norm_box_tolerance_/2)) { bool found = false; bool incorrect_adapted = false; UNICHAR_ID incorrect_adapted_id = INVALID_UNICHAR_ID; const char *truth_str = truth_text_[b].string(); // We promise not to modify the list or its contents, using a // const BLOB_CHOICE* below. BLOB_CHOICE_IT choices_it(const_cast(&choices)); for (choices_it.mark_cycle_pt(); !choices_it.cycled_list(); choices_it.forward()) { const BLOB_CHOICE* choice = choices_it.data(); if (strcmp(truth_str, unicharset.get_normed_unichar( choice->unichar_id())) == 0) { found = true; break; } else if (choice->IsAdapted()) { incorrect_adapted = true; incorrect_adapted_id = choice->unichar_id(); } } // end choices_it for loop if (!found) { STRING debug_str = "unichar "; debug_str += truth_str; debug_str += " not found in classification list"; SetBlame(IRR_CLASSIFIER, debug_str, NULL, debug); } else if (incorrect_adapted) { STRING debug_str = "better rating for adapted "; debug_str += unicharset.id_to_unichar(incorrect_adapted_id); debug_str += " than for correct "; debug_str += truth_str; SetBlame(IRR_ADAPTION, debug_str, NULL, debug); } break; } } // end iterating over blamer_bundle->norm_truth_word } // Checks whether chops were made at all the character bounding box // boundaries in word->truth_word. If not - blames the chopper for an // incorrect answer. void BlamerBundle::SetChopperBlame(const WERD_RES* word, bool debug) { if (NoTruth() || !truth_has_char_boxes_ || word->chopped_word->blobs.empty()) { return; } STRING debug_str; bool missing_chop = false; int num_blobs = word->chopped_word->blobs.size(); int box_index = 0; int blob_index = 0; inT16 truth_x = -1; while (box_index < truth_word_.length() && blob_index < num_blobs) { truth_x = norm_truth_word_.BlobBox(box_index).right(); TBLOB * curr_blob = word->chopped_word->blobs[blob_index]; if (curr_blob->bounding_box().right() < truth_x - norm_box_tolerance_) { ++blob_index; continue; // encountered an extra chop, keep looking } else if (curr_blob->bounding_box().right() > truth_x + norm_box_tolerance_) { missing_chop = true; break; } else { ++blob_index; } } if (missing_chop || box_index < norm_truth_word_.length()) { STRING debug_str; if (missing_chop) { debug_str.add_str_int("Detected missing chop (tolerance=", norm_box_tolerance_); debug_str += ") at Bounding Box="; TBLOB * curr_blob = word->chopped_word->blobs[blob_index]; curr_blob->bounding_box().print_to_str(&debug_str); debug_str.add_str_int("\nNo chop for truth at x=", truth_x); } else { debug_str.add_str_int("Missing chops for last ", norm_truth_word_.length() - box_index); debug_str += " truth box(es)"; } debug_str += "\nMaximally chopped word boxes:\n"; for (blob_index = 0; blob_index < num_blobs; ++blob_index) { TBLOB * curr_blob = word->chopped_word->blobs[blob_index]; curr_blob->bounding_box().print_to_str(&debug_str); debug_str += '\n'; } debug_str += "Truth bounding boxes:\n"; for (box_index = 0; box_index < norm_truth_word_.length(); ++box_index) { norm_truth_word_.BlobBox(box_index).print_to_str(&debug_str); debug_str += '\n'; } SetBlame(IRR_CHOPPER, debug_str, word->best_choice, debug); } } // Blames the classifier or the language model if, after running only the // chopper, best_choice is incorrect and no blame has been yet set. // Blames the classifier if best_choice is classifier's top choice and is a // dictionary word (i.e. language model could not have helped). // Otherwise, blames the language model (formerly permuter word adjustment). void BlamerBundle::BlameClassifierOrLangModel( const WERD_RES* word, const UNICHARSET& unicharset, bool valid_permuter, bool debug) { if (valid_permuter) { // Find out whether best choice is a top choice. best_choice_is_dict_and_top_choice_ = true; for (int i = 0; i < word->best_choice->length(); ++i) { BLOB_CHOICE_IT blob_choice_it(word->GetBlobChoices(i)); ASSERT_HOST(!blob_choice_it.empty()); BLOB_CHOICE *first_choice = NULL; for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); blob_choice_it.forward()) { // find first non-fragment choice if (!(unicharset.get_fragment(blob_choice_it.data()->unichar_id()))) { first_choice = blob_choice_it.data(); break; } } ASSERT_HOST(first_choice != NULL); if (first_choice->unichar_id() != word->best_choice->unichar_id(i)) { best_choice_is_dict_and_top_choice_ = false; break; } } } STRING debug_str; if (best_choice_is_dict_and_top_choice_) { debug_str = "Best choice is: incorrect, top choice, dictionary word"; debug_str += " with permuter "; debug_str += word->best_choice->permuter_name(); } else { debug_str = "Classifier/Old LM tradeoff is to blame"; } SetBlame(best_choice_is_dict_and_top_choice_ ? IRR_CLASSIFIER : IRR_CLASS_OLD_LM_TRADEOFF, debug_str, word->best_choice, debug); } // Sets up the correct_segmentation_* to mark the correct bounding boxes. void BlamerBundle::SetupCorrectSegmentation(const TWERD* word, bool debug) { params_training_bundle_.StartHypothesisList(); if (incorrect_result_reason_ != IRR_CORRECT || !truth_has_char_boxes_) return; // Nothing to do here. STRING debug_str; debug_str += "Blamer computing correct_segmentation_cols\n"; int curr_box_col = 0; int next_box_col = 0; int num_blobs = word->NumBlobs(); if (num_blobs == 0) return; // No blobs to play with. int blob_index = 0; inT16 next_box_x = word->blobs[blob_index]->bounding_box().right(); for (int truth_idx = 0; blob_index < num_blobs && truth_idx < norm_truth_word_.length(); ++blob_index) { ++next_box_col; inT16 curr_box_x = next_box_x; if (blob_index + 1 < num_blobs) next_box_x = word->blobs[blob_index + 1]->bounding_box().right(); inT16 truth_x = norm_truth_word_.BlobBox(truth_idx).right(); debug_str.add_str_int("Box x coord vs. truth: ", curr_box_x); debug_str.add_str_int(" ", truth_x); debug_str += "\n"; if (curr_box_x > (truth_x + norm_box_tolerance_)) { break; // failed to find a matching box } else if (curr_box_x >= truth_x - norm_box_tolerance_ && // matched (blob_index + 1 >= num_blobs || // next box can't be included next_box_x > truth_x + norm_box_tolerance_)) { correct_segmentation_cols_.push_back(curr_box_col); correct_segmentation_rows_.push_back(next_box_col-1); ++truth_idx; debug_str.add_str_int("col=", curr_box_col); debug_str.add_str_int(" row=", next_box_col-1); debug_str += "\n"; curr_box_col = next_box_col; } } if (blob_index < num_blobs || // trailing blobs correct_segmentation_cols_.length() != norm_truth_word_.length()) { debug_str.add_str_int("Blamer failed to find correct segmentation" " (tolerance=", norm_box_tolerance_); if (blob_index >= num_blobs) debug_str += " blob == NULL"; debug_str += ")\n"; debug_str.add_str_int(" path length ", correct_segmentation_cols_.length()); debug_str.add_str_int(" vs. truth ", norm_truth_word_.length()); debug_str += "\n"; SetBlame(IRR_UNKNOWN, debug_str, NULL, debug); correct_segmentation_cols_.clear(); correct_segmentation_rows_.clear(); } } // Returns true if a guided segmentation search is needed. bool BlamerBundle::GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const { return incorrect_result_reason_ == IRR_CORRECT && !segsearch_is_looking_for_blame_ && truth_has_char_boxes_ && !ChoiceIsCorrect(best_choice); } // Setup ready to guide the segmentation search to the correct segmentation. // The callback pp_cb is used to avoid a cyclic dependency. // It calls into LMPainPoints::GenerateForBlamer by pre-binding the // WERD_RES, and the LMPainPoints itself. // pp_cb must be a permanent callback, and should be deleted by the caller. void BlamerBundle::InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX* ratings, UNICHAR_ID wildcard_id, bool debug, STRING *debug_str, TessResultCallback2* cb) { segsearch_is_looking_for_blame_ = true; if (debug) { tprintf("segsearch starting to look for blame\n"); } // Fill pain points for any unclassifed blob corresponding to the // correct segmentation state. *debug_str += "Correct segmentation:\n"; for (int idx = 0; idx < correct_segmentation_cols_.length(); ++idx) { debug_str->add_str_int("col=", correct_segmentation_cols_[idx]); debug_str->add_str_int(" row=", correct_segmentation_rows_[idx]); *debug_str += "\n"; if (!ratings->Classified(correct_segmentation_cols_[idx], correct_segmentation_rows_[idx], wildcard_id) && !cb->Run(correct_segmentation_cols_[idx], correct_segmentation_rows_[idx])) { segsearch_is_looking_for_blame_ = false; *debug_str += "\nFailed to insert pain point\n"; SetBlame(IRR_SEGSEARCH_HEUR, *debug_str, best_choice, debug); break; } } // end for blamer_bundle->correct_segmentation_cols/rows } // Returns true if the guided segsearch is in progress. bool BlamerBundle::GuidedSegsearchStillGoing() const { return segsearch_is_looking_for_blame_; } // The segmentation search has ended. Sets the blame appropriately. void BlamerBundle::FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, STRING *debug_str) { // If we are still looking for blame (i.e. best_choice is incorrect, but a // path representing the correct segmentation could be constructed), we can // blame segmentation search pain point prioritization if the rating of the // path corresponding to the correct segmentation is better than that of // best_choice (i.e. language model would have done the correct thing, but // because of poor pain point prioritization the correct segmentation was // never explored). Otherwise we blame the tradeoff between the language model // and the classifier, since even after exploring the path corresponding to // the correct segmentation incorrect best_choice would have been chosen. // One special case when we blame the classifier instead is when best choice // is incorrect, but it is a dictionary word and it classifier's top choice. if (segsearch_is_looking_for_blame_) { segsearch_is_looking_for_blame_ = false; if (best_choice_is_dict_and_top_choice_) { *debug_str = "Best choice is: incorrect, top choice, dictionary word"; *debug_str += " with permuter "; *debug_str += best_choice->permuter_name(); SetBlame(IRR_CLASSIFIER, *debug_str, best_choice, debug); } else if (best_correctly_segmented_rating_ < best_choice->rating()) { *debug_str += "Correct segmentation state was not explored"; SetBlame(IRR_SEGSEARCH_PP, *debug_str, best_choice, debug); } else { if (best_correctly_segmented_rating_ >= WERD_CHOICE::kBadRating) { *debug_str += "Correct segmentation paths were pruned by LM\n"; } else { debug_str->add_str_double("Best correct segmentation rating ", best_correctly_segmented_rating_); debug_str->add_str_double(" vs. best choice rating ", best_choice->rating()); } SetBlame(IRR_CLASS_LM_TRADEOFF, *debug_str, best_choice, debug); } } } // If the bundle is null or still does not indicate the correct result, // fix it and use some backup reason for the blame. void BlamerBundle::LastChanceBlame(bool debug, WERD_RES* word) { if (word->blamer_bundle == NULL) { word->blamer_bundle = new BlamerBundle(); word->blamer_bundle->SetBlame(IRR_PAGE_LAYOUT, "LastChanceBlame", word->best_choice, debug); } else if (word->blamer_bundle->incorrect_result_reason_ == IRR_NO_TRUTH) { word->blamer_bundle->SetBlame(IRR_NO_TRUTH, "Rejected truth", word->best_choice, debug); } else { bool correct = word->blamer_bundle->ChoiceIsCorrect(word->best_choice); IncorrectResultReason irr = word->blamer_bundle->incorrect_result_reason_; if (irr == IRR_CORRECT && !correct) { STRING debug_str = "Choice is incorrect after recognition"; word->blamer_bundle->SetBlame(IRR_UNKNOWN, debug_str, word->best_choice, debug); } else if (irr != IRR_CORRECT && correct) { if (debug) { tprintf("Corrected %s\n", word->blamer_bundle->debug_.string()); } word->blamer_bundle->incorrect_result_reason_ = IRR_CORRECT; word->blamer_bundle->debug_ = ""; } } } // Sets the misadaption debug if this word is incorrect, as this word is // being adapted to. void BlamerBundle::SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug) { if (incorrect_result_reason_ != IRR_NO_TRUTH && !ChoiceIsCorrect(best_choice)) { misadaption_debug_ ="misadapt to word ("; misadaption_debug_ += best_choice->permuter_name(); misadaption_debug_ += "): "; FillDebugString("", best_choice, &misadaption_debug_); if (debug) { tprintf("%s\n", misadaption_debug_.string()); } } }