tesseract/wordrec/chopper.cpp

/* -*-C-*-
 ********************************************************************************
 *
 * File:         chopper.cpp  (Formerly chopper.c)
 * Description:
 * Author:       Mark Seaman, OCR Technology
 * Created:      Fri Oct 16 14:37:00 1987
 * Modified:     Tue Jul 30 16:18:52 1991 (Mark Seaman) marks@hpgrlt
 * Language:     C
 * Package:      N/A
 * Status:       Reusable Software Component
 *
 * (c) Copyright 1987, Hewlett-Packard Company.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **************************************************************************/

/*----------------------------------------------------------------------
          I n c l u d e s
----------------------------------------------------------------------*/

#include <math.h>

#include "chopper.h"

#include "assert.h"
#include "associate.h"
#include "blobs.h"
#include "callcpp.h"
#include "const.h"
#include "findseam.h"
#include "globals.h"
#include "render.h"
#include "pageres.h"
#include "seam.h"
#include "stopper.h"
#include "structures.h"
#include "unicharset.h"
#include "wordrec.h"

// Include automatically generated configuration file if running autoconf.
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif

// Even though the limit on the number of chunks may now be removed, keep
// the same limit for repeatable behavior, and it may be a speed advantage.
static const int kMaxNumChunks = 64;

/*----------------------------------------------------------------------
          F u n c t i o n s
----------------------------------------------------------------------*/
/**
 * @name preserve_outline_tree
 *
 * Copy the list of outlines.
 */
void preserve_outline(EDGEPT *start) {
  EDGEPT *srcpt;

  if (start == NULL)
    return;
  srcpt = start;
  do {
    srcpt->flags[1] = 1;
    srcpt = srcpt->next;
  }
  while (srcpt != start);
  srcpt->flags[1] = 2;
}


/**************************************************************************/
void preserve_outline_tree(TESSLINE *srcline) {
  TESSLINE *outline;

  for (outline = srcline; outline != NULL; outline = outline->next) {
    preserve_outline (outline->loop);
  }
}


/**
 * @name restore_outline_tree
 *
 * Copy the list of outlines.
 */
EDGEPT *restore_outline(EDGEPT *start) {
  EDGEPT *srcpt;
  EDGEPT *real_start;

  if (start == NULL)
    return NULL;
  srcpt = start;
  do {
    if (srcpt->flags[1] == 2)
      break;
    srcpt = srcpt->next;
  }
  while (srcpt != start);
  real_start = srcpt;
  do {
    srcpt = srcpt->next;
    if (srcpt->prev->flags[1] == 0) {
      remove_edgept(srcpt->prev);
    }
  }
  while (srcpt != real_start);
  return real_start;
}


/******************************************************************************/
void restore_outline_tree(TESSLINE *srcline) {
  TESSLINE *outline;

  for (outline = srcline; outline != NULL; outline = outline->next) {
    outline->loop = restore_outline (outline->loop);
    outline->start = outline->loop->pos;
  }
}

// Helper runs all the checks on a seam to make sure it is valid.
// Returns the seam if OK, otherwise deletes the seam and returns NULL.
static SEAM* CheckSeam(int debug_level, int32_t blob_number, TWERD* word,
                       TBLOB* blob, TBLOB* other_blob,
                       const GenericVector<SEAM*>& seams, SEAM* seam) {
  if (seam == NULL || blob->outlines == NULL || other_blob->outlines == NULL ||
      total_containment(blob, other_blob) || check_blob(other_blob) ||
      !seam->ContainedByBlob(*blob) || !seam->ContainedByBlob(*other_blob) ||
      any_shared_split_points(seams, seam) ||
      !seam->PrepareToInsertSeam(seams, word->blobs, blob_number, false)) {
    word->blobs.remove(blob_number + 1);
    if (seam) {
      seam->UndoSeam(blob, other_blob);
      delete seam;
      seam = NULL;
#ifndef GRAPHICS_DISABLED
      if (debug_level) {
        if (debug_level >2)
          display_blob(blob, Red);
        tprintf("\n** seam being removed ** \n");
      }
#endif
    } else {
      delete other_blob;
    }
    return NULL;
  }
  return seam;
}


/**
 * @name attempt_blob_chop
 *
 * Try to split the this blob after this one.  Check to make sure that
 * it was successful.
 */
namespace tesseract {
SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number,
                                 bool italic_blob,
                                 const GenericVector<SEAM*>& seams) {
  if (repair_unchopped_blobs)
    preserve_outline_tree (blob->outlines);
  TBLOB *other_blob = TBLOB::ShallowCopy(*blob);       /* Make new blob */
  // Insert it into the word.
  word->blobs.insert(other_blob, blob_number + 1);

  SEAM *seam = NULL;
  if (prioritize_division) {
    TPOINT location;
    if (divisible_blob(blob, italic_blob, &location)) {
      seam = new SEAM(0.0f, location);
    }
  }
  if (seam == NULL)
    seam = pick_good_seam(blob);
  if (chop_debug) {
    if (seam != NULL)
      seam->Print("Good seam picked=");
    else
      tprintf("\n** no seam picked *** \n");
  }
  if (seam) {
    seam->ApplySeam(italic_blob, blob, other_blob);
  }

  seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob,
                   seams, seam);
  if (seam == NULL) {
    if (repair_unchopped_blobs)
      restore_outline_tree(blob->outlines);
    if (allow_blob_division && !prioritize_division) {
      // If the blob can simply be divided into outlines, then do that.
      TPOINT location;
      if (divisible_blob(blob, italic_blob, &location)) {
        other_blob = TBLOB::ShallowCopy(*blob);       /* Make new blob */
        word->blobs.insert(other_blob, blob_number + 1);
        seam = new SEAM(0.0f, location);
        seam->ApplySeam(italic_blob, blob, other_blob);
        seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob,
                         seams, seam);
      }
    }
  }
  if (seam != NULL) {
    // Make sure this seam doesn't get chopped again.
    seam->Finalize();
  }
  return seam;
}


SEAM *Wordrec::chop_numbered_blob(TWERD *word, int32_t blob_number,
                                  bool italic_blob,
                                  const GenericVector<SEAM*>& seams) {
  return attempt_blob_chop(word, word->blobs[blob_number], blob_number,
                           italic_blob, seams);
}


SEAM *Wordrec::chop_overlapping_blob(const GenericVector<TBOX>& boxes,
                                     bool italic_blob, WERD_RES *word_res,
                                     int *blob_number) {
  TWERD *word = word_res->chopped_word;
  for (*blob_number = 0; *blob_number < word->NumBlobs(); ++*blob_number) {
    TBLOB *blob = word->blobs[*blob_number];
    TPOINT topleft, botright;
    topleft.x = blob->bounding_box().left();
    topleft.y = blob->bounding_box().top();
    botright.x = blob->bounding_box().right();
    botright.y = blob->bounding_box().bottom();

    TPOINT original_topleft, original_botright;
    word_res->denorm.DenormTransform(NULL, topleft, &original_topleft);
    word_res->denorm.DenormTransform(NULL, botright, &original_botright);

    TBOX original_box = TBOX(original_topleft.x, original_botright.y,
                             original_botright.x, original_topleft.y);

    bool almost_equal_box = false;
    int num_overlap = 0;
    for (int i = 0; i < boxes.size(); i++) {
      if (original_box.overlap_fraction(boxes[i]) > 0.125)
        num_overlap++;
      if (original_box.almost_equal(boxes[i], 3))
        almost_equal_box = true;
    }

    TPOINT location;
    if (divisible_blob(blob, italic_blob, &location) ||
        (!almost_equal_box && num_overlap > 1)) {
      SEAM *seam = attempt_blob_chop(word, blob, *blob_number,
                                     italic_blob, word_res->seam_array);
      if (seam != NULL)
        return seam;
    }
  }

  *blob_number = -1;
  return NULL;
}

}  // namespace tesseract


/**
 * @name any_shared_split_points
 *
 * Return true if any of the splits share a point with this one.
 */
int any_shared_split_points(const GenericVector<SEAM*>& seams, SEAM *seam) {
  int length;
  int index;

  length = seams.size();
  for (index = 0; index < length; index++)
    if (seam->SharesPosition(*seams[index])) return TRUE;
  return FALSE;
}


/**
 * @name check_blob
 *
 * @return true if blob has a non whole outline.
 */
int check_blob(TBLOB *blob) {
  TESSLINE *outline;
  EDGEPT *edgept;

  for (outline = blob->outlines; outline != NULL; outline = outline->next) {
    edgept = outline->loop;
    do {
      if (edgept == NULL)
        break;
      edgept = edgept->next;
    }
    while (edgept != outline->loop);
    if (edgept == NULL)
      return 1;
  }
  return 0;
}


namespace tesseract {
/**
 * @name improve_one_blob
 *
 * Finds the best place to chop, based on the worst blob, fixpt, or next to
 * a fragment, according to the input. Returns the SEAM corresponding to the
 * chop point, if any is found, and the index in the ratings_matrix of the
 * chopped blob. Note that blob_choices is just a copy of the pointers in the
 * leading diagonal of the ratings MATRIX.
 * Although the blob is chopped, the returned SEAM is yet to be inserted into
 * word->seam_array and the resulting blobs are unclassified, so this function
 * can be used by ApplyBox as well as during recognition.
 */
SEAM* Wordrec::improve_one_blob(const GenericVector<BLOB_CHOICE*>& blob_choices,
                                DANGERR *fixpt,
                                bool split_next_to_fragment,
                                bool italic_blob,
                                WERD_RES* word,
                                int* blob_number) {
  float rating_ceiling = MAX_FLOAT32;
  SEAM *seam = NULL;
  do {
    *blob_number = select_blob_to_split_from_fixpt(fixpt);
    if (chop_debug) tprintf("blob_number from fixpt = %d\n", *blob_number);
    bool split_point_from_dict = (*blob_number != -1);
    if (split_point_from_dict) {
      fixpt->clear();
    } else {
      *blob_number = select_blob_to_split(blob_choices, rating_ceiling,
                                          split_next_to_fragment);
    }
    if (chop_debug) tprintf("blob_number = %d\n", *blob_number);
    if (*blob_number == -1)
      return NULL;

    // TODO(rays) it may eventually help to allow italic_blob to be true,
    seam = chop_numbered_blob(word->chopped_word, *blob_number, italic_blob,
                              word->seam_array);
    if (seam != NULL)
      return seam;  // Success!
    if (blob_choices[*blob_number] == NULL)
      return NULL;
    if (!split_point_from_dict) {
      // We chopped the worst rated blob, try something else next time.
      rating_ceiling = blob_choices[*blob_number]->rating();
    }
  } while (true);
  return seam;
}

/**
 * @name chop_one_blob
 *
 * Start with the current one-blob word and its classification.  Find
 * the worst blobs and try to divide it up to improve the ratings.
 * Used for testing chopper.
 */
SEAM* Wordrec::chop_one_blob(const GenericVector<TBOX>& boxes,
                             const GenericVector<BLOB_CHOICE*>& blob_choices,
                             WERD_RES* word_res,
                             int* blob_number) {
  if (prioritize_division) {
    return chop_overlapping_blob(boxes, true, word_res, blob_number);
  } else {
    return improve_one_blob(blob_choices, NULL, false, true, word_res,
                            blob_number);
  }
}

/**
 * @name chop_word_main
 *
 * Classify the blobs in this word and permute the results.  Find the
 * worst blob in the word and chop it up.  Continue this process until
 * a good answer has been found or all the blobs have been chopped up
 * enough.  The results are returned in the WERD_RES.
 */
void Wordrec::chop_word_main(WERD_RES *word) {
  int num_blobs = word->chopped_word->NumBlobs();
  if (word->ratings == NULL) {
    word->ratings = new MATRIX(num_blobs, wordrec_max_join_chunks);
  }
  if (word->ratings->get(0, 0) == NULL) {
    // Run initial classification.
    for (int b = 0; b < num_blobs; ++b) {
      BLOB_CHOICE_LIST* choices = classify_piece(word->seam_array, b, b,
                                                 "Initial:", word->chopped_word,
                                                 word->blamer_bundle);
      word->ratings->put(b, b, choices);
    }
  } else {
    // Blobs have been pre-classified. Set matrix cell for all blob choices
    for (int col = 0; col < word->ratings->dimension(); ++col) {
      for (int row = col; row < word->ratings->dimension() &&
           row < col + word->ratings->bandwidth(); ++row) {
        BLOB_CHOICE_LIST* choices = word->ratings->get(col, row);
        if (choices != NULL) {
          BLOB_CHOICE_IT bc_it(choices);
          for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
            bc_it.data()->set_matrix_cell(col, row);
          }
        }
      }
    }
  }

  // Run Segmentation Search.
  BestChoiceBundle best_choice_bundle(word->ratings->dimension());
  SegSearch(word, &best_choice_bundle, word->blamer_bundle);

  if (word->best_choice == NULL) {
    // SegSearch found no valid paths, so just use the leading diagonal.
    word->FakeWordFromRatings(TOP_CHOICE_PERM);
  }
  word->RebuildBestState();
  // If we finished without a hyphen at the end of the word, let the next word
  // be found in the dictionary.
  if (word->word->flag(W_EOL) &&
      !getDict().has_hyphen_end(*word->best_choice)) {
    getDict().reset_hyphen_vars(true);
  }

  if (word->blamer_bundle != NULL && this->fill_lattice_ != NULL) {
    CallFillLattice(*word->ratings, word->best_choices,
                    *word->uch_set, word->blamer_bundle);
  }
  if (wordrec_debug_level > 0) {
    tprintf("Final Ratings Matrix:\n");
    word->ratings->print(getDict().getUnicharset());
  }
  word->FilterWordChoices(getDict().stopper_debug_level);
}

/**
 * @name improve_by_chopping
 *
 * Repeatedly chops the worst blob, classifying the new blobs fixing up all
 * the data, and incrementally runs the segmentation search until a good word
 * is found, or no more chops can be found.
 */
void Wordrec::improve_by_chopping(float rating_cert_scale,
                                  WERD_RES* word,
                                  BestChoiceBundle* best_choice_bundle,
                                  BlamerBundle* blamer_bundle,
                                  LMPainPoints* pain_points,
                                  GenericVector<SegSearchPending>* pending) {
  int blob_number;
  do {  // improvement loop.
    // Make a simple vector of BLOB_CHOICEs to make it easy to pick which
    // one to chop.
    GenericVector<BLOB_CHOICE*> blob_choices;
    int num_blobs = word->ratings->dimension();
    for (int i = 0; i < num_blobs; ++i) {
      BLOB_CHOICE_LIST* choices = word->ratings->get(i, i);
      if (choices == NULL || choices->empty()) {
        blob_choices.push_back(NULL);
      } else {
        BLOB_CHOICE_IT bc_it(choices);
        blob_choices.push_back(bc_it.data());
      }
    }
    SEAM* seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt,
                                  false, false, word, &blob_number);
    if (seam == NULL) break;
    // A chop has been made. We have to correct all the data structures to
    // take into account the extra bottom-level blob.
    // Put the seam into the seam_array and correct everything else on the
    // word: ratings matrix (including matrix location in the BLOB_CHOICES),
    // states in WERD_CHOICEs, and blob widths.
    word->InsertSeam(blob_number, seam);
    // Insert a new entry in the beam array.
    best_choice_bundle->beam.insert(new LanguageModelState, blob_number);
    // Fixpts are outdated, but will get recalculated.
    best_choice_bundle->fixpt.clear();
    // Remap existing pain points.
    pain_points->RemapForSplit(blob_number);
    // Insert a new pending at the chop point.
    pending->insert(SegSearchPending(), blob_number);

    // Classify the two newly created blobs using ProcessSegSearchPainPoint,
    // as that updates the pending correctly and adds new pain points.
    MATRIX_COORD pain_point(blob_number, blob_number);
    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop1", pending, word,
                              pain_points, blamer_bundle);
    pain_point.col = blob_number + 1;
    pain_point.row = blob_number + 1;
    ProcessSegSearchPainPoint(0.0f, pain_point, "Chop2", pending, word,
                              pain_points, blamer_bundle);
    if (language_model_->language_model_ngram_on) {
      // N-gram evaluation depends on the number of blobs in a chunk, so we
      // have to re-evaluate everything in the word.
      ResetNGramSearch(word, best_choice_bundle, pending);
      blob_number = 0;
    }
    // Run language model incrementally. (Except with the n-gram model on.)
    UpdateSegSearchNodes(rating_cert_scale, blob_number, pending,
                         word, pain_points, best_choice_bundle, blamer_bundle);
  } while (!language_model_->AcceptableChoiceFound() &&
           word->ratings->dimension() < kMaxNumChunks);

  // If after running only the chopper best_choice is incorrect and no blame
  // has been yet set, blame the classifier if best_choice is classifier's
  // top choice and is a dictionary word (i.e. language model could not have
  // helped). Otherwise blame the tradeoff between the classifier and
  // the old language model (permuters).
  if (word->blamer_bundle != NULL &&
      word->blamer_bundle->incorrect_result_reason() == IRR_CORRECT &&
      !word->blamer_bundle->ChoiceIsCorrect(word->best_choice)) {
    bool valid_permuter = word->best_choice != NULL &&
        Dict::valid_word_permuter(word->best_choice->permuter(), false);
    word->blamer_bundle->BlameClassifierOrLangModel(word,
                                                    getDict().getUnicharset(),
                                                    valid_permuter,
                                                    wordrec_debug_blamer);
  }
}


/**********************************************************************
 * select_blob_to_split
 *
 * These are the results of the last classification.  Find a likely
 * place to apply splits.  If none, return -1.
 **********************************************************************/
int Wordrec::select_blob_to_split(
    const GenericVector<BLOB_CHOICE*>& blob_choices,
    float rating_ceiling, bool split_next_to_fragment) {
  BLOB_CHOICE *blob_choice;
  int x;
  float worst = -MAX_FLOAT32;
  int worst_index = -1;
  float worst_near_fragment = -MAX_FLOAT32;
  int worst_index_near_fragment = -1;
  const CHAR_FRAGMENT **fragments = NULL;

  if (chop_debug) {
    if (rating_ceiling < MAX_FLOAT32)
      tprintf("rating_ceiling = %8.4f\n", rating_ceiling);
    else
      tprintf("rating_ceiling = No Limit\n");
  }

  if (split_next_to_fragment && blob_choices.size() > 0) {
    fragments = new const CHAR_FRAGMENT *[blob_choices.length()];
    if (blob_choices[0] != NULL) {
      fragments[0] = getDict().getUnicharset().get_fragment(
          blob_choices[0]->unichar_id());
    } else {
      fragments[0] = NULL;
    }
  }

  for (x = 0; x < blob_choices.size(); ++x) {
    if (blob_choices[x] == NULL) {
      delete[] fragments;
      return x;
    } else {
      blob_choice = blob_choices[x];
      // Populate fragments for the following position.
      if (split_next_to_fragment && x+1 < blob_choices.size()) {
        if (blob_choices[x + 1] != NULL) {
          fragments[x + 1] = getDict().getUnicharset().get_fragment(
              blob_choices[x + 1]->unichar_id());
        } else {
          fragments[x + 1] = NULL;
        }
      }
      if (blob_choice->rating() < rating_ceiling &&
          blob_choice->certainty() < tessedit_certainty_threshold) {
        // Update worst and worst_index.
        if (blob_choice->rating() > worst) {
          worst_index = x;
          worst = blob_choice->rating();
        }
        if (split_next_to_fragment) {
          // Update worst_near_fragment and worst_index_near_fragment.
          bool expand_following_fragment =
            (x + 1 < blob_choices.size() &&
             fragments[x+1] != NULL && !fragments[x+1]->is_beginning());
          bool expand_preceding_fragment =
            (x > 0 && fragments[x-1] != NULL && !fragments[x-1]->is_ending());
          if ((expand_following_fragment || expand_preceding_fragment) &&
              blob_choice->rating() > worst_near_fragment) {
            worst_index_near_fragment = x;
            worst_near_fragment = blob_choice->rating();
            if (chop_debug) {
              tprintf("worst_index_near_fragment=%d"
                      " expand_following_fragment=%d"
                      " expand_preceding_fragment=%d\n",
                      worst_index_near_fragment,
                      expand_following_fragment,
                      expand_preceding_fragment);
            }
          }
        }
      }
    }
  }
  delete[] fragments;
  // TODO(daria): maybe a threshold of badness for
  // worst_near_fragment would be useful.
  return worst_index_near_fragment != -1 ?
    worst_index_near_fragment : worst_index;
}

/**********************************************************************
 * select_blob_to_split_from_fixpt
 *
 * Given the fix point from a dictionary search, if there is a single
 * dangerous blob that maps to multiple characters, return that blob
 * index as a place we need to split.  If none, return -1.
 **********************************************************************/
int Wordrec::select_blob_to_split_from_fixpt(DANGERR *fixpt) {
  if (!fixpt)
    return -1;
  for (int i = 0; i < fixpt->size(); i++) {
    if ((*fixpt)[i].begin + 1 == (*fixpt)[i].end &&
        (*fixpt)[i].dangerous &&
        (*fixpt)[i].correct_is_ngram) {
      return (*fixpt)[i].begin;
    }
  }
  return -1;
}


}  // namespace tesseract


/**********************************************************************
 * total_containment
 *
 * Check to see if one of these outlines is totally contained within
 * the bounding box of the other.
 **********************************************************************/
int16_t total_containment(TBLOB *blob1, TBLOB *blob2) {
  TBOX box1 = blob1->bounding_box();
  TBOX box2 = blob2->bounding_box();
  return box1.contains(box2) || box2.contains(box1);
}