tesseract/ccmain/ambigsrecog.cpp

///////////////////////////////////////////////////////////////////////
// File:        genericvector.h
// Description: Functions for producing classifications
//              for the input to ambigstraining.
// Author:      Daria Antonova
// Created:     Mon Jun 23 11:26:43 PDT 2008
//
// (C) Copyright 2007, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#include "ambigs.h"

#include "applybox.h"
#include "boxread.h"
#include "control.h"
#include "permute.h"
#include "ratngs.h"
#include "reject.h"
#include "stopper.h"
#include "tesseractclass.h"

namespace tesseract {

// Sets flags necessary for ambigs training mode.
// Opens and returns the pointer to the output file.
FILE *Tesseract::init_ambigs_training(const STRING &fname) {
  permute_only_top = 1;                        // use only top choice permuter
  tessedit_tess_adaption_mode.set_value(0);    // turn off adaption
  tessedit_ok_mode.set_value(0);               // turn off context checking
  tessedit_enable_doc_dict.set_value(0);       // turn off document dictionary
  save_best_choices.set_value(1);              // save individual char choices
  stopper_no_acceptable_choices.set_value(1);  // explore all segmentations
  save_raw_choices.set_value(1);               // save raw choices

  // Open ambigs output file.
  STRING output_fname = fname;
  const char *lastdot = strrchr(output_fname.string(), '.');
  if (lastdot != NULL) {
    output_fname[lastdot - output_fname.string()] = '\0';
  }
  output_fname += ".txt";
  FILE *output_file;
  if (!(output_file = fopen(output_fname.string(), "a+"))) {
    CANTOPENFILE.error("ambigs_training", EXIT,
                       "Can't open box file %s\n", output_fname.string());
  }
  return output_file;
}

// This function takes tif/box pair of files and runs recognition on the image,
// while making sure that the word bounds that tesseract identified roughly
// match to those specified by the input box file. For each word (ngram in a
// single bounding box from the input box file) it outputs the ocred result,
// the correct label, rating and certainty.
void Tesseract::ambigs_training_segmented(const STRING &fname,
                                          PAGE_RES *page_res,
                                          volatile ETEXT_DESC *monitor,
                                          FILE *output_file) {
  STRING box_fname = fname;
  const char *lastdot = strrchr(box_fname.string(), '.');
  if (lastdot != NULL) {
    box_fname[lastdot - box_fname.string()] = '\0';
  }
  box_fname += ".box";
  FILE *box_file;
  if (!(box_file = fopen(box_fname.string(), "r"))) {
    CANTOPENFILE.error("ambigs_training", EXIT,
                       "Can't open box file %s\n", box_fname.string());
  }

  static PAGE_RES_IT page_res_it;
  page_res_it.page_res = page_res;
  page_res_it.restart_page();
  int x_min, y_min, x_max, y_max;
  char label[UNICHAR_LEN * 10];

  // Process all the words on this page.
  while (page_res_it.word() != NULL &&
         read_next_box(applybox_page, box_file, label,
                       &x_min, &y_min, &x_max, &y_max)) {
    // Init bounding box of the current word bounding box and from box file.
    TBOX box = TBOX(ICOORD(x_min, y_min), ICOORD(x_max, y_max));
    TBOX word_box(page_res_it.word()->word->bounding_box());
    bool one_word = true;
    // Check whether the bounding box of the next word overlaps with the
    // current box from box file.
    while (page_res_it.next_word() != NULL &&
           box.x_overlap(page_res_it.next_word()->word->bounding_box())) {
      word_box = word_box.bounding_union(
          page_res_it.next_word()->word->bounding_box());
      page_res_it.forward();
      one_word = false;
    }
    if (!word_box.major_overlap(box)) {
      if (!word_box.x_overlap(box)) {
        // We must be looking at the word that belongs in the "next" bounding
        // box from the box file. The ngram that was supposed to appear in
        // the current box read from the box file must have been dropped by
        // tesseract as noise.
        tprintf("Word %s was dropped as noise.\n", label);
        continue;  // stay on this blob, but read next box from box file
      } else {
        tprintf("Error: Insufficient overlap for word box"
                " and box from file for %s\n", label);
        word_box.print();
        box.print();
        exit(1);
      }
    }
    // Skip recognizing the ngram if tesseract is sure it's not
    // one word, otherwise run one recognition pass on this word.
    if (!one_word) {
      tprintf("Tesseract segmented %s as multiple words\n", label);
    } else {
      ambigs_classify_and_output(&page_res_it, label, output_file);
    }
    page_res_it.forward();
  }
  fclose(box_file);
}

// Run classify_word_pass1() on the current word. Output tesseract's raw choice
// as a result of the classification. For words labeled with a single unichar
// also output all alternatives from blob_choices of the best choice.
void Tesseract::ambigs_classify_and_output(PAGE_RES_IT *page_res_it,
                                           const char *label,
                                           FILE *output_file) {
  int offset;
  // Classify word.
  classify_word_pass1(page_res_it->word(), page_res_it->row()->row,
                      page_res_it->block()->block,
                      FALSE, NULL, NULL);
  WERD_CHOICE *best_choice = page_res_it->word()->best_choice;
  ASSERT_HOST(best_choice != NULL);
  ASSERT_HOST(best_choice->blob_choices() != NULL);

  // Compute the number of unichars in the label.
  int label_num_unichars = 0;
  int step = 1;  // should be non-zero on the first iteration
  for (offset = 0; label[offset] != '\0' && step > 0;
       step = getDict().getUnicharset().step(label + offset),
       offset += step, ++label_num_unichars);
  if (step == 0) {
    tprintf("Not outputting illegal unichar %s\n", label);
    return;
  }

  // Output all classifier choices for the unigrams (1-1 classifications).
  if (label_num_unichars == 1 && best_choice->blob_choices()->length() == 1) {
    BLOB_CHOICE_LIST_C_IT outer_blob_choice_it;
    outer_blob_choice_it.set_to_list(best_choice->blob_choices());
    BLOB_CHOICE_IT blob_choice_it;
    blob_choice_it.set_to_list(outer_blob_choice_it.data());
    for (blob_choice_it.mark_cycle_pt();
         !blob_choice_it.cycled_list();
         blob_choice_it.forward()) {
      BLOB_CHOICE *blob_choice = blob_choice_it.data();
      if (blob_choice->unichar_id() != INVALID_UNICHAR_ID) {
        fprintf(output_file, "%s\t%s\t%.4f\t%.4f\n",
               unicharset.id_to_unichar(blob_choice->unichar_id()),
               label, blob_choice->rating(), blob_choice->certainty());
      }
    }
  }
  // Output the raw choice for succesful non 1-1 classifications.
  getDict().PrintAmbigAlternatives(output_file, label, label_num_unichars);
}

}  // namespace tesseract