Mirror of https://github.com/tesseract-ocr/tesseract.git (synced 2025-06-07 18:02:40 +08:00)
Major change to improve layout analysis for heavily diacritic languages such as Thai (tha), Vietnamese (vie), Kannada (kan), and Telugu (tel). There is a new overlap detector that flags cases where diacritics cause a large increase in textline overlap. In such cases, the diacritics from the overlap regions are kept out of layout analysis entirely, allowing textline formation to proceed without them. At the end of layout analysis, each held-out diacritic is assigned to 0, 1 or 2 nearby words, reusing and extending an old noise-detection data path. During recognition, the stored diacritics are then used or discarded according to how much the character classifier likes them.
This commit is contained in:
parent b6d0184806
commit 0e868ef377
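The core decision in this change is greedy: the new Tesseract::SelectGoodDiacriticOutlines (added in the diff below) starts with every candidate diacritic outline attached to a blob and repeatedly drops the one whose removal most improves the classifier's certainty, stopping once a certainty target is reached or nothing improves. The following standalone sketch only illustrates that idea under stated assumptions; it is not the Tesseract API. SelectOutlinesGreedy, Classifier, and the toy certainty lambda are hypothetical stand-ins for the real ClassifyBlobPlusOutlines scoring.

// Hypothetical, self-contained sketch of the greedy outline selection idea.
// The classifier is a stand-in lambda, not the real Tesseract engine.
#include <functional>
#include <iostream>
#include <vector>

using Classifier = std::function<float(const std::vector<bool>&)>;

// Returns the subset of outlines that reaches target_cert, or an all-false
// vector if no subset is good enough.
std::vector<bool> SelectOutlinesGreedy(int num_outlines, float target_cert,
                                       const Classifier& certainty) {
  std::vector<bool> kept(num_outlines, true);
  std::vector<bool> best = kept;
  float best_cert = certainty(kept);
  bool improved = true;
  int remaining = num_outlines;
  while (improved && remaining > 1 && best_cert < target_cert) {
    improved = false;
    int drop_index = -1;
    for (int i = 0; i < num_outlines; ++i) {
      if (!kept[i]) continue;
      kept[i] = false;                  // Try the word without outline i.
      float cert = certainty(kept);
      if (cert > best_cert) {           // Dropping i helps the classifier.
        best_cert = cert;
        drop_index = i;
        best = kept;
        improved = true;
      }
      kept[i] = true;                   // Restore for the next trial.
    }
    if (drop_index >= 0) {
      kept[drop_index] = false;         // Commit the most helpful removal.
      --remaining;
    }
  }
  if (best_cert >= target_cert) return best;
  return std::vector<bool>(num_outlines, false);
}

int main() {
  // Toy certainty model: outline 2 is noise that costs 3 points; the rest help.
  Classifier toy = [](const std::vector<bool>& kept) {
    float cert = -6.0f;
    for (int i = 0; i < static_cast<int>(kept.size()); ++i)
      cert += kept[i] ? (i == 2 ? -3.0f : 1.5f) : 0.0f;
    return cert;
  };
  std::vector<bool> result = SelectOutlinesGreedy(4, -2.0f, toy);
  for (bool b : result) std::cout << b << ' ';
  std::cout << '\n';  // Expected: 1 1 0 1 (the noisy outline is dropped).
  return 0;
}

The real code additionally scales the target by noise_cert_factor and uses separate thresholds for base characters, disjoint diacritics, and new punctuation blobs; the sketch collapses all of that into a single target_cert.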
@@ -93,8 +93,7 @@ BOOL8 Tesseract::recog_interactive(PAGE_RES_IT* pr_it) {
 
   WordData word_data(*pr_it);
   SetupWordPassN(2, &word_data);
-  classify_word_and_language(&Tesseract::classify_word_pass2, pr_it,
-                             &word_data);
+  classify_word_and_language(2, pr_it, &word_data);
   if (tessedit_debug_quality_metrics) {
     WERD_RES* word_res = pr_it->word();
     word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual);
@@ -190,6 +189,7 @@ void Tesseract::SetupWordPassN(int pass_n, WordData* word) {
     if (word->word->x_height == 0.0f)
       word->word->x_height = word->row->x_height();
   }
+  word->lang_words.truncate(0);
   for (int s = 0; s <= sub_langs_.size(); ++s) {
     // The sub_langs_.size() entry is for the master language.
     Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
@@ -249,15 +249,23 @@ bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
     while (pr_it->word() != NULL && pr_it->word() != word->word)
       pr_it->forward();
     ASSERT_HOST(pr_it->word() != NULL);
-    WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
-                                            : &Tesseract::classify_word_pass2;
-    classify_word_and_language(recognizer, pr_it, word);
-    if (tessedit_dump_choices) {
+    bool make_next_word_fuzzy = false;
+    if (!AnyLSTMLang() &&
+        ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
+      // Needs to be setup again to see the new outlines in the chopped_word.
+      SetupWordPassN(pass_n, word);
+    }
+
+    classify_word_and_language(pass_n, pr_it, word);
+    if (tessedit_dump_choices || debug_noise_removal) {
       tprintf("Pass%d: %s [%s]\n", pass_n,
               word->word->best_choice->unichar_string().string(),
               word->word->best_choice->debug_string().string());
    }
    pr_it->forward();
+    if (make_next_word_fuzzy && pr_it->word() != NULL) {
+      pr_it->MakeCurrentWordFuzzy();
+    }
  }
  return true;
}
@@ -898,6 +906,359 @@ static bool WordsAcceptable(const PointerVector<WERD_RES>& words) {
   return true;
 }
 
+// Moves good-looking "noise"/diacritics from the reject list to the main
+// blob list on the current word. Returns true if anything was done, and
+// sets make_next_word_fuzzy if blob(s) were added to the end of the word.
+bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT* pr_it,
+                                   bool* make_next_word_fuzzy) {
+  *make_next_word_fuzzy = false;
+  WERD* real_word = pr_it->word()->word;
+  if (real_word->rej_cblob_list()->empty() ||
+      real_word->cblob_list()->empty() ||
+      real_word->rej_cblob_list()->length() > noise_maxperword)
+    return false;
+  real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
+  // Get the noise outlines into a vector with matching bool map.
+  GenericVector<C_OUTLINE*> outlines;
+  real_word->GetNoiseOutlines(&outlines);
+  GenericVector<bool> word_wanted;
+  GenericVector<bool> overlapped_any_blob;
+  GenericVector<C_BLOB*> target_blobs;
+  AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it,
+                                     &word_wanted, &overlapped_any_blob,
+                                     &target_blobs);
+  // Filter the outlines that overlapped any blob and put them into the word
+  // now. This simplifies the remaining task and also makes it more accurate
+  // as it has more completed blobs to work on.
+  GenericVector<bool> wanted;
+  GenericVector<C_BLOB*> wanted_blobs;
+  GenericVector<C_OUTLINE*> wanted_outlines;
+  int num_overlapped = 0;
+  int num_overlapped_used = 0;
+  for (int i = 0; i < overlapped_any_blob.size(); ++i) {
+    if (overlapped_any_blob[i]) {
+      ++num_overlapped;
+      if (word_wanted[i]) ++num_overlapped_used;
+      wanted.push_back(word_wanted[i]);
+      wanted_blobs.push_back(target_blobs[i]);
+      wanted_outlines.push_back(outlines[i]);
+      outlines[i] = NULL;
+    }
+  }
+  real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, NULL);
+  AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted,
+                             &target_blobs);
+  int non_overlapped = 0;
+  int non_overlapped_used = 0;
+  for (int i = 0; i < word_wanted.size(); ++i) {
+    if (word_wanted[i]) ++non_overlapped_used;
+    if (outlines[i] != NULL) ++non_overlapped_used;
+  }
+  if (debug_noise_removal) {
+    tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
+            num_overlapped_used, num_overlapped, non_overlapped_used,
+            non_overlapped);
+    real_word->bounding_box().print();
+  }
+  // Now we have decided which outlines we want, put them into the real_word.
+  if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines,
+                                     make_next_word_fuzzy)) {
+    pr_it->MakeCurrentWordFuzzy();
+  }
+  // TODO(rays) Parts of combos have a deep copy of the real word, and need
+  // to have their noise outlines moved/assigned in the same way!!
+  return num_overlapped_used != 0 || non_overlapped_used != 0;
+}
+
+// Attempts to put noise/diacritic outlines into the blobs that they overlap.
+// Input: a set of noisy outlines that probably belong to the real_word.
+// Output: word_wanted indicates which outlines are to be assigned to a blob,
+// target_blobs indicates which to assign to, and overlapped_any_blob is
+// true for all outlines that overlapped a blob.
+void Tesseract::AssignDiacriticsToOverlappingBlobs(
+    const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
+    PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
+    GenericVector<bool>* overlapped_any_blob,
+    GenericVector<C_BLOB*>* target_blobs) {
+  GenericVector<bool> blob_wanted;
+  word_wanted->init_to_size(outlines.size(), false);
+  overlapped_any_blob->init_to_size(outlines.size(), false);
+  target_blobs->init_to_size(outlines.size(), NULL);
+  // For each real blob, find the outlines that seriously overlap it.
+  // A single blob could be several merged characters, so there can be quite
+  // a few outlines overlapping, and the full engine needs to be used to chop
+  // and join to get a sensible result.
+  C_BLOB_IT blob_it(real_word->cblob_list());
+  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+    C_BLOB* blob = blob_it.data();
+    TBOX blob_box = blob->bounding_box();
+    blob_wanted.init_to_size(outlines.size(), false);
+    int num_blob_outlines = 0;
+    for (int i = 0; i < outlines.size(); ++i) {
+      if (blob_box.major_x_overlap(outlines[i]->bounding_box()) &&
+          !(*word_wanted)[i]) {
+        blob_wanted[i] = true;
+        (*overlapped_any_blob)[i] = true;
+        ++num_blob_outlines;
+      }
+    }
+    if (debug_noise_removal) {
+      tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
+      blob_box.print();
+    }
+    // If any outlines overlap the blob, and not too many, classify the blob
+    // (using the full engine, languages and all), and choose the maximal
+    // combination of outlines that doesn't hurt the end-result classification
+    // by too much. Mark them as wanted.
+    if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
+      if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob,
+                                      outlines, num_blob_outlines,
+                                      &blob_wanted)) {
+        for (int i = 0; i < blob_wanted.size(); ++i) {
+          if (blob_wanted[i]) {
+            // Claim the outline and record where it is going.
+            (*word_wanted)[i] = true;
+            (*target_blobs)[i] = blob;
+          }
+        }
+      }
+    }
+  }
+}
+
+// Attempts to assign non-overlapping outlines to their nearest blobs or
+// make new blobs out of them.
+void Tesseract::AssignDiacriticsToNewBlobs(
+    const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
+    PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
+    GenericVector<C_BLOB*>* target_blobs) {
+  GenericVector<bool> blob_wanted;
+  word_wanted->init_to_size(outlines.size(), false);
+  target_blobs->init_to_size(outlines.size(), NULL);
+  // Check for outlines that need to be turned into stand-alone blobs.
+  for (int i = 0; i < outlines.size(); ++i) {
+    if (outlines[i] == NULL) continue;
+    // Get a set of adjacent outlines that don't overlap any existing blob.
+    blob_wanted.init_to_size(outlines.size(), false);
+    int num_blob_outlines = 0;
+    TBOX total_ol_box(outlines[i]->bounding_box());
+    while (i < outlines.size() && outlines[i] != NULL) {
+      blob_wanted[i] = true;
+      total_ol_box += outlines[i]->bounding_box();
+      ++i;
+      ++num_blob_outlines;
+    }
+    // Find the insertion point.
+    C_BLOB_IT blob_it(real_word->cblob_list());
+    while (!blob_it.at_last() &&
+           blob_it.data_relative(1)->bounding_box().left() <=
+               total_ol_box.left()) {
+      blob_it.forward();
+    }
+    // Choose which combination of them we actually want and where to put
+    // them.
+    if (debug_noise_removal)
+      tprintf("Num blobless outlines = %d\n", num_blob_outlines);
+    C_BLOB* left_blob = blob_it.data();
+    TBOX left_box = left_blob->bounding_box();
+    C_BLOB* right_blob = blob_it.at_last() ? NULL : blob_it.data_relative(1);
+    if ((left_box.x_overlap(total_ol_box) || right_blob == NULL ||
+         !right_blob->bounding_box().x_overlap(total_ol_box)) &&
+        SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob,
+                                    outlines, num_blob_outlines,
+                                    &blob_wanted)) {
+      if (debug_noise_removal) tprintf("Added to left blob\n");
+      for (int j = 0; j < blob_wanted.size(); ++j) {
+        if (blob_wanted[j]) {
+          (*word_wanted)[j] = true;
+          (*target_blobs)[j] = left_blob;
+        }
+      }
+    } else if (right_blob != NULL &&
+               (!left_box.x_overlap(total_ol_box) ||
+                right_blob->bounding_box().x_overlap(total_ol_box)) &&
+               SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it,
+                                           right_blob, outlines,
+                                           num_blob_outlines, &blob_wanted)) {
+      if (debug_noise_removal) tprintf("Added to right blob\n");
+      for (int j = 0; j < blob_wanted.size(); ++j) {
+        if (blob_wanted[j]) {
+          (*word_wanted)[j] = true;
+          (*target_blobs)[j] = right_blob;
+        }
+      }
+    } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, NULL,
+                                           outlines, num_blob_outlines,
+                                           &blob_wanted)) {
+      if (debug_noise_removal) tprintf("Fitted between blobs\n");
+      for (int j = 0; j < blob_wanted.size(); ++j) {
+        if (blob_wanted[j]) {
+          (*word_wanted)[j] = true;
+          (*target_blobs)[j] = NULL;
+        }
+      }
+    }
+  }
+}
+
+// Starting with ok_outlines set to indicate which outlines overlap the blob,
+// chooses the optimal set (approximately) and returns true if any outlines
+// are desired, in which case ok_outlines indicates which ones.
+bool Tesseract::SelectGoodDiacriticOutlines(
+    int pass, float certainty_threshold, PAGE_RES_IT* pr_it, C_BLOB* blob,
+    const GenericVector<C_OUTLINE*>& outlines, int num_outlines,
+    GenericVector<bool>* ok_outlines) {
+  STRING best_str;
+  float target_cert = certainty_threshold;
+  if (blob != NULL) {
+    float target_c2;
+    target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2);
+    if (debug_noise_removal) {
+      tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.string(),
+              target_cert, target_c2);
+      blob->bounding_box().print();
+    }
+    target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
+  }
+  GenericVector<bool> test_outlines = *ok_outlines;
+  // Start with all the outlines in.
+  STRING all_str;
+  GenericVector<bool> best_outlines = *ok_outlines;
+  float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
+                                             pr_it, blob, &all_str);
+  if (debug_noise_removal) {
+    TBOX ol_box;
+    for (int i = 0; i < test_outlines.size(); ++i) {
+      if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
+    }
+    tprintf("All Noise blob classified as %s=%g, delta=%g at:",
+            all_str.string(), best_cert, best_cert - target_cert);
+    ol_box.print();
+  }
+  // Iteratively zero out the bit that improves the certainty the most, until
+  // we get past the threshold, have zero bits, or fail to improve.
+  int best_index = 0;  // To zero out.
+  while (num_outlines > 1 && best_index >= 0 &&
+         (blob == NULL || best_cert < target_cert || blob != NULL)) {
+    // Find the best bit to zero out.
+    best_index = -1;
+    for (int i = 0; i < outlines.size(); ++i) {
+      if (test_outlines[i]) {
+        test_outlines[i] = false;
+        STRING str;
+        float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
+                                              pr_it, blob, &str);
+        if (debug_noise_removal) {
+          TBOX ol_box;
+          for (int j = 0; j < outlines.size(); ++j) {
+            if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
+            tprintf("%d", test_outlines[j]);
+          }
+          tprintf(" blob classified as %s=%g, delta=%g) at:", str.string(),
+                  cert, cert - target_cert);
+          ol_box.print();
+        }
+        if (cert > best_cert) {
+          best_cert = cert;
+          best_index = i;
+          best_outlines = test_outlines;
+        }
+        test_outlines[i] = true;
+      }
+    }
+    if (best_index >= 0) {
+      test_outlines[best_index] = false;
+      --num_outlines;
+    }
+  }
+  if (best_cert >= target_cert) {
+    // Save the best combination.
+    *ok_outlines = best_outlines;
+    if (debug_noise_removal) {
+      tprintf("%s noise combination ", blob ? "Adding" : "New");
+      for (int i = 0; i < best_outlines.size(); ++i) {
+        tprintf("%d", best_outlines[i]);
+      }
+      tprintf(" yields certainty %g, beating target of %g\n", best_cert,
+              target_cert);
+    }
+    return true;
+  }
+  return false;
+}
+
+// Classifies the given blob plus the outlines flagged by ok_outlines, undoes
+// the inclusion of the outlines, and returns the certainty of the raw choice.
+float Tesseract::ClassifyBlobPlusOutlines(
+    const GenericVector<bool>& ok_outlines,
+    const GenericVector<C_OUTLINE*>& outlines, int pass_n, PAGE_RES_IT* pr_it,
+    C_BLOB* blob, STRING* best_str) {
+  C_OUTLINE_IT ol_it;
+  C_OUTLINE* first_to_keep = NULL;
+  if (blob != NULL) {
+    // Add the required outlines to the blob.
+    ol_it.set_to_list(blob->out_list());
+    first_to_keep = ol_it.data();
+  }
+  for (int i = 0; i < ok_outlines.size(); ++i) {
+    if (ok_outlines[i]) {
+      // This outline is to be added.
+      if (blob == NULL) {
+        blob = new C_BLOB(outlines[i]);
+        ol_it.set_to_list(blob->out_list());
+      } else {
+        ol_it.add_before_stay_put(outlines[i]);
+      }
+    }
+  }
+  float c2;
+  float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
+  ol_it.move_to_first();
+  if (first_to_keep == NULL) {
+    // We created blob. Empty its outlines and delete it.
+    for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
+    delete blob;
+    cert = -c2;
+  } else {
+    // Remove the outlines that we put in.
+    for (; ol_it.data() != first_to_keep; ol_it.forward()) {
+      ol_it.extract();
+    }
+  }
+  return cert;
+}
+
+// Classifies the given blob (part of word_data->word->word) as an individual
+// word, using languages, chopper etc, returning only the certainty of the
+// best raw choice, and undoing all the work done to fake out the word.
+float Tesseract::ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it,
+                                    C_BLOB* blob, STRING* best_str, float* c2) {
+  WERD* real_word = pr_it->word()->word;
+  WERD* word = real_word->ConstructFromSingleBlob(
+      real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob));
+  WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
+  // Get a new iterator that points to the new word.
+  PAGE_RES_IT it(pr_it->page_res);
+  while (it.word() != word_res && it.word() != NULL) it.forward();
+  ASSERT_HOST(it.word() == word_res);
+  WordData wd(it);
+  // Force full initialization.
+  SetupWordPassN(1, &wd);
+  classify_word_and_language(pass_n, &it, &wd);
+  if (debug_noise_removal) {
+    tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height,
+            wd.row->x_height(), wd.word->raw_choice->min_x_height(),
+            wd.word->raw_choice->max_x_height());
+  }
+  float cert = wd.word->raw_choice->certainty();
+  float rat = wd.word->raw_choice->rating();
+  *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
+  *best_str = wd.word->raw_choice->unichar_string();
+  it.DeleteCurrentWord();
+  pr_it->ResetWordIterator();
+  return cert;
+}
+
 // Generic function for classifying a word. Can be used either for pass1 or
 // pass2 according to the function passed to recognizer.
 // word_data holds the word to be recognized, and its block and row, and
@@ -906,9 +1267,10 @@ static bool WordsAcceptable(const PointerVector<WERD_RES>& words) {
 // Recognizes in the current language, and if successful that is all.
 // If recognition was not successful, tries all available languages until
 // it gets a successful result or runs out of languages. Keeps the best result.
-void Tesseract::classify_word_and_language(WordRecognizer recognizer,
-                                           PAGE_RES_IT* pr_it,
+void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
                                            WordData* word_data) {
+  WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
+                                          : &Tesseract::classify_word_pass2;
   // Best result so far.
   PointerVector<WERD_RES> best_words;
   // Points to the best result. May be word or in lang_words.
@@ -205,8 +205,7 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
     if ((!word->part_of_combo) && (word->box_word == NULL)) {
       WordData word_data(block, row, word);
       SetupWordPassN(2, &word_data);
-      classify_word_and_language(&Tesseract::classify_word_pass2, NULL,
-                                 &word_data);
+      classify_word_and_language(2, NULL, &word_data);
     }
     prev_word_best_choice_ = word->best_choice;
   }
@@ -26,15 +26,23 @@
 
 namespace tesseract {
 
-PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract,
-                           int scale, int scaled_yres,
-                           int rect_left, int rect_top,
+PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract, int scale,
+                           int scaled_yres, int rect_left, int rect_top,
                            int rect_width, int rect_height)
-  : page_res_(page_res), tesseract_(tesseract),
-    word_(NULL), word_length_(0), blob_index_(0), cblob_it_(NULL),
-    scale_(scale), scaled_yres_(scaled_yres),
-    rect_left_(rect_left), rect_top_(rect_top),
-    rect_width_(rect_width), rect_height_(rect_height) {
+    : page_res_(page_res),
+      tesseract_(tesseract),
+      word_(NULL),
+      word_length_(0),
+      blob_index_(0),
+      cblob_it_(NULL),
+      include_upper_dots_(false),
+      include_lower_dots_(false),
+      scale_(scale),
+      scaled_yres_(scaled_yres),
+      rect_left_(rect_left),
+      rect_top_(rect_top),
+      rect_width_(rect_width),
+      rect_height_(rect_height) {
   it_ = new PAGE_RES_IT(page_res);
   PageIterator::Begin();
 }
@@ -50,12 +58,20 @@ PageIterator::~PageIterator() {
  * objects at a higher level.
  */
 PageIterator::PageIterator(const PageIterator& src)
-  : page_res_(src.page_res_), tesseract_(src.tesseract_),
-    word_(NULL), word_length_(src.word_length_),
-    blob_index_(src.blob_index_), cblob_it_(NULL),
-    scale_(src.scale_), scaled_yres_(src.scaled_yres_),
-    rect_left_(src.rect_left_), rect_top_(src.rect_top_),
-    rect_width_(src.rect_width_), rect_height_(src.rect_height_) {
+    : page_res_(src.page_res_),
+      tesseract_(src.tesseract_),
+      word_(NULL),
+      word_length_(src.word_length_),
+      blob_index_(src.blob_index_),
+      cblob_it_(NULL),
+      include_upper_dots_(src.include_upper_dots_),
+      include_lower_dots_(src.include_lower_dots_),
+      scale_(src.scale_),
+      scaled_yres_(src.scaled_yres_),
+      rect_left_(src.rect_left_),
+      rect_top_(src.rect_top_),
+      rect_width_(src.rect_width_),
+      rect_height_(src.rect_height_) {
   it_ = new PAGE_RES_IT(*src.it_);
   BeginWord(src.blob_index_);
 }
@@ -63,6 +79,8 @@ PageIterator::PageIterator(const PageIterator& src)
 const PageIterator& PageIterator::operator=(const PageIterator& src) {
   page_res_ = src.page_res_;
   tesseract_ = src.tesseract_;
+  include_upper_dots_ = src.include_upper_dots_;
+  include_lower_dots_ = src.include_lower_dots_;
   scale_ = src.scale_;
   scaled_yres_ = src.scaled_yres_;
   rect_left_ = src.rect_left_;
@@ -252,16 +270,19 @@ bool PageIterator::BoundingBoxInternal(PageIteratorLevel level,
   PARA *para = NULL;
   switch (level) {
     case RIL_BLOCK:
-      box = it_->block()->block->bounding_box();
+      box = it_->block()->block->restricted_bounding_box(include_upper_dots_,
+                                                         include_lower_dots_);
       break;
     case RIL_PARA:
       para = it_->row()->row->para();
      // explicit fall-through.
    case RIL_TEXTLINE:
-      box = it_->row()->row->bounding_box();
+      box = it_->row()->row->restricted_bounding_box(include_upper_dots_,
+                                                     include_lower_dots_);
       break;
     case RIL_WORD:
-      box = it_->word()->word->bounding_box();
+      box = it_->word()->word->restricted_bounding_box(include_upper_dots_,
+                                                       include_lower_dots_);
       break;
     case RIL_SYMBOL:
       if (cblob_it_ == NULL)
@@ -387,39 +408,23 @@ Pix* PageIterator::GetBinaryImage(PageIteratorLevel level) const {
   int left, top, right, bottom;
   if (!BoundingBoxInternal(level, &left, &top, &right, &bottom))
     return NULL;
-  Pix* pix = NULL;
-  switch (level) {
-    case RIL_BLOCK:
-    case RIL_PARA:
-      int bleft, btop, bright, bbottom;
-      BoundingBoxInternal(RIL_BLOCK, &bleft, &btop, &bright, &bbottom);
-      pix = it_->block()->block->render_mask();
-      // AND the mask and the image.
-      pixRasterop(pix, 0, 0, pixGetWidth(pix), pixGetHeight(pix),
-                  PIX_SRC & PIX_DST, tesseract_->pix_binary(),
-                  bleft, btop);
-      if (level == RIL_PARA) {
-        // RIL_PARA needs further attention:
-        // clip the paragraph from the block mask.
-        Box* box = boxCreate(left - bleft, top - btop,
-                             right - left, bottom - top);
-        Pix* pix2 = pixClipRectangle(pix, box, NULL);
-        boxDestroy(&box);
-        pixDestroy(&pix);
-        pix = pix2;
-      }
-      break;
-    case RIL_TEXTLINE:
-    case RIL_WORD:
-    case RIL_SYMBOL:
-      if (level == RIL_SYMBOL && cblob_it_ != NULL &&
-          cblob_it_->data()->area() != 0)
-        return cblob_it_->data()->render();
-      // Just clip from the bounding box.
-      Box* box = boxCreate(left, top, right - left, bottom - top);
-      pix = pixClipRectangle(tesseract_->pix_binary(), box, NULL);
-      boxDestroy(&box);
-      break;
+  if (level == RIL_SYMBOL && cblob_it_ != NULL &&
+      cblob_it_->data()->area() != 0)
+    return cblob_it_->data()->render();
+  Box* box = boxCreate(left, top, right - left, bottom - top);
+  Pix* pix = pixClipRectangle(tesseract_->pix_binary(), box, NULL);
+  boxDestroy(&box);
+  if (level == RIL_BLOCK || level == RIL_PARA) {
+    // Clip to the block polygon as well.
+    TBOX mask_box;
+    Pix* mask = it_->block()->block->render_mask(&mask_box);
+    int mask_x = left - mask_box.left();
+    int mask_y = top - (tesseract_->ImageHeight() - mask_box.top());
+    // AND the mask and pix, putting the result in pix.
+    pixRasterop(pix, MAX(0, -mask_x), MAX(0, -mask_y), pixGetWidth(pix),
+                pixGetHeight(pix), PIX_SRC & PIX_DST, mask, MAX(0, mask_x),
+                MAX(0, mask_y));
+    pixDestroy(&mask);
   }
   return pix;
 }
@@ -452,17 +457,24 @@ Pix* PageIterator::GetImage(PageIteratorLevel level, int padding,
   Box* box = boxCreate(*left, *top, right - *left, bottom - *top);
   Pix* grey_pix = pixClipRectangle(original_img, box, NULL);
   boxDestroy(&box);
-  if (level == RIL_BLOCK) {
-    Pix* mask = it_->block()->block->render_mask();
-    Pix* expanded_mask = pixCreate(right - *left, bottom - *top, 1);
-    pixRasterop(expanded_mask, padding, padding,
-                pixGetWidth(mask), pixGetHeight(mask),
-                PIX_SRC, mask, 0, 0);
+  if (level == RIL_BLOCK || level == RIL_PARA) {
+    // Clip to the block polygon as well.
+    TBOX mask_box;
+    Pix* mask = it_->block()->block->render_mask(&mask_box);
+    // Copy the mask registered correctly into an image the size of grey_pix.
+    int mask_x = *left - mask_box.left();
+    int mask_y = *top - (pixGetHeight(original_img) - mask_box.top());
+    int width = pixGetWidth(grey_pix);
+    int height = pixGetHeight(grey_pix);
+    Pix* resized_mask = pixCreate(width, height, 1);
+    pixRasterop(resized_mask, MAX(0, -mask_x), MAX(0, -mask_y), width, height,
+                PIX_SRC, mask, MAX(0, mask_x), MAX(0, mask_y));
     pixDestroy(&mask);
-    pixDilateBrick(expanded_mask, expanded_mask, 2*padding + 1, 2*padding + 1);
-    pixInvert(expanded_mask, expanded_mask);
-    pixSetMasked(grey_pix, expanded_mask, MAX_UINT32);
-    pixDestroy(&expanded_mask);
+    pixDilateBrick(resized_mask, resized_mask, 2 * padding + 1,
+                   2 * padding + 1);
+    pixInvert(resized_mask, resized_mask);
+    pixSetMasked(grey_pix, resized_mask, MAX_UINT32);
+    pixDestroy(&resized_mask);
   }
   return grey_pix;
 }
@@ -179,6 +179,21 @@ class TESS_API PageIterator {
   // If an image rectangle has been set in the API, then returned coordinates
   // relate to the original (full) image, rather than the rectangle.
 
+  /**
+   * Controls what to include in a bounding box. Bounding boxes of all levels
+   * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics.
+   * Between layout analysis and recognition, it isn't known where all
+   * diacritics belong, so this control is used to include or exclude some
+   * diacritics that are above or below the main body of the word. In most cases
+   * where the placement is obvious, and after recognition, it doesn't make as
+   * much difference, as the diacritics will already be included in the word.
+   */
+  void SetBoundingBoxComponents(bool include_upper_dots,
+                                bool include_lower_dots) {
+    include_upper_dots_ = include_upper_dots;
+    include_lower_dots_ = include_lower_dots;
+  }
+
   /**
    * Returns the bounding rectangle of the current object at the given level.
    * See comment on coordinate system above.
@@ -332,6 +347,9 @@ class TESS_API PageIterator {
    * Owned by this ResultIterator.
    */
   C_BLOB_IT* cblob_it_;
+  /** Control over what to include in bounding boxes. */
+  bool include_upper_dots_;
+  bool include_lower_dots_;
   /** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/
   int scale_;
   int scaled_yres_;
@@ -134,12 +134,20 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
     // UNLV file present. Use PSM_SINGLE_BLOCK.
     pageseg_mode = PSM_SINGLE_BLOCK;
   }
+  // The diacritic_blobs holds noise blobs that may be diacritics. They
+  // are separated out on areas of the image that seem noisy and short-circuit
+  // the layout process, going straight from the initial partition creation
+  // right through to after word segmentation, where they are added to the
+  // rej_cblobs list of the most appropriate word. From there classification
+  // will determine whether they are used.
+  BLOBNBOX_LIST diacritic_blobs;
   int auto_page_seg_ret_val = 0;
   TO_BLOCK_LIST to_blocks;
   if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
       PSM_SPARSE(pageseg_mode)) {
-    auto_page_seg_ret_val =
-        AutoPageSeg(pageseg_mode, blocks, &to_blocks, osd_tess, osr);
+    auto_page_seg_ret_val = AutoPageSeg(
+        pageseg_mode, blocks, &to_blocks,
+        enable_noise_removal ? &diacritic_blobs : NULL, osd_tess, osr);
     if (pageseg_mode == PSM_OSD_ONLY)
       return auto_page_seg_ret_val;
     // To create blobs from the image region bounds uncomment this line:
@@ -171,7 +179,7 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
 
   textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
                        pix_thresholds_, pix_grey_, splitting || cjk_mode,
-                       blocks, &to_blocks);
+                       &diacritic_blobs, blocks, &to_blocks);
   return auto_page_seg_ret_val;
 }
 
@@ -197,7 +205,6 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) {
   pixDestroy(&grey_pix);
 }
 
-
 /**
  * Auto page segmentation. Divide the page image into blocks of uniform
  * text linespacing and images.
@@ -207,9 +214,14 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) {
  * The output goes in the blocks list with corresponding TO_BLOCKs in the
  * to_blocks list.
  *
- * If single_column is true, then no attempt is made to divide the image
- * into columns, but multiple blocks are still made if the text is of
- * non-uniform linespacing.
+ * If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide
+ * the image into columns, but multiple blocks are still made if the text is
+ * of non-uniform linespacing.
+ *
+ * If diacritic_blobs is non-null, then diacritics/noise blobs, that would
+ * confuse layout anaylsis by causing textline overlap, are placed there,
+ * with the expectation that they will be reassigned to words later and
+ * noise/diacriticness determined via classification.
  *
  * If osd (orientation and script detection) is true then that is performed
  * as well. If only_osd is true, then only orientation and script detection is
@@ -217,9 +229,10 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) {
 * another Tesseract that was initialized especially for osd, and the results
 * will be output into osr (orientation and script result).
 */
-int Tesseract::AutoPageSeg(PageSegMode pageseg_mode,
-                           BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks,
-                           Tesseract* osd_tess, OSResults* osr) {
+int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,
+                           TO_BLOCK_LIST* to_blocks,
+                           BLOBNBOX_LIST* diacritic_blobs, Tesseract* osd_tess,
+                           OSResults* osr) {
   if (textord_debug_images) {
     WriteDebugBackgroundImage(textord_debug_printable, pix_binary_);
   }
@@ -247,10 +260,9 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode,
   if (equ_detect_) {
     finder->SetEquationDetect(equ_detect_);
   }
-  result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_,
-                              to_block, photomask_pix,
-                              pix_thresholds_, pix_grey_,
-                              &found_blocks, to_blocks);
+  result = finder->FindBlocks(
+      pageseg_mode, scaled_color_, scaled_factor_, to_block, photomask_pix,
+      pix_thresholds_, pix_grey_, &found_blocks, diacritic_blobs, to_blocks);
   if (result >= 0)
     finder->GetDeskewVectors(&deskew_, &reskew_);
   delete finder;
@@ -655,7 +655,8 @@ void show_point(PAGE_RES* page_res, float x, float y) {
   FCOORD pt(x, y);
   PAGE_RES_IT pr_it(page_res);
 
-  char msg[160];
+  const int kBufsize = 512;
+  char msg[kBufsize];
   char *msg_ptr = msg;
 
   msg_ptr += sprintf(msg_ptr, "Pt:(%0.3f, %0.3f) ", x, y);
@@ -207,8 +207,7 @@ void Tesseract::ambigs_classify_and_output(const char *label,
     fflush(stdout);
     WordData word_data(*pr_it);
     SetupWordPassN(1, &word_data);
-    classify_word_and_language(&Tesseract::classify_word_pass1,
-                               pr_it, &word_data);
+    classify_word_and_language(1, pr_it, &word_data);
     WERD_RES* werd_res = word_data.word;
     WERD_CHOICE *best_choice = werd_res->best_choice;
     ASSERT_HOST(best_choice != NULL);
(File diff suppressed because it is too large.)
@@ -283,8 +283,8 @@ class Tesseract : public Wordrec {
   int SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
                   Tesseract* osd_tess, OSResults* osr);
   void SetupWordScripts(BLOCK_LIST* blocks);
-  int AutoPageSeg(PageSegMode pageseg_mode,
-                  BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks,
+  int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,
+                  TO_BLOCK_LIST* to_blocks, BLOBNBOX_LIST* diacritic_blobs,
                   Tesseract* osd_tess, OSResults* osr);
   ColumnFinder* SetupPageSegAndDetectOrientation(
       bool single_column, bool osd, bool only_osd,
@@ -328,8 +328,46 @@ class Tesseract : public Wordrec {
                                   WordRecognizer recognizer,
                                   WERD_RES** in_word,
                                   PointerVector<WERD_RES>* best_words);
-  void classify_word_and_language(WordRecognizer recognizer,
-                                  PAGE_RES_IT* pr_it,
+  // Moves good-looking "noise"/diacritics from the reject list to the main
+  // blob list on the current word. Returns true if anything was done, and
+  // sets make_next_word_fuzzy if blob(s) were added to the end of the word.
+  bool ReassignDiacritics(int pass, PAGE_RES_IT* pr_it,
+                          bool* make_next_word_fuzzy);
+  // Attempts to put noise/diacritic outlines into the blobs that they overlap.
+  // Input: a set of noisy outlines that probably belong to the real_word.
+  // Output: outlines that overlapped blobs are set to NULL and put back into
+  // the word, either in the blobs or in the reject list.
+  void AssignDiacriticsToOverlappingBlobs(
+      const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
+      PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
+      GenericVector<bool>* overlapped_any_blob,
+      GenericVector<C_BLOB*>* target_blobs);
+  // Attempts to assign non-overlapping outlines to their nearest blobs or
+  // make new blobs out of them.
+  void AssignDiacriticsToNewBlobs(const GenericVector<C_OUTLINE*>& outlines,
+                                  int pass, WERD* real_word, PAGE_RES_IT* pr_it,
+                                  GenericVector<bool>* word_wanted,
+                                  GenericVector<C_BLOB*>* target_blobs);
+  // Starting with ok_outlines set to indicate which outlines overlap the blob,
+  // chooses the optimal set (approximately) and returns true if any outlines
+  // are desired, in which case ok_outlines indicates which ones.
+  bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold,
+                                   PAGE_RES_IT* pr_it, C_BLOB* blob,
+                                   const GenericVector<C_OUTLINE*>& outlines,
+                                   int num_outlines,
+                                   GenericVector<bool>* ok_outlines);
+  // Classifies the given blob plus the outlines flagged by ok_outlines, undoes
+  // the inclusion of the outlines, and returns the certainty of the raw choice.
+  float ClassifyBlobPlusOutlines(const GenericVector<bool>& ok_outlines,
+                                 const GenericVector<C_OUTLINE*>& outlines,
+                                 int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob,
+                                 STRING* best_str);
+  // Classifies the given blob (part of word_data->word->word) as an individual
+  // word, using languages, chopper etc, returning only the certainty of the
+  // best raw choice, and undoing all the work done to fake out the word.
+  float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob,
+                           STRING* best_str, float* c2);
+  void classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
                                   WordData* word_data);
   void classify_word_pass1(const WordData& word_data,
                            WERD_RES** in_word,
@@ -808,6 +846,24 @@ class Tesseract : public Wordrec {
              "Enable single word correction based on the dictionary.");
   INT_VAR_H(tessedit_bigram_debug, 0, "Amount of debug output for bigram "
             "correction.");
+  BOOL_VAR_H(enable_noise_removal, true,
+             "Remove and conditionally reassign small outlines when they"
+             " confuse layout analysis, determining diacritics vs noise");
+  INT_VAR_H(debug_noise_removal, 0, "Debug reassignment of small outlines");
+  // Worst (min) certainty, for which a diacritic is allowed to make the base
+  // character worse and still be included.
+  double_VAR_H(noise_cert_basechar, -8.0, "Hingepoint for base char certainty");
+  // Worst (min) certainty, for which a non-overlapping diacritic is allowed to
+  // make the base character worse and still be included.
+  double_VAR_H(noise_cert_disjoint, -2.5, "Hingepoint for disjoint certainty");
+  // Worst (min) certainty, for which a diacritic is allowed to make a new
+  // stand-alone blob.
+  double_VAR_H(noise_cert_punc, -2.5, "Threshold for new punc char certainty");
+  // Factor of certainty margin for adding diacritics to not count as worse.
+  double_VAR_H(noise_cert_factor, 0.375,
+               "Scaling on certainty diff from Hingepoint");
+  INT_VAR_H(noise_maxperblob, 8, "Max diacritics to apply to a blob");
+  INT_VAR_H(noise_maxperword, 16, "Max diacritics to apply to a word");
   INT_VAR_H(debug_x_ht_level, 0, "Reestimate debug");
   BOOL_VAR_H(debug_acceptable_wds, false, "Dump word pass/fail chk");
   STRING_VAR_H(chs_leading_punct, "('`\"", "Leading punctuation");
@@ -137,6 +137,9 @@ class BLOBNBOX:public ELIST_LINK
     cblob_ptr = srcblob;
     area = static_cast<int>(srcblob->area());
   }
+  ~BLOBNBOX() {
+    if (owns_cblob_) delete cblob_ptr;
+  }
   static BLOBNBOX* RealBlob(C_OUTLINE* outline) {
     C_BLOB* blob = new C_BLOB(outline);
     return new BLOBNBOX(blob);
@@ -387,6 +390,7 @@ class BLOBNBOX:public ELIST_LINK
   void set_base_char_blob(BLOBNBOX* blob) {
     base_char_blob_ = blob;
   }
+  void set_owns_cblob(bool value) { owns_cblob_ = value; }
 
   bool UniquelyVertical() const {
     return vert_possible_ && !horz_possible_;
@@ -450,6 +454,7 @@ class BLOBNBOX:public ELIST_LINK
   // construction time.
   void ConstructionInit() {
     cblob_ptr = NULL;
+    owns_cblob_ = false;
     area = 0;
     area_stroke_width_ = 0.0f;
     horz_stroke_width_ = 0.0f;
@@ -525,6 +530,10 @@ class BLOBNBOX:public ELIST_LINK
   bool vert_possible_;           // Could be part of vertical flow.
   bool leader_on_left_;          // There is a leader to the left.
   bool leader_on_right_;         // There is a leader to the right.
+  // Iff true, then the destructor should delete the cblob_ptr.
+  // TODO(rays) migrate all uses to correctly setting this flag instead of
+  // deleting the C_BLOB before deleting the BLOBNBOX.
+  bool owns_cblob_;
 };
 
 class TO_ROW: public ELIST2_LINK
@@ -86,6 +86,18 @@ void BLOCK::rotate(const FCOORD& rotation) {
   box = *poly_block()->bounding_box();
 }
 
+// Returns the bounding box including the desired combination of upper and
+// lower noise/diacritic elements.
+TBOX BLOCK::restricted_bounding_box(bool upper_dots, bool lower_dots) const {
+  TBOX box;
+  // This is a read-only iteration of the rows in the block.
+  ROW_IT it(const_cast<ROW_LIST*>(&rows));
+  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+    box += it.data()->restricted_bounding_box(upper_dots, lower_dots);
+  }
+  return box;
+}
+
 /**
  * BLOCK::reflect_polygon_in_y_axis
  *
@@ -161,10 +161,14 @@ class BLOCK:public ELIST_LINK, public PDBLK
     median_size_.set_y(y);
   }
 
-  Pix* render_mask() {
-    return PDBLK::render_mask(re_rotation_);
+  Pix* render_mask(TBOX* mask_box) {
+    return PDBLK::render_mask(re_rotation_, mask_box);
   }
 
+  // Returns the bounding box including the desired combination of upper and
+  // lower noise/diacritic elements.
+  TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const;
+
   // Reflects the polygon in the y-axis and recomputes the bounding_box.
   // Does nothing to any contained rows/words/blobs etc.
   void reflect_polygon_in_y_axis();
@@ -80,6 +80,17 @@ ROW::ROW( //constructor
   rmargin_ = 0;
 }
 
+// Returns the bounding box including the desired combination of upper and
+// lower noise/diacritic elements.
+TBOX ROW::restricted_bounding_box(bool upper_dots, bool lower_dots) const {
+  TBOX box;
+  // This is a read-only iteration of the words in the row.
+  WERD_IT it(const_cast<WERD_LIST *>(&words));
+  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+    box += it.data()->restricted_bounding_box(upper_dots, lower_dots);
+  }
+  return box;
+}
 
 /**********************************************************************
  * ROW::recalc_bounding_box
@@ -85,6 +85,9 @@ class ROW:public ELIST_LINK
   TBOX bounding_box() const {  //return bounding box
     return bound_box;
   }
+  // Returns the bounding box including the desired combination of upper and
+  // lower noise/diacritic elements.
+  TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const;
 
   void set_lmargin(inT16 lmargin) {
     lmargin_ = lmargin;
@@ -1258,23 +1258,16 @@ int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const {
   return 0;
 }
 
-// Inserts the new_word and a corresponding WERD_RES before the current
-// position. The simple fields of the WERD_RES are copied from clone_res and
-// the resulting WERD_RES is returned for further setup with best_choice etc.
+// Inserts the new_word as a combination owned by a corresponding WERD_RES
+// before the current position. The simple fields of the WERD_RES are copied
+// from clone_res and the resulting WERD_RES is returned for further setup
+// with best_choice etc.
 WERD_RES* PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES& clone_res,
                                              WERD* new_word) {
-  // Insert new_word into the ROW.
-  WERD_IT w_it(row()->row->word_list());
-  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
-    WERD* word = w_it.data();
-    if (word == word_res->word)
-      break;
-  }
-  ASSERT_HOST(!w_it.cycled_list());
-  w_it.add_before_then_move(new_word);
   // Make a WERD_RES for the new_word.
   WERD_RES* new_res = new WERD_RES(new_word);
   new_res->CopySimpleFields(clone_res);
+  new_res->combination = true;
   // Insert into the appropriate place in the ROW_RES.
   WERD_RES_IT wr_it(&row()->word_res_list);
   for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
@@ -1477,6 +1470,33 @@ void PAGE_RES_IT::DeleteCurrentWord() {
   ResetWordIterator();
 }
 
+// Makes the current word a fuzzy space if not already fuzzy. Updates
+// corresponding part of combo if required.
+void PAGE_RES_IT::MakeCurrentWordFuzzy() {
+  WERD* real_word = word_res->word;
+  if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) {
+    real_word->set_flag(W_FUZZY_SP, true);
+    tprintf("Made word fuzzy at:");
+    real_word->bounding_box().print();
+    if (word_res->combination) {
+      // The next word should be the corresponding part of combo, but we have
+      // already stepped past it, so find it by search.
+      WERD_RES_IT wr_it(&row()->word_res_list);
+      for (wr_it.mark_cycle_pt();
+           !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) {
+      }
+      wr_it.forward();
+      ASSERT_HOST(wr_it.data()->part_of_combo);
+      real_word = wr_it.data()->word;
+      ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
+                  !real_word->flag(W_FUZZY_NON));
+      real_word->set_flag(W_FUZZY_SP, true);
+      tprintf("Made part of combo word fuzzy at:");
+      real_word->bounding_box().print();
+    }
+  }
+}
+
 /*************************************************************************
  * PAGE_RES_IT::restart_page
 *
@@ -708,6 +708,10 @@ class PAGE_RES_IT {
   // Deletes the current WERD_RES and its underlying WERD.
   void DeleteCurrentWord();
 
+  // Makes the current word a fuzzy space if not already fuzzy. Updates
+  // corresponding part of combo if required.
+  void MakeCurrentWordFuzzy();
+
   WERD_RES *forward() {  // Get next word.
     return internal_forward(false, false);
   }
@ -747,9 +751,9 @@ class PAGE_RES_IT {
|
|||||||
return next_block_res;
|
return next_block_res;
|
||||||
}
|
}
|
||||||
void rej_stat_word(); // for page/block/row
|
void rej_stat_word(); // for page/block/row
|
||||||
|
void ResetWordIterator();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void ResetWordIterator();
|
|
||||||
WERD_RES *internal_forward(bool new_block, bool empty_ok);
|
WERD_RES *internal_forward(bool new_block, bool empty_ok);
|
||||||
|
|
||||||
WERD_RES * prev_word_res; // previous word
|
WERD_RES * prev_word_res; // previous word
|
||||||
|
@@ -77,7 +77,6 @@ void PDBLK::set_sides( //set vertex lists
   right_it.add_list_before (right);
 }

-
 /**********************************************************************
  * PDBLK::contains
  *
@@ -126,7 +125,7 @@ void PDBLK::move( // reposition block

 // Returns a binary Pix mask with a 1 pixel for every pixel within the
 // block. Rotates the coordinate system by rerotation prior to rendering.
-Pix* PDBLK::render_mask(const FCOORD& rerotation) {
+Pix* PDBLK::render_mask(const FCOORD& rerotation, TBOX* mask_box) {
   TBOX rotated_box(box);
   rotated_box.rotate(rerotation);
   Pix* pix = pixCreate(rotated_box.width(), rotated_box.height(), 1);
@@ -163,6 +162,7 @@ Pix* PDBLK::render_mask(const FCOORD& rerotation) {
     pixRasterop(pix, 0, 0, rotated_box.width(), rotated_box.height(),
                 PIX_SET, NULL, 0, 0);
   }
+  if (mask_box != NULL) *mask_box = rotated_box;
   return pix;
 }

@@ -89,7 +89,9 @@ class PDBLK

   // Returns a binary Pix mask with a 1 pixel for every pixel within the
   // block. Rotates the coordinate system by rerotation prior to rendering.
-  Pix* render_mask(const FCOORD& rerotation);
+  // If not NULL, mask_box is filled with the position box of the returned
+  // mask image.
+  Pix *render_mask(const FCOORD &rerotation, TBOX *mask_box);

 #ifndef GRAPHICS_DISABLED
   ///draw histogram
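// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the commit): the new mask_box output of
// PDBLK::render_mask() lets a caller place the returned mask back into page
// coordinates without recomputing the rotated box. The helper name and setup
// are hypothetical.
static Pix* RenderBlockMask(PDBLK* block, const FCOORD& rerotation) {
  TBOX mask_box;
  Pix* mask = block->render_mask(rerotation, &mask_box);
  // mask_box now holds the position of 'mask' within the rerotated page, so
  // the mask can be composited at (mask_box.left(), mask_box.bottom()).
  return mask;
}
// ---------------------------------------------------------------------------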
@@ -160,23 +160,37 @@ WERD* WERD::ConstructFromSingleBlob(bool bol, bool eol, C_BLOB* blob) {
  * row being marked as FUZZY space.
  */

-TBOX WERD::bounding_box() {
-  TBOX box;                      // box being built
-  C_BLOB_IT rej_cblob_it = &rej_cblobs;  // rejected blobs
-
-  for (rej_cblob_it.mark_cycle_pt(); !rej_cblob_it.cycled_list();
-       rej_cblob_it.forward()) {
-    box += rej_cblob_it.data()->bounding_box();
+TBOX WERD::bounding_box() const { return restricted_bounding_box(true, true); }
+
+// Returns the bounding box including the desired combination of upper and
+// lower noise/diacritic elements.
+TBOX WERD::restricted_bounding_box(bool upper_dots, bool lower_dots) const {
+  TBOX box = true_bounding_box();
+  int bottom = box.bottom();
+  int top = box.top();
+  // This is a read-only iteration of the rejected blobs.
+  C_BLOB_IT it(const_cast<C_BLOB_LIST*>(&rej_cblobs));
+  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+    TBOX dot_box = it.data()->bounding_box();
+    if ((upper_dots || dot_box.bottom() <= top) &&
+        (lower_dots || dot_box.top() >= bottom)) {
+      box += dot_box;
+    }
   }
+  return box;
+}

-  C_BLOB_IT it = &cblobs;        // blobs of WERD
+// Returns the bounding box of only the good blobs.
+TBOX WERD::true_bounding_box() const {
+  TBOX box;  // box being built
+  // This is a read-only iteration of the good blobs.
+  C_BLOB_IT it(const_cast<C_BLOB_LIST*>(&cblobs));
   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
     box += it.data()->bounding_box();
   }
   return box;
 }


 /**
  * WERD::move
  *
|
|||||||
}
|
}
|
||||||
return new_werd;
|
return new_werd;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Removes noise from the word by moving small outlines to the rej_cblobs
|
||||||
|
// list, based on the size_threshold.
|
||||||
|
void WERD::CleanNoise(float size_threshold) {
|
||||||
|
C_BLOB_IT blob_it(&cblobs);
|
||||||
|
C_BLOB_IT rej_it(&rej_cblobs);
|
||||||
|
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
|
||||||
|
C_BLOB* blob = blob_it.data();
|
||||||
|
C_OUTLINE_IT ol_it(blob->out_list());
|
||||||
|
for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {
|
||||||
|
C_OUTLINE* outline = ol_it.data();
|
||||||
|
TBOX ol_box = outline->bounding_box();
|
||||||
|
int ol_size =
|
||||||
|
ol_box.width() > ol_box.height() ? ol_box.width() : ol_box.height();
|
||||||
|
if (ol_size < size_threshold) {
|
||||||
|
// This outline is too small. Move it to a separate blob in the
|
||||||
|
// reject blobs list.
|
||||||
|
C_BLOB* rej_blob = new C_BLOB(ol_it.extract());
|
||||||
|
rej_it.add_after_then_move(rej_blob);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (blob->out_list()->empty()) delete blob_it.extract();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extracts all the noise outlines and stuffs the pointers into the given
|
||||||
|
// vector of outlines. Afterwards, the outlines vector owns the pointers.
|
||||||
|
void WERD::GetNoiseOutlines(GenericVector<C_OUTLINE*>* outlines) {
|
||||||
|
C_BLOB_IT rej_it(&rej_cblobs);
|
||||||
|
for (rej_it.mark_cycle_pt(); !rej_it.empty(); rej_it.forward()) {
|
||||||
|
C_BLOB* blob = rej_it.extract();
|
||||||
|
C_OUTLINE_IT ol_it(blob->out_list());
|
||||||
|
outlines->push_back(ol_it.extract());
|
||||||
|
delete blob;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Adds the selected outlines to the indcated real blobs, and puts the rest
|
||||||
|
// back in rej_cblobs where they came from. Where the target_blobs entry is
|
||||||
|
// NULL, a run of wanted outlines is put into a single new blob.
|
||||||
|
// Ownership of the outlines is transferred back to the word. (Hence
|
||||||
|
// GenericVector and not PointerVector.)
|
||||||
|
// Returns true if any new blob was added to the start of the word, which
|
||||||
|
// suggests that it might need joining to the word before it, and likewise
|
||||||
|
// sets make_next_word_fuzzy true if any new blob was added to the end.
|
||||||
|
bool WERD::AddSelectedOutlines(const GenericVector<bool>& wanted,
|
||||||
|
const GenericVector<C_BLOB*>& target_blobs,
|
||||||
|
const GenericVector<C_OUTLINE*>& outlines,
|
||||||
|
bool* make_next_word_fuzzy) {
|
||||||
|
bool outline_added_to_start = false;
|
||||||
|
if (make_next_word_fuzzy != NULL) *make_next_word_fuzzy = false;
|
||||||
|
C_BLOB_IT rej_it(&rej_cblobs);
|
||||||
|
for (int i = 0; i < outlines.size(); ++i) {
|
||||||
|
C_OUTLINE* outline = outlines[i];
|
||||||
|
if (outline == NULL) continue; // Already used it.
|
||||||
|
if (wanted[i]) {
|
||||||
|
C_BLOB* target_blob = target_blobs[i];
|
||||||
|
TBOX noise_box = outline->bounding_box();
|
||||||
|
if (target_blob == NULL) {
|
||||||
|
target_blob = new C_BLOB(outline);
|
||||||
|
// Need to find the insertion point.
|
||||||
|
C_BLOB_IT blob_it(&cblobs);
|
||||||
|
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
|
||||||
|
blob_it.forward()) {
|
||||||
|
C_BLOB* blob = blob_it.data();
|
||||||
|
TBOX blob_box = blob->bounding_box();
|
||||||
|
if (blob_box.left() > noise_box.left()) {
|
||||||
|
if (blob_it.at_first() && !flag(W_FUZZY_SP) && !flag(W_FUZZY_NON)) {
|
||||||
|
// We might want to join this word to its predecessor.
|
||||||
|
outline_added_to_start = true;
|
||||||
|
}
|
||||||
|
blob_it.add_before_stay_put(target_blob);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (blob_it.cycled_list()) {
|
||||||
|
blob_it.add_to_end(target_blob);
|
||||||
|
if (make_next_word_fuzzy != NULL) *make_next_word_fuzzy = true;
|
||||||
|
}
|
||||||
|
// Add all consecutive wanted, but null-blob outlines to same blob.
|
||||||
|
C_OUTLINE_IT ol_it(target_blob->out_list());
|
||||||
|
while (i + 1 < outlines.size() && wanted[i + 1] &&
|
||||||
|
target_blobs[i + 1] == NULL) {
|
||||||
|
++i;
|
||||||
|
ol_it.add_to_end(outlines[i]);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Insert outline into this blob.
|
||||||
|
C_OUTLINE_IT ol_it(target_blob->out_list());
|
||||||
|
ol_it.add_to_end(outline);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Put back on noise list.
|
||||||
|
rej_it.add_to_end(new C_BLOB(outline));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return outline_added_to_start;
|
||||||
|
}
|
||||||
|
@ -114,7 +114,13 @@ class WERD : public ELIST2_LINK {
|
|||||||
script_id_ = id;
|
script_id_ = id;
|
||||||
}
|
}
|
||||||
|
|
||||||
TBOX bounding_box(); // compute bounding box
|
// Returns the (default) bounding box including all the dots.
|
||||||
|
TBOX bounding_box() const; // compute bounding box
|
||||||
|
// Returns the bounding box including the desired combination of upper and
|
||||||
|
// lower noise/diacritic elements.
|
||||||
|
TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const;
|
||||||
|
// Returns the bounding box of only the good blobs.
|
||||||
|
TBOX true_bounding_box() const;
|
||||||
|
|
||||||
const char *text() const { return correct.string(); }
|
const char *text() const { return correct.string(); }
|
||||||
void set_text(const char *new_text) { correct = new_text; }
|
void set_text(const char *new_text) { correct = new_text; }
|
||||||
@ -155,6 +161,26 @@ class WERD : public ELIST2_LINK {
|
|||||||
void plot_rej_blobs(ScrollView *window);
|
void plot_rej_blobs(ScrollView *window);
|
||||||
#endif // GRAPHICS_DISABLED
|
#endif // GRAPHICS_DISABLED
|
||||||
|
|
||||||
|
// Removes noise from the word by moving small outlines to the rej_cblobs
|
||||||
|
// list, based on the size_threshold.
|
||||||
|
void CleanNoise(float size_threshold);
|
||||||
|
|
||||||
|
// Extracts all the noise outlines and stuffs the pointers into the given
|
||||||
|
// vector of outlines. Afterwards, the outlines vector owns the pointers.
|
||||||
|
void GetNoiseOutlines(GenericVector<C_OUTLINE *> *outlines);
|
||||||
|
// Adds the selected outlines to the indcated real blobs, and puts the rest
|
||||||
|
// back in rej_cblobs where they came from. Where the target_blobs entry is
|
||||||
|
// NULL, a run of wanted outlines is put into a single new blob.
|
||||||
|
// Ownership of the outlines is transferred back to the word. (Hence
|
||||||
|
// GenericVector and not PointerVector.)
|
||||||
|
// Returns true if any new blob was added to the start of the word, which
|
||||||
|
// suggests that it might need joining to the word before it, and likewise
|
||||||
|
// sets make_next_word_fuzzy true if any new blob was added to the end.
|
||||||
|
bool AddSelectedOutlines(const GenericVector<bool> &wanted,
|
||||||
|
const GenericVector<C_BLOB *> &target_blobs,
|
||||||
|
const GenericVector<C_OUTLINE *> &outlines,
|
||||||
|
bool *make_next_word_fuzzy);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
uinT8 blanks; // no of blanks
|
uinT8 blanks; // no of blanks
|
||||||
uinT8 dummy; // padding
|
uinT8 dummy; // padding
|
||||||
|
@@ -286,22 +286,27 @@ void ColumnFinder::CorrectOrientation(TO_BLOCK* block,
 // thresholds_pix is expected to be present iff grey_pix is present and
 // can be an integer factor reduction of the grey_pix. It represents the
 // thresholds that were used to create the binary_pix from the grey_pix.
+// If diacritic_blobs is non-null, then diacritics/noise blobs, that would
+// confuse layout analysis by causing textline overlap, are placed there,
+// with the expectation that they will be reassigned to words later and
+// noise/diacriticness determined via classification.
 // Returns -1 if the user hits the 'd' key in the blocks window while running
 // in debug mode, which requests a retry with more debug info.
-int ColumnFinder::FindBlocks(PageSegMode pageseg_mode,
-                             Pix* scaled_color, int scaled_factor,
-                             TO_BLOCK* input_block, Pix* photo_mask_pix,
-                             Pix* thresholds_pix, Pix* grey_pix,
-                             BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks) {
+int ColumnFinder::FindBlocks(PageSegMode pageseg_mode, Pix* scaled_color,
+                             int scaled_factor, TO_BLOCK* input_block,
+                             Pix* photo_mask_pix, Pix* thresholds_pix,
+                             Pix* grey_pix, BLOCK_LIST* blocks,
+                             BLOBNBOX_LIST* diacritic_blobs,
+                             TO_BLOCK_LIST* to_blocks) {
   pixOr(photo_mask_pix, photo_mask_pix, nontext_map_);
   stroke_width_->FindLeaderPartitions(input_block, &part_grid_);
   stroke_width_->RemoveLineResidue(&big_parts_);
   FindInitialTabVectors(NULL, min_gutter_width_, tabfind_aligned_gap_fraction_,
                         input_block);
   SetBlockRuleEdges(input_block);
-  stroke_width_->GradeBlobsIntoPartitions(rerotate_, input_block, nontext_map_,
-                                          denorm_, cjk_script_, &projection_,
-                                          &part_grid_, &big_parts_);
+  stroke_width_->GradeBlobsIntoPartitions(
+      rerotate_, input_block, nontext_map_, denorm_, cjk_script_, &projection_,
+      diacritic_blobs, &part_grid_, &big_parts_);
   if (!PSM_SPARSE(pageseg_mode)) {
     ImageFind::FindImagePartitions(photo_mask_pix, rotation_, rerotate_,
                                    input_block, this, &part_grid_, &big_parts_);
@@ -1134,9 +1139,13 @@ void ColumnFinder::GridMergePartitions() {
           neighbour->Print();
         }
         rsearch.RemoveBBox();
-        gsearch.RepositionIterator();
+        if (!modified_box) {
+          // We are going to modify part, so remove it and re-insert it after.
+          gsearch.RemoveBBox();
+          rsearch.RepositionIterator();
+          modified_box = true;
+        }
         part->Absorb(neighbour, WidthCB());
-        modified_box = true;
       } else if (debug) {
         tprintf("Neighbour failed hgap test\n");
       }
@@ -1151,7 +1160,6 @@ void ColumnFinder::GridMergePartitions() {
       // or it will never be found by a full search.
       // Because the box has changed, it has to be removed first, otherwise
       // add_sorted may fail to keep a single copy of the pointer.
-      gsearch.RemoveBBox();
       part_grid_.InsertBBox(true, true, part);
       gsearch.RepositionIterator();
     }
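// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the commit): callers of the reworked
// FindBlocks() now own a BLOBNBOX_LIST that receives the blobs held back from
// layout analysis. The wrapper below is hypothetical and passes NULL for the
// optional grey/threshold images.
static int FindBlocksWithDiacritics(tesseract::ColumnFinder* finder,
                                    PageSegMode pageseg_mode,
                                    TO_BLOCK* input_block, Pix* photo_mask_pix,
                                    BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks,
                                    BLOBNBOX_LIST* diacritic_blobs) {
  // diacritic_blobs is filled here and later handed to Textord::TextordPage(),
  // which forwards it to TransferDiacriticsToBlockGroups().
  return finder->FindBlocks(pageseg_mode, NULL, 1, input_block, photo_mask_pix,
                            NULL, NULL, blocks, diacritic_blobs, to_blocks);
}
// ---------------------------------------------------------------------------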
@@ -155,13 +155,15 @@ class ColumnFinder : public TabFind {
   // thresholds_pix is expected to be present iff grey_pix is present and
   // can be an integer factor reduction of the grey_pix. It represents the
   // thresholds that were used to create the binary_pix from the grey_pix.
+  // Small blobs that confuse the segmentation into lines are placed into
+  // diacritic_blobs, with the intention that they be put into the most
+  // appropriate word after the rest of layout analysis.
   // Returns -1 if the user hits the 'd' key in the blocks window while running
   // in debug mode, which requests a retry with more debug info.
-  int FindBlocks(PageSegMode pageseg_mode,
-                 Pix* scaled_color, int scaled_factor,
-                 TO_BLOCK* block, Pix* photo_mask_pix,
-                 Pix* thresholds_pix, Pix* grey_pix,
-                 BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks);
+  int FindBlocks(PageSegMode pageseg_mode, Pix* scaled_color, int scaled_factor,
+                 TO_BLOCK* block, Pix* photo_mask_pix, Pix* thresholds_pix,
+                 Pix* grey_pix, BLOCK_LIST* blocks,
+                 BLOBNBOX_LIST* diacritic_blobs, TO_BLOCK_LIST* to_blocks);

   // Get the rotation required to deskew, and its inverse rotation.
   void GetDeskewVectors(FCOORD* deskew, FCOORD* reskew);
@@ -297,6 +297,25 @@ void ColPartition::DisownBoxesNoAssert() {
   }
 }

+// NULLs the owner of the blobs in this partition that are owned by this
+// partition and not leader blobs, removing them from the boxes_ list, thus
+// turning this partition back to a leader partition if it contains a leader,
+// or otherwise leaving it empty. Returns true if any boxes remain.
+bool ColPartition::ReleaseNonLeaderBoxes() {
+  BLOBNBOX_C_IT bb_it(&boxes_);
+  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
+    BLOBNBOX* bblob = bb_it.data();
+    if (bblob->flow() != BTFT_LEADER) {
+      if (bblob->owner() == this) bblob->set_owner(NULL);
+      bb_it.extract();
+    }
+  }
+  if (bb_it.empty()) return false;
+  flow_ = BTFT_LEADER;
+  ComputeLimits();
+  return true;
+}
+
 // Delete the boxes that this partition owns.
 void ColPartition::DeleteBoxes() {
   // Although the boxes_ list is a C_LIST, in some cases it owns the
@@ -831,6 +850,10 @@ ColPartition* ColPartition::SplitAt(int split_x) {
       bbox->set_owner(split_part);
     }
   }
+  if (it.empty()) {
+    // Possible if split-x passes through the first blob.
+    it.add_list_after(&split_part->boxes_);
+  }
   ASSERT_HOST(!it.empty());
   if (split_part->IsEmpty()) {
     // Split part ended up with nothing. Possible if split_x passes
@@ -1130,6 +1153,7 @@ bool ColPartition::MarkAsLeaderIfMonospaced() {
   if (best_end != NULL && best_end->total_cost() < blob_count) {
     // Good enough. Call it a leader.
     result = true;
+    bool modified_blob_list = false;
     for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
       BLOBNBOX* blob = it.data();
       TBOX box = blob->bounding_box();
@@ -1139,6 +1163,7 @@ bool ColPartition::MarkAsLeaderIfMonospaced() {
                   blob->bounding_box().right();
         if (blob->bounding_box().width() + gap > max_step) {
           it.extract();
+          modified_blob_list = true;
           continue;
         }
       }
@@ -1147,12 +1172,14 @@ bool ColPartition::MarkAsLeaderIfMonospaced() {
                   it.data_relative(-1)->bounding_box().right();
         if (blob->bounding_box().width() + gap > max_step) {
           it.extract();
+          modified_blob_list = true;
           break;
         }
       }
       blob->set_region_type(BRT_TEXT);
       blob->set_flow(BTFT_LEADER);
     }
+    if (modified_blob_list) ComputeLimits();
     blob_type_ = BRT_TEXT;
     flow_ = BTFT_LEADER;
   } else if (textord_debug_tabfind) {
@@ -324,6 +324,40 @@ static bool TestCompatibleCandidates(const ColPartition& part, bool debug,
   return true;
 }

+// Computes and returns the total overlap of all partitions in the grid.
+// If overlap_grid is non-null, it is filled with a grid that holds empty
+// partitions representing the union of all overlapped partitions.
+int ColPartitionGrid::ComputeTotalOverlap(ColPartitionGrid** overlap_grid) {
+  int total_overlap = 0;
+  // Iterate the ColPartitions in the grid.
+  ColPartitionGridSearch gsearch(this);
+  gsearch.StartFullSearch();
+  ColPartition* part;
+  while ((part = gsearch.NextFullSearch()) != NULL) {
+    ColPartition_CLIST neighbors;
+    const TBOX& part_box = part->bounding_box();
+    FindOverlappingPartitions(part_box, part, &neighbors);
+    ColPartition_C_IT n_it(&neighbors);
+    bool any_part_overlap = false;
+    for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
+      const TBOX& n_box = n_it.data()->bounding_box();
+      int overlap = n_box.intersection(part_box).area();
+      if (overlap > 0 && overlap_grid != NULL) {
+        if (*overlap_grid == NULL) {
+          *overlap_grid = new ColPartitionGrid(gridsize(), bleft(), tright());
+        }
+        (*overlap_grid)->InsertBBox(true, true, n_it.data()->ShallowCopy());
+        if (!any_part_overlap) {
+          (*overlap_grid)->InsertBBox(true, true, part->ShallowCopy());
+        }
+      }
+      any_part_overlap = true;
+      total_overlap += overlap;
+    }
+  }
+  return total_overlap;
+}
+
 // Finds all the ColPartitions in the grid that overlap with the given
 // box and returns them SortByBoxLeft(ed) and uniqued in the given list.
 // Any partition equal to not_this (may be NULL) is excluded.
@@ -901,6 +935,7 @@ void ColPartitionGrid::ReTypeBlobs(BLOBNBOX_LIST* im_blobs) {
   while ((part = gsearch.NextFullSearch()) != NULL) {
     BlobRegionType blob_type = part->blob_type();
     BlobTextFlowType flow = part->flow();
+    bool any_blobs_moved = false;
     if (blob_type == BRT_POLYIMAGE || blob_type == BRT_RECTIMAGE) {
       BLOBNBOX_C_IT blob_it(part->boxes());
       for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
@@ -918,6 +953,7 @@ void ColPartitionGrid::ReTypeBlobs(BLOBNBOX_LIST* im_blobs) {
           ASSERT_HOST(blob->cblob()->area() != 0);
           blob->set_owner(NULL);
           blob_it.extract();
+          any_blobs_moved = true;
         } else {
           blob->set_region_type(blob_type);
           if (blob->flow() != BTFT_LEADER)
@@ -938,6 +974,11 @@ void ColPartitionGrid::ReTypeBlobs(BLOBNBOX_LIST* im_blobs) {
           delete blob;
         }
       }
+    } else if (any_blobs_moved) {
+      gsearch.RemoveBBox();
+      part->ComputeLimits();
+      InsertBBox(true, true, part);
+      gsearch.RepositionIterator();
     }
   }
 }
@@ -1048,6 +1089,24 @@ void ColPartitionGrid::DeleteUnknownParts(TO_BLOCK* block) {
   block->DeleteUnownedNoise();
 }

+// Deletes all the partitions in the grid that are NOT of flow type BTFT_LEADER.
+void ColPartitionGrid::DeleteNonLeaderParts() {
+  ColPartitionGridSearch gsearch(this);
+  gsearch.StartFullSearch();
+  ColPartition* part;
+  while ((part = gsearch.NextFullSearch()) != NULL) {
+    if (part->flow() != BTFT_LEADER) {
+      gsearch.RemoveBBox();
+      if (part->ReleaseNonLeaderBoxes()) {
+        InsertBBox(true, true, part);
+        gsearch.RepositionIterator();
+      } else {
+        delete part;
+      }
+    }
+  }
+}
+
 // Finds and marks text partitions that represent figure captions.
 void ColPartitionGrid::FindFigureCaptions() {
   // For each image region find its best candidate text caption region,
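// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the commit): measuring how much the
// textline partitions overlap, with and without collecting the offending
// regions. 'grid' is a hypothetical, already-populated ColPartitionGrid.
static int MeasureOverlap(tesseract::ColPartitionGrid* grid, bool want_regions) {
  tesseract::ColPartitionGrid* overlap_grid = NULL;
  int overlap = grid->ComputeTotalOverlap(want_regions ? &overlap_grid : NULL);
  if (overlap_grid != NULL) {
    // The returned grid holds shallow copies only; delete them explicitly,
    // as DetectAndRemoveNoise does.
    overlap_grid->DeleteParts();
    delete overlap_grid;
  }
  return overlap;
}
// ---------------------------------------------------------------------------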
@@ -63,6 +63,11 @@ class ColPartitionGrid : public BBGrid<ColPartition,
                                        const ColPartition*>* confirm_cb,
                          ColPartition* part);

+  // Computes and returns the total overlap of all partitions in the grid.
+  // If overlap_grid is non-null, it is filled with a grid that holds empty
+  // partitions representing the union of all overlapped partitions.
+  int ComputeTotalOverlap(ColPartitionGrid** overlap_grid);
+
   // Finds all the ColPartitions in the grid that overlap with the given
   // box and returns them SortByBoxLeft(ed) and uniqued in the given list.
   // Any partition equal to not_this (may be NULL) is excluded.
@@ -165,6 +170,10 @@ class ColPartitionGrid : public BBGrid<ColPartition,
   // all the blobs in them.
   void DeleteUnknownParts(TO_BLOCK* block);

+  // Deletes all the partitions in the grid that are NOT of flow type
+  // BTFT_LEADER.
+  void DeleteNonLeaderParts();
+
   // Finds and marks text partitions that represent figure captions.
   void FindFigureCaptions();

@@ -109,6 +109,13 @@ const float kSizeRatioToReject = 2.0;
 const int kMaxLargeOverlaps = 3;
 // Expansion factor for search box for good neighbours.
 const double kNeighbourSearchFactor = 2.5;
+// Factor of increase of overlap when adding diacritics to make an image noisy.
+const double kNoiseOverlapGrowthFactor = 4.0;
+// Fraction of the image size to add overlap when adding diacritics for an
+// image to qualify as noisy.
+const double kNoiseOverlapAreaFactor = 1.0 / 512;
+// Ratio of perimeter^2/area for a blob to be considered noise vs i dot.
+const double kShapePerimeterRatio = 3.0;

 StrokeWidth::StrokeWidth(int gridsize,
                          const ICOORD& bleft, const ICOORD& tright)
@@ -343,14 +350,11 @@ void StrokeWidth::RemoveLineResidue(ColPartition_LIST* big_part_list) {
 // part_grid is the output grid of textline partitions.
 // Large blobs that cause overlap are put in separate partitions and added
 // to the big_parts list.
-void StrokeWidth::GradeBlobsIntoPartitions(const FCOORD& rerotation,
-                                           TO_BLOCK* block,
-                                           Pix* nontext_pix,
-                                           const DENORM* denorm,
-                                           bool cjk_script,
-                                           TextlineProjection* projection,
-                                           ColPartitionGrid* part_grid,
-                                           ColPartition_LIST* big_parts) {
+void StrokeWidth::GradeBlobsIntoPartitions(
+    const FCOORD& rerotation, TO_BLOCK* block, Pix* nontext_pix,
+    const DENORM* denorm, bool cjk_script, TextlineProjection* projection,
+    BLOBNBOX_LIST* diacritic_blobs, ColPartitionGrid* part_grid,
+    ColPartition_LIST* big_parts) {
   nontext_map_ = nontext_pix;
   projection_ = projection;
   denorm_ = denorm;
@@ -363,7 +367,7 @@ void StrokeWidth::GradeBlobsIntoPartitions(const FCOORD& rerotation,
   if (cjk_script) {
     FixBrokenCJK(block);
   }
-  FindTextlineFlowDirection(true);
+  FindTextlineFlowDirection(false);
   projection_->ConstructProjection(block, rerotation, nontext_map_);
   if (textord_tabfind_show_strokewidths) {
     ScrollView* line_blobs_win = MakeWindow(0, 0, "Initial textline Blobs");
@@ -375,7 +379,19 @@ void StrokeWidth::GradeBlobsIntoPartitions(const FCOORD& rerotation,
   // Clear and re Insert to take advantage of the removed diacritics.
   Clear();
   InsertBlobs(block);
-  FindInitialPartitions(rerotation, block, part_grid, big_parts);
+  FCOORD skew;
+  FindTextlineFlowDirection(true);
+  PartitionFindResult r = FindInitialPartitions(
+      rerotation, true, block, diacritic_blobs, part_grid, big_parts, &skew);
+  if (r == PFR_NOISE) {
+    tprintf("Detected %d diacritics\n", diacritic_blobs->length());
+    // Noise was found, and removed.
+    Clear();
+    InsertBlobs(block);
+    FindTextlineFlowDirection(true);
+    r = FindInitialPartitions(rerotation, false, block, diacritic_blobs,
+                              part_grid, big_parts, &skew);
+  }
   nontext_map_ = NULL;
   projection_ = NULL;
   denorm_ = NULL;
|
|||||||
// minimize overlap and smoothes the types with neighbours and the color
|
// minimize overlap and smoothes the types with neighbours and the color
|
||||||
// image if provided. rerotation is used to rotate the coordinate space
|
// image if provided. rerotation is used to rotate the coordinate space
|
||||||
// back to the nontext_map_ image.
|
// back to the nontext_map_ image.
|
||||||
void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation,
|
// If find_problems is true, detects possible noise pollution by the amount
|
||||||
TO_BLOCK* block,
|
// of partition overlap that is created by the diacritics. If excessive, the
|
||||||
ColPartitionGrid* part_grid,
|
// noise is separated out into diacritic blobs, and PFR_NOISE is returned.
|
||||||
ColPartition_LIST* big_parts) {
|
// [TODO(rays): if the partition overlap is caused by heavy skew, deskews
|
||||||
|
// the components, saves the skew_angle and returns PFR_SKEW.] If the return
|
||||||
|
// is not PFR_OK, the job is incomplete, and FindInitialPartitions must be
|
||||||
|
// called again after cleaning up the partly done work.
|
||||||
|
PartitionFindResult StrokeWidth::FindInitialPartitions(
|
||||||
|
const FCOORD& rerotation, bool find_problems, TO_BLOCK* block,
|
||||||
|
BLOBNBOX_LIST* diacritic_blobs, ColPartitionGrid* part_grid,
|
||||||
|
ColPartition_LIST* big_parts, FCOORD* skew_angle) {
|
||||||
FindVerticalTextChains(part_grid);
|
FindVerticalTextChains(part_grid);
|
||||||
FindHorizontalTextChains(part_grid);
|
FindHorizontalTextChains(part_grid);
|
||||||
if (textord_tabfind_show_strokewidths) {
|
if (textord_tabfind_show_strokewidths) {
|
||||||
@ -1231,6 +1254,10 @@ void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation,
|
|||||||
part_grid->DisplayBoxes(chains_win_);
|
part_grid->DisplayBoxes(chains_win_);
|
||||||
projection_->DisplayProjection();
|
projection_->DisplayProjection();
|
||||||
}
|
}
|
||||||
|
if (find_problems) {
|
||||||
|
// TODO(rays) Do something to find skew, set skew_angle and return if there
|
||||||
|
// is some.
|
||||||
|
}
|
||||||
part_grid->SplitOverlappingPartitions(big_parts);
|
part_grid->SplitOverlappingPartitions(big_parts);
|
||||||
EasyMerges(part_grid);
|
EasyMerges(part_grid);
|
||||||
RemoveLargeUnusedBlobs(block, part_grid, big_parts);
|
RemoveLargeUnusedBlobs(block, part_grid, big_parts);
|
||||||
@ -1239,8 +1266,14 @@ void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation,
|
|||||||
rerotation));
|
rerotation));
|
||||||
while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_,
|
while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_,
|
||||||
grid_box, rerotation));
|
grid_box, rerotation));
|
||||||
|
int pre_overlap = part_grid->ComputeTotalOverlap(NULL);
|
||||||
TestDiacritics(part_grid, block);
|
TestDiacritics(part_grid, block);
|
||||||
MergeDiacritics(block, part_grid);
|
MergeDiacritics(block, part_grid);
|
||||||
|
if (find_problems && diacritic_blobs != NULL &&
|
||||||
|
DetectAndRemoveNoise(pre_overlap, grid_box, block, part_grid,
|
||||||
|
diacritic_blobs)) {
|
||||||
|
return PFR_NOISE;
|
||||||
|
}
|
||||||
if (textord_tabfind_show_strokewidths) {
|
if (textord_tabfind_show_strokewidths) {
|
||||||
textlines_win_ = MakeWindow(400, 400, "GoodTextline blobs");
|
textlines_win_ = MakeWindow(400, 400, "GoodTextline blobs");
|
||||||
part_grid->DisplayBoxes(textlines_win_);
|
part_grid->DisplayBoxes(textlines_win_);
|
||||||
@ -1260,6 +1293,57 @@ void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation,
|
|||||||
smoothed_win_ = MakeWindow(800, 400, "Smoothed blobs");
|
smoothed_win_ = MakeWindow(800, 400, "Smoothed blobs");
|
||||||
part_grid->DisplayBoxes(smoothed_win_);
|
part_grid->DisplayBoxes(smoothed_win_);
|
||||||
}
|
}
|
||||||
|
return PFR_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Detects noise by a significant increase in partition overlap from
|
||||||
|
// pre_overlap to now, and removes noise from the union of all the overlapping
|
||||||
|
// partitions, placing the blobs in diacritic_blobs. Returns true if any noise
|
||||||
|
// was found and removed.
|
||||||
|
bool StrokeWidth::DetectAndRemoveNoise(int pre_overlap, const TBOX& grid_box,
|
||||||
|
TO_BLOCK* block,
|
||||||
|
ColPartitionGrid* part_grid,
|
||||||
|
BLOBNBOX_LIST* diacritic_blobs) {
|
||||||
|
ColPartitionGrid* noise_grid = NULL;
|
||||||
|
int post_overlap = part_grid->ComputeTotalOverlap(&noise_grid);
|
||||||
|
if (pre_overlap == 0) pre_overlap = 1;
|
||||||
|
BLOBNBOX_IT diacritic_it(diacritic_blobs);
|
||||||
|
if (noise_grid != NULL) {
|
||||||
|
if (post_overlap > pre_overlap * kNoiseOverlapGrowthFactor &&
|
||||||
|
post_overlap > grid_box.area() * kNoiseOverlapAreaFactor) {
|
||||||
|
// This is noisy enough to fix.
|
||||||
|
if (textord_tabfind_show_strokewidths) {
|
||||||
|
ScrollView* noise_win = MakeWindow(1000, 500, "Noise Areas");
|
||||||
|
noise_grid->DisplayBoxes(noise_win);
|
||||||
|
}
|
||||||
|
part_grid->DeleteNonLeaderParts();
|
||||||
|
BLOBNBOX_IT blob_it(&block->noise_blobs);
|
||||||
|
ColPartitionGridSearch rsearch(noise_grid);
|
||||||
|
for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
|
||||||
|
BLOBNBOX* blob = blob_it.data();
|
||||||
|
blob->ClearNeighbours();
|
||||||
|
if (!blob->IsDiacritic() || blob->owner() != NULL)
|
||||||
|
continue; // Not a noise candidate.
|
||||||
|
TBOX blob_box(blob->bounding_box());
|
||||||
|
TBOX search_box(blob->bounding_box());
|
||||||
|
search_box.pad(gridsize(), gridsize());
|
||||||
|
rsearch.StartRectSearch(search_box);
|
||||||
|
ColPartition* part = rsearch.NextRectSearch();
|
||||||
|
if (part != NULL) {
|
||||||
|
// Consider blob as possible noise.
|
||||||
|
blob->set_owns_cblob(true);
|
||||||
|
blob->compute_bounding_box();
|
||||||
|
diacritic_it.add_after_then_move(blob_it.extract());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
noise_grid->DeleteParts();
|
||||||
|
delete noise_grid;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
noise_grid->DeleteParts();
|
||||||
|
delete noise_grid;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper verifies that blob's neighbour in direction dir is good to add to a
|
// Helper verifies that blob's neighbour in direction dir is good to add to a
|
||||||
|
@ -41,6 +41,14 @@ enum LeftOrRight {
|
|||||||
LR_RIGHT
|
LR_RIGHT
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Return value from FindInitialPartitions indicates detection of severe
|
||||||
|
// skew or noise.
|
||||||
|
enum PartitionFindResult {
|
||||||
|
PFR_OK, // Everything is OK.
|
||||||
|
PFR_SKEW, // Skew was detected and rotated.
|
||||||
|
PFR_NOISE // Noise was detected and removed.
|
||||||
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The StrokeWidth class holds all the normal and large blobs.
|
* The StrokeWidth class holds all the normal and large blobs.
|
||||||
* It is used to find good large blobs and move them to the normal blobs
|
* It is used to find good large blobs and move them to the normal blobs
|
||||||
@ -110,12 +118,10 @@ class StrokeWidth : public BlobGrid {
|
|||||||
// part_grid is the output grid of textline partitions.
|
// part_grid is the output grid of textline partitions.
|
||||||
// Large blobs that cause overlap are put in separate partitions and added
|
// Large blobs that cause overlap are put in separate partitions and added
|
||||||
// to the big_parts list.
|
// to the big_parts list.
|
||||||
void GradeBlobsIntoPartitions(const FCOORD& rerotation,
|
void GradeBlobsIntoPartitions(const FCOORD& rerotation, TO_BLOCK* block,
|
||||||
TO_BLOCK* block,
|
Pix* nontext_pix, const DENORM* denorm,
|
||||||
Pix* nontext_pix,
|
bool cjk_script, TextlineProjection* projection,
|
||||||
const DENORM* denorm,
|
BLOBNBOX_LIST* diacritic_blobs,
|
||||||
bool cjk_script,
|
|
||||||
TextlineProjection* projection,
|
|
||||||
ColPartitionGrid* part_grid,
|
ColPartitionGrid* part_grid,
|
||||||
ColPartition_LIST* big_parts);
|
ColPartition_LIST* big_parts);
|
||||||
|
|
||||||
@ -205,10 +211,26 @@ class StrokeWidth : public BlobGrid {
|
|||||||
// minimize overlap and smoothes the types with neighbours and the color
|
// minimize overlap and smoothes the types with neighbours and the color
|
||||||
// image if provided. rerotation is used to rotate the coordinate space
|
// image if provided. rerotation is used to rotate the coordinate space
|
||||||
// back to the nontext_map_ image.
|
// back to the nontext_map_ image.
|
||||||
void FindInitialPartitions(const FCOORD& rerotation,
|
// If find_problems is true, detects possible noise pollution by the amount
|
||||||
TO_BLOCK* block,
|
// of partition overlap that is created by the diacritics. If excessive, the
|
||||||
ColPartitionGrid* part_grid,
|
// noise is separated out into diacritic blobs, and PFR_NOISE is returned.
|
||||||
ColPartition_LIST* big_parts);
|
// [TODO(rays): if the partition overlap is caused by heavy skew, deskews
|
||||||
|
// the components, saves the skew_angle and returns PFR_SKEW.] If the return
|
||||||
|
// is not PFR_OK, the job is incomplete, and FindInitialPartitions must be
|
||||||
|
// called again after cleaning up the partly done work.
|
||||||
|
PartitionFindResult FindInitialPartitions(const FCOORD& rerotation,
|
||||||
|
bool find_problems, TO_BLOCK* block,
|
||||||
|
BLOBNBOX_LIST* diacritic_blobs,
|
||||||
|
ColPartitionGrid* part_grid,
|
||||||
|
ColPartition_LIST* big_parts,
|
||||||
|
FCOORD* skew_angle);
|
||||||
|
// Detects noise by a significant increase in partition overlap from
|
||||||
|
// pre_overlap to now, and removes noise from the union of all the overlapping
|
||||||
|
// partitions, placing the blobs in diacritic_blobs. Returns true if any noise
|
||||||
|
// was found and removed.
|
||||||
|
bool DetectAndRemoveNoise(int pre_overlap, const TBOX& grid_box,
|
||||||
|
TO_BLOCK* block, ColPartitionGrid* part_grid,
|
||||||
|
BLOBNBOX_LIST* diacritic_blobs);
|
||||||
// Finds vertical chains of text-like blobs and puts them in ColPartitions.
|
// Finds vertical chains of text-like blobs and puts them in ColPartitions.
|
||||||
void FindVerticalTextChains(ColPartitionGrid* part_grid);
|
void FindVerticalTextChains(ColPartitionGrid* part_grid);
|
||||||
// Finds horizontal chains of text-like blobs and puts them in ColPartitions.
|
// Finds horizontal chains of text-like blobs and puts them in ColPartitions.
|
||||||
|
@ -974,12 +974,12 @@ bool TableFinder::HasLeaderAdjacent(const ColPartition& part) {
|
|||||||
hsearch.StartSideSearch(x, bottom, top);
|
hsearch.StartSideSearch(x, bottom, top);
|
||||||
ColPartition* leader = NULL;
|
ColPartition* leader = NULL;
|
||||||
while ((leader = hsearch.NextSideSearch(right_to_left)) != NULL) {
|
while ((leader = hsearch.NextSideSearch(right_to_left)) != NULL) {
|
||||||
// This should not happen, they are in different grids.
|
|
||||||
ASSERT_HOST(&part != leader);
|
|
||||||
// The leader could be a horizontal ruling in the grid.
|
// The leader could be a horizontal ruling in the grid.
|
||||||
// Make sure it is actually a leader.
|
// Make sure it is actually a leader.
|
||||||
if (leader->flow() != BTFT_LEADER)
|
if (leader->flow() != BTFT_LEADER)
|
||||||
continue;
|
continue;
|
||||||
|
// This should not happen, they are in different grids.
|
||||||
|
ASSERT_HOST(&part != leader);
|
||||||
// Make sure the leader shares a page column with the partition,
|
// Make sure the leader shares a page column with the partition,
|
||||||
// otherwise we are spreading across columns.
|
// otherwise we are spreading across columns.
|
||||||
if (!part.IsInSameColumnAs(*leader))
|
if (!part.IsInSameColumnAs(*leader))
|
||||||
|
@ -268,7 +268,7 @@ Textord::~Textord() {
|
|||||||
void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew,
|
void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew,
|
||||||
int width, int height, Pix* binary_pix,
|
int width, int height, Pix* binary_pix,
|
||||||
Pix* thresholds_pix, Pix* grey_pix,
|
Pix* thresholds_pix, Pix* grey_pix,
|
||||||
bool use_box_bottoms,
|
bool use_box_bottoms, BLOBNBOX_LIST* diacritic_blobs,
|
||||||
BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks) {
|
BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks) {
|
||||||
page_tr_.set_x(width);
|
page_tr_.set_x(width);
|
||||||
page_tr_.set_y(height);
|
page_tr_.set_y(height);
|
||||||
@ -340,9 +340,9 @@ void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew,
|
|||||||
make_single_word(pageseg_mode == PSM_SINGLE_CHAR,
|
make_single_word(pageseg_mode == PSM_SINGLE_CHAR,
|
||||||
to_block->get_rows(), to_block->block->row_list());
|
to_block->get_rows(), to_block->block->row_list());
|
||||||
}
|
}
|
||||||
cleanup_blocks(PSM_WORD_FIND_ENABLED(pageseg_mode), blocks);
|
|
||||||
// Remove empties.
|
// Remove empties.
|
||||||
|
cleanup_blocks(PSM_WORD_FIND_ENABLED(pageseg_mode), blocks);
|
||||||
|
TransferDiacriticsToBlockGroups(diacritic_blobs, blocks);
|
||||||
// Compute the margins for each row in the block, to be used later for
|
// Compute the margins for each row in the block, to be used later for
|
||||||
// paragraph detection.
|
// paragraph detection.
|
||||||
BLOCK_IT b_it(blocks);
|
BLOCK_IT b_it(blocks);
|
||||||
|
@ -22,6 +22,7 @@
|
|||||||
#define TESSERACT_TEXTORD_TEXTORD_H__
|
#define TESSERACT_TEXTORD_TEXTORD_H__
|
||||||
|
|
||||||
#include "ccstruct.h"
|
#include "ccstruct.h"
|
||||||
|
#include "bbgrid.h"
|
||||||
#include "blobbox.h"
|
#include "blobbox.h"
|
||||||
#include "gap_map.h"
|
#include "gap_map.h"
|
||||||
#include "publictypes.h" // For PageSegMode.
|
#include "publictypes.h" // For PageSegMode.
|
||||||
@ -35,6 +36,35 @@ class ScrollView;
|
|||||||
|
|
||||||
namespace tesseract {
|
namespace tesseract {
|
||||||
|
|
||||||
|
// A simple class that can be used by BBGrid to hold a word and an expanded
|
||||||
|
// bounding box that makes it easy to find words to put diacritics.
|
||||||
|
class WordWithBox {
|
||||||
|
public:
|
||||||
|
WordWithBox() : word_(NULL) {}
|
||||||
|
explicit WordWithBox(WERD *word)
|
||||||
|
: word_(word), bounding_box_(word->bounding_box()) {
|
||||||
|
int height = bounding_box_.height();
|
||||||
|
bounding_box_.pad(height, height);
|
||||||
|
}
|
||||||
|
|
||||||
|
const TBOX &bounding_box() const { return bounding_box_; }
|
||||||
|
// Returns the bounding box of only the good blobs.
|
||||||
|
TBOX true_bounding_box() const { return word_->true_bounding_box(); }
|
||||||
|
C_BLOB_LIST *RejBlobs() const { return word_->rej_cblob_list(); }
|
||||||
|
const WERD *word() const { return word_; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Borrowed pointer to a real word somewhere that must outlive this class.
|
||||||
|
WERD *word_;
|
||||||
|
// Cached expanded bounding box of the word, padded all round by its height.
|
||||||
|
TBOX bounding_box_;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Make it usable by BBGrid.
|
||||||
|
CLISTIZEH(WordWithBox)
|
||||||
|
typedef BBGrid<WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT> WordGrid;
|
||||||
|
typedef GridSearch<WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT> WordSearch;
|
||||||
|
|
||||||
class Textord {
|
class Textord {
|
||||||
public:
|
public:
|
||||||
explicit Textord(CCStruct* ccstruct);
|
explicit Textord(CCStruct* ccstruct);
|
||||||
@ -47,11 +77,13 @@ class Textord {
|
|||||||
// thresholds_pix is expected to be present iff grey_pix is present and
|
// thresholds_pix is expected to be present iff grey_pix is present and
|
||||||
// can be an integer factor reduction of the grey_pix. It represents the
|
// can be an integer factor reduction of the grey_pix. It represents the
|
||||||
// thresholds that were used to create the binary_pix from the grey_pix.
|
// thresholds that were used to create the binary_pix from the grey_pix.
|
||||||
void TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew,
|
// diacritic_blobs contain small confusing components that should be added
|
||||||
int width, int height, Pix* binary_pix,
|
// to the appropriate word(s) in case they are really diacritics.
|
||||||
Pix* thresholds_pix, Pix* grey_pix,
|
void TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width,
|
||||||
bool use_box_bottoms,
|
int height, Pix *binary_pix, Pix *thresholds_pix,
|
||||||
BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks);
|
Pix *grey_pix, bool use_box_bottoms,
|
||||||
|
BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks,
|
||||||
|
TO_BLOCK_LIST *to_blocks);
|
||||||
|
|
||||||
// If we were supposed to return only a single textline, and there is more
|
// If we were supposed to return only a single textline, and there is more
|
||||||
// than one, clean up and leave only the best.
|
// than one, clean up and leave only the best.
|
||||||
@ -212,6 +244,17 @@ class Textord {
|
|||||||
// Remove outlines that are a tiny fraction in either width or height
|
// Remove outlines that are a tiny fraction in either width or height
|
||||||
// of the word height.
|
// of the word height.
|
||||||
void clean_small_noise_from_words(ROW *row);
|
void clean_small_noise_from_words(ROW *row);
|
||||||
|
// Groups blocks by rotation, then, for each group, makes a WordGrid and calls
|
||||||
|
// TransferDiacriticsToWords to copy the diacritic blobs to the most
|
||||||
|
// appropriate words in the group of blocks. Source blobs are not touched.
|
||||||
|
void TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs,
|
||||||
|
BLOCK_LIST* blocks);
|
||||||
|
// Places a copy of blobs that are near a word (after applying rotation to the
|
||||||
|
// blob) in the most appropriate word, unless there is doubt, in which case a
|
||||||
|
// blob can end up in two words. Source blobs are not touched.
|
||||||
|
void TransferDiacriticsToWords(BLOBNBOX_LIST *diacritic_blobs,
|
||||||
|
const FCOORD &rotation, WordGrid *word_grid);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// makerow.cpp ///////////////////////////////////////////
|
// makerow.cpp ///////////////////////////////////////////
|
||||||
BOOL_VAR_H(textord_single_height_mode, false,
|
BOOL_VAR_H(textord_single_height_mode, false,
|
||||||
|
@ -283,12 +283,13 @@ void fix_row_pitch(TO_ROW *bad_row, // row to fix
|
|||||||
bad_row->space_threshold =
|
bad_row->space_threshold =
|
||||||
(bad_row->min_space + bad_row->max_nonspace) / 2;
|
(bad_row->min_space + bad_row->max_nonspace) / 2;
|
||||||
bad_row->space_size = bad_row->fixed_pitch;
|
bad_row->space_size = bad_row->fixed_pitch;
|
||||||
if (bad_row->char_cells.empty ())
|
if (bad_row->char_cells.empty() && !bad_row->blob_list()->empty()) {
|
||||||
tune_row_pitch (bad_row, &bad_row->projection,
|
tune_row_pitch (bad_row, &bad_row->projection,
|
||||||
bad_row->projection_left, bad_row->projection_right,
|
bad_row->projection_left, bad_row->projection_right,
|
||||||
(bad_row->fixed_pitch +
|
(bad_row->fixed_pitch +
|
||||||
bad_row->max_nonspace * 3) / 4, bad_row->fixed_pitch,
|
bad_row->max_nonspace * 3) / 4, bad_row->fixed_pitch,
|
||||||
sp_sd, mid_cuts, &bad_row->char_cells, FALSE);
|
sp_sd, mid_cuts, &bad_row->char_cells, FALSE);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if (bad_row->pitch_decision == PITCH_CORR_PROP
|
else if (bad_row->pitch_decision == PITCH_CORR_PROP
|
||||||
|| bad_row->pitch_decision == PITCH_DEF_PROP) {
|
|| bad_row->pitch_decision == PITCH_DEF_PROP) {
|
||||||
@ -1279,13 +1280,13 @@ float tune_row_pitch2( //find fp cells
|
|||||||
|
|
||||||
best_sp_sd = initial_pitch;
|
best_sp_sd = initial_pitch;
|
||||||
|
|
||||||
if (textord_disable_pitch_test) {
|
best_pitch = static_cast<int>(initial_pitch);
|
||||||
|
if (textord_disable_pitch_test || best_pitch <= textord_pitch_range) {
|
||||||
return initial_pitch;
|
return initial_pitch;
|
||||||
}
|
}
|
||||||
sum_proj = new STATS[textord_pitch_range * 2 + 1];
|
sum_proj = new STATS[textord_pitch_range * 2 + 1];
|
||||||
if (sum_proj == NULL)
|
if (sum_proj == NULL)
|
||||||
return initial_pitch;
|
return initial_pitch;
|
||||||
best_pitch = (inT32) initial_pitch;
|
|
||||||
|
|
||||||
for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range;
|
for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range;
|
||||||
pitch_delta++)
|
pitch_delta++)
|
||||||
@ -1293,12 +1294,12 @@ float tune_row_pitch2( //find fp cells
|
|||||||
best_pitch +
|
best_pitch +
|
||||||
pitch_delta + 1);
|
pitch_delta + 1);
|
||||||
for (pixel = projection_left; pixel <= projection_right; pixel++) {
|
for (pixel = projection_left; pixel <= projection_right; pixel++) {
|
||||||
for (pitch_delta = -textord_pitch_range;
|
for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range;
|
||||||
pitch_delta <= textord_pitch_range; pitch_delta++)
|
pitch_delta++) {
|
||||||
sum_proj[textord_pitch_range +
|
sum_proj[textord_pitch_range + pitch_delta].add(
|
||||||
pitch_delta].add ((pixel - projection_left) % (best_pitch +
|
(pixel - projection_left) % (best_pitch + pitch_delta),
|
||||||
pitch_delta),
|
projection->pile_count(pixel));
|
||||||
projection->pile_count (pixel));
|
}
|
||||||
}
|
}
|
||||||
best_count = sum_proj[textord_pitch_range].pile_count (0);
|
best_count = sum_proj[textord_pitch_range].pile_count (0);
|
||||||
best_delta = 0;
|
best_delta = 0;
|
||||||
@ -1427,7 +1428,7 @@ float compute_pitch_sd( //find fp cells
|
|||||||
if (blob_it.empty ())
|
if (blob_it.empty ())
|
||||||
return space_size * 10;
|
return space_size * 10;
|
||||||
#ifndef GRAPHICS_DISABLED
|
#ifndef GRAPHICS_DISABLED
|
||||||
if (testing_on && to_win > 0) {
|
if (testing_on && to_win != NULL) {
|
||||||
blob_box = blob_it.data ()->bounding_box ();
|
blob_box = blob_it.data ()->bounding_box ();
|
||||||
projection->plot (to_win, projection_left,
|
projection->plot (to_win, projection_left,
|
||||||
row->intercept (), 1.0f, -1.0f, ScrollView::CORAL);
|
row->intercept (), 1.0f, -1.0f, ScrollView::CORAL);
|
||||||
@@ -1476,7 +1477,7 @@ float compute_pitch_sd( //find fp cells
     tprintf ("\n");
   }
 #ifndef GRAPHICS_DISABLED
-  if (textord_show_fixed_cuts && blob_count > 0 && to_win > 0)
+  if (textord_show_fixed_cuts && blob_count > 0 && to_win != NULL)
     plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);
 #endif
   seg_it.set_to_list (&seg_list);
@@ -1566,7 +1567,7 @@ float compute_pitch_sd2( //find fp cells
     return initial_pitch * 10;
   }
 #ifndef GRAPHICS_DISABLED
-  if (testing_on && to_win > 0) {
+  if (testing_on && to_win != NULL) {
     projection->plot (to_win, projection_left,
       row->intercept (), 1.0f, -1.0f, ScrollView::CORAL);
   }
@@ -1602,7 +1603,7 @@ float compute_pitch_sd2( //find fp cells
     tprintf ("\n");
   }
 #ifndef GRAPHICS_DISABLED
-  if (textord_show_fixed_cuts && blob_count > 0 && to_win > 0)
+  if (textord_show_fixed_cuts && blob_count > 0 && to_win != NULL)
     plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);
 #endif
   seg_it.set_to_list (&seg_list);

@@ -38,13 +38,18 @@
 
 #include "allheaders.h"
 
-const ERRCODE BLOCKLESS_BLOBS = "Warning:some blobs assigned to no block";
+// Gridsize for word grid when reassigning diacritics to words. Not critical.
+const int kWordGridSize = 50;
 
 #undef EXTERN
 #define EXTERN
 
 #define MAX_NEAREST_DIST 600 //for block skew stats
 
+namespace tesseract {
+
+CLISTIZE(WordWithBox)
+
 /**********************************************************************
  * SetBlobStrokeWidth
  *
@@ -143,7 +148,6 @@ void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob) {
   }
 }
 
-
 /**********************************************************************
  * assign_blobs_to_blocks2
  *
@@ -193,7 +197,6 @@ void assign_blobs_to_blocks2(Pix* pix,
   }
 }
 
-namespace tesseract {
 /**********************************************************************
  * find_components
  *
@@ -400,7 +403,7 @@ void Textord::cleanup_nontext_block(BLOCK* block) {
  * Delete empty blocks, rows from the page.
  **********************************************************************/
 
-void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks) {
+void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST* blocks) {
   BLOCK_IT block_it = blocks;    //iterator
   ROW_IT row_it;                 //row iterator
 
@@ -420,18 +423,18 @@ void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks) {
     if (clean_noise) {
       row_it.set_to_list(block->row_list());
       for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
+        ROW* row = row_it.data();
         ++num_rows_all;
-        clean_small_noise_from_words(row_it.data());
-        if ((textord_noise_rejrows && !row_it.data()->word_list()->empty() &&
-             clean_noise_from_row(row_it.data())) ||
-            row_it.data()->word_list()->empty()) {
+        clean_small_noise_from_words(row);
+        if ((textord_noise_rejrows && !row->word_list()->empty() &&
+             clean_noise_from_row(row)) ||
+            row->word_list()->empty()) {
           delete row_it.extract();  // lose empty row.
         } else {
           if (textord_noise_rejwords)
             clean_noise_from_words(row_it.data());
           if (textord_blshift_maxshift >= 0)
-            tweak_row_baseline(row_it.data(),
-                               textord_blshift_maxshift,
+            tweak_row_baseline(row, textord_blshift_maxshift,
                                textord_blshift_xfraction);
           ++num_rows;
         }
@@ -640,16 +643,16 @@ void Textord::clean_noise_from_words( //remove empties
           && (!word_it.at_first () || !blob_it.at_first ()))
           dot_count += 2;
       }
-      if (dot_count > 2) {
+      if (dot_count > 2 && !word->flag(W_REP_CHAR)) {
         if (dot_count > norm_count * textord_noise_normratio * 2)
          word_dud[word_index] = 2;
         else if (dot_count > norm_count * textord_noise_normratio)
          word_dud[word_index] = 1;
         else
          word_dud[word_index] = 0;
-      }
-      else
+      } else {
         word_dud[word_index] = 0;
+      }
       if (word_dud[word_index] == 2)
         dud_words++;
       else
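The dud scoring above compares the count of dot-sized blobs against the count of normal-sized blobs scaled by textord_noise_normratio, and the new !word->flag(W_REP_CHAR) test keeps repeated-character (leader) words from being scored as noise. A compact restatement of the same thresholds as an illustrative helper (names are mine, not part of the patch):

    // Returns 0 (keep), 1 (suspicious) or 2 (noise) using the thresholds from the
    // hunk above. Parameter names are illustrative only.
    static int WordDudLevel(int dot_count, int norm_count,
                            double noise_normratio, bool is_rep_char) {
      if (dot_count <= 2 || is_rep_char) return 0;
      if (dot_count > norm_count * noise_normratio * 2) return 2;
      if (dot_count > norm_count * noise_normratio) return 1;
      return 0;
    }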
@@ -661,11 +664,11 @@ void Textord::clean_noise_from_words( //remove empties
   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
     if (word_dud[word_index] == 2
       || (word_dud[word_index] == 1 && dud_words > ok_words)) {
-      word = word_it.data ();    //current word
-                                 //rejected blobs
-      blob_it.set_to_list (word->rej_cblob_list ());
-                                 //move from blobs
-      blob_it.add_list_after (word->cblob_list ());
+      word = word_it.data();  // Current word.
+      // Previously we threw away the entire word.
+      // Now just aggressively throw all small blobs into the reject list, where
+      // the classifier can decide whether they are actually needed.
+      word->CleanNoise(textord_noise_sizelimit * row->x_height());
     }
     word_index++;
   }
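The old code moved every outline of a dud word into the reject list; the new call delegates to WERD::CleanNoise with a size limit derived from the row x-height. A minimal sketch of that kind of filtering, assuming the helper simply moves blobs smaller than the limit onto the reject list; the real CleanNoise lives elsewhere in this commit and may differ:

    // Hypothetical stand-in for a CleanNoise-style pass (not the real method):
    // keep big blobs, move anything smaller than size_limit to the reject list
    // so the classifier can later decide whether to use it as a diacritic.
    void MoveSmallBlobsToReject(C_BLOB_LIST* blobs, C_BLOB_LIST* rej_blobs,
                                float size_limit) {
      C_BLOB_IT blob_it(blobs);
      C_BLOB_IT rej_it(rej_blobs);
      for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
        TBOX box = blob_it.data()->bounding_box();
        if (box.height() < size_limit && box.width() < size_limit)
          rej_it.add_to_end(blob_it.extract());
      }
    }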
@@ -705,6 +708,176 @@ void Textord::clean_small_noise_from_words(ROW *row) {
     }
   }
 }
+
+// Local struct to hold a group of blocks.
+struct BlockGroup {
+  BlockGroup() : rotation(1.0f, 0.0f), angle(0.0f), min_xheight(1.0f) {}
+  explicit BlockGroup(BLOCK* block)
+      : bounding_box(block->bounding_box()),
+        rotation(block->re_rotation()),
+        angle(block->re_rotation().angle()),
+        min_xheight(block->x_height()) {
+    blocks.push_back(block);
+  }
+  // Union of block bounding boxes.
+  TBOX bounding_box;
+  // Common rotation of the blocks.
+  FCOORD rotation;
+  // Angle of rotation.
+  float angle;
+  // Min xheight of the blocks.
+  float min_xheight;
+  // Collection of borrowed pointers to the blocks in the group.
+  GenericVector<BLOCK*> blocks;
+};
+
+// Groups blocks by rotation, then, for each group, makes a WordGrid and calls
+// TransferDiacriticsToWords to copy the diacritic blobs to the most
+// appropriate words in the group of blocks. Source blobs are not touched.
+void Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs,
+                                              BLOCK_LIST* blocks) {
+  // Angle difference larger than this is too much to consider equal.
+  // They should only be in multiples of M_PI/2 anyway.
+  const double kMaxAngleDiff = 0.01;  // About 0.6 degrees.
+  PointerVector<BlockGroup> groups;
+  BLOCK_IT bk_it(blocks);
+  for (bk_it.mark_cycle_pt(); !bk_it.cycled_list(); bk_it.forward()) {
+    BLOCK* block = bk_it.data();
+    if (block->poly_block() != NULL && !block->poly_block()->IsText()) {
+      continue;
+    }
+    // Linear search of the groups to find a matching rotation.
+    float block_angle = block->re_rotation().angle();
+    int best_g = 0;
+    float best_angle_diff = MAX_FLOAT32;
+    for (int g = 0; g < groups.size(); ++g) {
+      double angle_diff = fabs(block_angle - groups[g]->angle);
+      if (angle_diff > M_PI) angle_diff = fabs(angle_diff - 2.0 * M_PI);
+      if (angle_diff < best_angle_diff) {
+        best_angle_diff = angle_diff;
+        best_g = g;
+      }
+    }
+    if (best_angle_diff > kMaxAngleDiff) {
+      groups.push_back(new BlockGroup(block));
+    } else {
+      groups[best_g]->blocks.push_back(block);
+      groups[best_g]->bounding_box += block->bounding_box();
+      float x_height = block->x_height();
+      if (x_height < groups[best_g]->min_xheight)
+        groups[best_g]->min_xheight = x_height;
+    }
+  }
+  // Now process each group of blocks.
+  PointerVector<WordWithBox> word_ptrs;
+  for (int g = 0; g < groups.size(); ++g) {
+    const BlockGroup* group = groups[g];
+    tprintf("group %d, xh=%g, %d blocks\n", g, group->min_xheight,
+            group->blocks.size());
+    WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(),
+                       group->bounding_box.topright());
+    for (int b = 0; b < group->blocks.size(); ++b) {
+      tprintf("block %d, %d rows\n", b, group->blocks[b]->row_list()->length());
+      ROW_IT row_it(group->blocks[b]->row_list());
+      for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
+        ROW* row = row_it.data();
+        tprintf("%d words in row\n", row->word_list()->length());
+        // Put the words of the row into the grid.
+        WERD_IT w_it(row->word_list());
+        for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
+          WERD* word = w_it.data();
+          WordWithBox* box_word = new WordWithBox(word);
+          word_grid.InsertBBox(true, true, box_word);
+          // Save the pointer where it will be auto-deleted.
+          word_ptrs.push_back(box_word);
+        }
+      }
+    }
+    FCOORD rotation = group->rotation;
+    // Make it a forward rotation that will transform blob coords to block.
+    rotation.set_y(-rotation.y());
+    TransferDiacriticsToWords(diacritic_blobs, rotation, &word_grid);
+  }
+}
+
+// Places a copy of blobs that are near a word (after applying rotation to the
+// blob) in the most appropriate word, unless there is doubt, in which case a
+// blob can end up in two words. Source blobs are not touched.
+void Textord::TransferDiacriticsToWords(BLOBNBOX_LIST* diacritic_blobs,
+                                        const FCOORD& rotation,
+                                        WordGrid* word_grid) {
+  WordSearch ws(word_grid);
+  BLOBNBOX_IT b_it(diacritic_blobs);
+  // Apply rotation to each blob before finding the nearest words. The rotation
+  // allows us to only consider above/below placement and not left/right on
+  // vertical text, because all text is horizontal here.
+  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+    BLOBNBOX* blobnbox = b_it.data();
+    TBOX blob_box = blobnbox->bounding_box();
+    blob_box.rotate(rotation);
+    ws.StartRectSearch(blob_box);
+    // Above/below refer to word position relative to diacritic. Since some
+    // scripts eg Kannada/Telugu habitually put diacritics below words, and
+    // others eg Thai/Vietnamese/Latin put most diacritics above words, try
+    // for both if there isn't much in it.
+    WordWithBox* best_above_word = NULL;
+    WordWithBox* best_below_word = NULL;
+    int best_above_distance = 0;
+    int best_below_distance = 0;
+    for (WordWithBox* word = ws.NextRectSearch(); word != NULL;
+         word = ws.NextRectSearch()) {
+      if (word->word()->flag(W_REP_CHAR)) continue;
+      TBOX word_box = word->true_bounding_box();
+      int x_distance = blob_box.x_gap(word_box);
+      int y_distance = blob_box.y_gap(word_box);
+      if (x_distance > 0) {
+        // Arbitrarily divide x-distance by 2 if there is a major y overlap,
+        // and the word is to the left of the diacritic. If the
+        // diacritic is a dropped broken character between two words, this will
+        // help send all the pieces to a single word, instead of splitting them
+        // over the 2 words.
+        if (word_box.major_y_overlap(blob_box) &&
+            blob_box.left() > word_box.right()) {
+          x_distance /= 2;
+        }
+        y_distance += x_distance;
+      }
+      if (word_box.y_middle() > blob_box.y_middle() &&
+          (best_above_word == NULL || y_distance < best_above_distance)) {
+        best_above_word = word;
+        best_above_distance = y_distance;
+      }
+      if (word_box.y_middle() <= blob_box.y_middle() &&
+          (best_below_word == NULL || y_distance < best_below_distance)) {
+        best_below_word = word;
+        best_below_distance = y_distance;
+      }
+    }
+    bool above_good =
+        best_above_word != NULL &&
+        (best_below_word == NULL ||
+         best_above_distance < best_below_distance + blob_box.height());
+    bool below_good =
+        best_below_word != NULL && best_below_word != best_above_word &&
+        (best_above_word == NULL ||
+         best_below_distance < best_above_distance + blob_box.height());
+    if (below_good) {
+      C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob());
+      copied_blob->rotate(rotation);
+      // Put the blob into the word's reject blobs list.
+      C_BLOB_IT blob_it(best_below_word->RejBlobs());
+      blob_it.add_to_end(copied_blob);
+    }
+    if (above_good) {
+      C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob());
+      copied_blob->rotate(rotation);
+      // Put the blob into the word's reject blobs list.
+      C_BLOB_IT blob_it(best_above_word->RejBlobs());
+      blob_it.add_to_end(copied_blob);
+    }
+  }
+}
+
 }  // tesseract
 
 /**********************************************************************
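The attachment rule at the end of TransferDiacriticsToWords can be summarised as: copy the diacritic to the nearest word above, the nearest word below, or both, whenever the other candidate is not closer by more than the blob's own height. A compact restatement as a free function, illustrative only and not part of the patch:

    struct Attachment { bool to_above; bool to_below; };

    // Mirrors the above_good/below_good tests from the hunk above: a candidate
    // wins unless the opposite candidate is closer by more than the blob height.
    static Attachment DecideAttachment(bool has_above, int above_dist,
                                       bool has_below, int below_dist,
                                       int blob_height, bool same_word) {
      Attachment result = {false, false};
      result.to_above = has_above &&
          (!has_below || above_dist < below_dist + blob_height);
      result.to_below = has_below && !same_word &&
          (!has_above || below_dist < above_dist + blob_height);
      return result;
    }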
@@ -820,33 +993,3 @@ void tweak_row_baseline(ROW *row,
   free_mem(xstarts);
   free_mem(coeffs);
 }
-
-/**********************************************************************
- * blob_y_order
- *
- * Sort function to sort blobs in y from page top.
- **********************************************************************/
-
-inT32 blob_y_order(              //sort function
-                   void *item1,  //items to compare
-                   void *item2) {
-                                 //converted ptr
-  BLOBNBOX *blob1 = *(BLOBNBOX **) item1;
-                                 //converted ptr
-  BLOBNBOX *blob2 = *(BLOBNBOX **) item2;
-
-  if (blob1->bounding_box ().bottom () > blob2->bounding_box ().bottom ())
-    return -1;
-  else if (blob1->bounding_box ().bottom () <
-    blob2->bounding_box ().bottom ())
-    return 1;
-  else {
-    if (blob1->bounding_box ().left () < blob2->bounding_box ().left ())
-      return -1;
-    else if (blob1->bounding_box ().left () >
-      blob2->bounding_box ().left ())
-      return 1;
-    else
-      return 0;
-  }
-}

@@ -29,29 +29,14 @@
 struct Pix;
 namespace tesseract {
 class Tesseract;
-}
-
-void make_blocks_from_blobs(                  //convert & textord
-                            TBLOB *tessblobs,       //tess style input
-                            const char *filename,   //blob file
-                            ICOORD page_tr,         //top right
-                            BOOL8 do_shift,         //shift tess coords
-                            BLOCK_LIST *blocks      //block list
-                            );
 void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob);
 void assign_blobs_to_blocks2(Pix* pix, BLOCK_LIST *blocks,
                              TO_BLOCK_LIST *port_blocks);
-void textord_page(                            //make rows & words
-                  ICOORD page_tr,             //top right
-                  BLOCK_LIST *blocks,         //block list
-                  TO_BLOCK_LIST *land_blocks, //rotated for landscape
-                  TO_BLOCK_LIST *port_blocks, //output list
-                  tesseract::Tesseract*
-                  );
+}  // namespace tesseract
 void tweak_row_baseline(ROW *row,
                         double blshift_maxshift,
                         double blshift_xfraction);
-inT32 blob_y_order(              //sort function
-                   void *item1,  //items to compare
-                   void *item2);
 #endif