diff --git a/dict/dawg.cpp b/dict/dawg.cpp
index 21da8e26e..ed304245d 100644
--- a/dict/dawg.cpp
+++ b/dict/dawg.cpp
@@ -98,6 +98,32 @@ int Dawg::check_for_words(const char *filename,
   return misses;
 }
 
+void Dawg::iterate_words(const UNICHARSET &unicharset,
+                         TessCallback1<const char *> *cb) const {
+  const NODE_REF kStartNode = 0;  // The recursive walk begins at node 0.
+  iterate_words_rec(WERD_CHOICE(&unicharset), kStartNode, cb);
+}
+
+void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far,
+                             NODE_REF to_explore,
+                             TessCallback1<const char *> *cb) const {
+  // Depth-first walk: extend the prefix along every child of to_explore.
+  NodeChildVector kids;
+  unichar_ids_of(to_explore, &kids);
+  for (int k = 0; k < kids.size(); ++k) {
+    WERD_CHOICE extended(word_so_far);
+    extended.append_unichar_id(kids[k].unichar_id, 1, 0.0, 0.0);
+    if (end_of_word(kids[k].edge_ref)) {
+      // The extended prefix is a complete word: report its text form.
+      STRING text;
+      extended.string_and_lengths(&text, NULL);
+      cb->Run(text.string());
+    }
+    const NODE_REF child = next_node(kids[k].edge_ref);
+    if (child != 0) iterate_words_rec(extended, child, cb);
+  }
+}
+
 bool Dawg::match_words(WERD_CHOICE *word, inT32 index,
                        NODE_REF node, UNICHAR_ID wildcard) const {
   EDGE_REF edge;
@@ -286,12 +312,12 @@ void SquishedDawg::read_squished_dawg(FILE *file,
   int unicharset_size;
   fread(&unicharset_size, sizeof(inT32), 1, file);
   fread(&num_edges_, sizeof(inT32), 1, file);
-  ASSERT_HOST(num_edges_ > 0);  // DAWG should not be empty
 
   if (swap) {
     unicharset_size = reverse32(unicharset_size);
     num_edges_ = reverse32(num_edges_);
   }
+  ASSERT_HOST(num_edges_ > 0);  // DAWG should not be empty
   Dawg::init(type, lang, perm, unicharset_size, debug_level);
 
   edges_ = (EDGE_ARRAY) memalloc(sizeof(EDGE_RECORD) * num_edges_);
@@ -318,13 +344,13 @@ NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const {
 
   node_map = (NODE_MAP) malloc(sizeof(EDGE_REF) * num_edges_);
 
-  for (edge=0; edge < num_edges_; edge++)       // init all slots
+  for (edge = 0; edge < num_edges_; edge++)       // init all slots
     node_map [edge] = -1;
 
   node_counter = num_forward_edges(0);
 
   *num_nodes   = 0;
-  for (edge=0; edge < num_edges_; edge++) {     // search all slots
+  for (edge = 0; edge < num_edges_; edge++) {     // search all slots
 
     if (forward_edge(edge)) {
       (*num_nodes)++;                          // count nodes links
@@ -332,6 +358,7 @@ NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const {
       num_edges = num_forward_edges(edge);
       if (edge != 0) node_counter += num_edges;
       edge += num_edges;
+      if (edge >= num_edges_) break;
       if (backward_edge(edge)) while (!last_edge(edge++));
       edge--;
     }
@@ -369,7 +396,7 @@ void SquishedDawg::write_squished_dawg(FILE *file) {
     tprintf("%d edges in DAWG\n", num_edges);
   }
 
-  for (edge=0; edge<num_edges_; edge++) {
+  for (edge = 0; edge < num_edges_; edge++) {
     if (forward_edge(edge)) {  // write forward edges
       do {
         old_index = next_node_from_edge_rec(edges_[edge]);
@@ -379,6 +406,7 @@ void SquishedDawg::write_squished_dawg(FILE *file) {
         set_next_node(edge, old_index);
       } while (!last_edge(edge++));
 
+      if (edge >= num_edges_) break;
       if (backward_edge(edge))  // skip back links
         while (!last_edge(edge++));
 
diff --git a/dict/dawg.h b/dict/dawg.h
index 2606f6edf..81c213863 100644
--- a/dict/dawg.h
+++ b/dict/dawg.h
@@ -34,6 +34,7 @@
 #include "elst.h"
 #include "ratngs.h"
 #include "params.h"
+#include "tesscallback.h"
 
 #ifndef __GNUC__
 #ifdef __MSW32__
@@ -142,6 +143,11 @@ class Dawg {
                       const UNICHARSET &unicharset,
                       bool enable_wildcard) const;
 
+  // For each word in the Dawg, call the given (permanent) callback with the
+  // text (UTF-8) version of the word.
+  void iterate_words(const UNICHARSET &unicharset,
+                     TessCallback1<const char *> *cb) const;
+
   // Pure virtual function that should be implemented by the derived classes.
 
   /// Returns the edge that corresponds to the letter out of this node.
@@ -268,6 +274,11 @@ class Dawg {
   bool match_words(WERD_CHOICE *word, inT32 index,
                    NODE_REF node, UNICHAR_ID wildcard) const;
 
+  // Recursively iterate over all words in a dawg (see public iterate_words).
+  void iterate_words_rec(const WERD_CHOICE &word_so_far,
+                         NODE_REF to_explore,
+                         TessCallback1<const char *> *cb) const;
+
   // Member Variables.
   DawgType type_;
   STRING lang_;
diff --git a/dict/dict.cpp b/dict/dict.cpp
index add041aa3..5ad88cbac 100644
--- a/dict/dict.cpp
+++ b/dict/dict.cpp
@@ -16,7 +16,10 @@
 //
 ///////////////////////////////////////////////////////////////////////
 
+#include <stdio.h>
+
 #include "dict.h"
+#include "unicodes.h"
 
 #ifdef _MSC_VER
 #pragma warning(disable:4244)  // Conversion warnings
@@ -41,6 +44,8 @@ Dict::Dict(Image* image_ptr)
                        getImage()->getCCUtil()->params()),
       BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
                        getImage()->getCCUtil()->params()),
+      BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
+                       getImage()->getCCUtil()->params()),
       BOOL_INIT_MEMBER(load_punc_dawg, true, "Load dawg with punctuation"
                        " patterns.", getImage()->getCCUtil()->params()),
       BOOL_INIT_MEMBER(load_number_dawg, true, "Load dawg with number"
@@ -48,6 +53,8 @@ Dict::Dict(Image* image_ptr)
       BOOL_INIT_MEMBER(load_fixed_length_dawgs, true, "Load fixed length dawgs"
                        " (e.g. for non-space delimited languages)",
                        getImage()->getCCUtil()->params()),
+      BOOL_INIT_MEMBER(load_bigram_dawg, false, "Load dawg with special word "
+                       "bigrams.", getImage()->getCCUtil()->params()),
       double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
                     "Score multiplier for word matches which have good case and"
                     "are frequent in the given language (lower is better).",
@@ -70,6 +77,9 @@ Dict::Dict(Image* image_ptr)
                     "Score multiplier for poorly cased strings that are not in"
                     " the dictionary and generally look like garbage (lower is"
                     " better).", getImage()->getCCUtil()->params()),
+      STRING_MEMBER(output_ambig_words_file, "",
+                    "Output file for ambiguities found in the dictionary",
+                    getImage()->getCCUtil()->params()),
       INT_MEMBER(dawg_debug_level, 0, "Set to 1 for general debug info"
                  ", to 2 for more details, to 3 to see all the debug messages",
                  getImage()->getCCUtil()->params()),
@@ -104,6 +114,12 @@ Dict::Dict(Image* image_ptr)
                   "Make AcceptableChoice() always return false. Useful"
                   " when there is a need to explore all segmentations",
                   getImage()->getCCUtil()->params()),
+      double_MEMBER(stopper_ambiguity_threshold_gain, 8.0,
+                    "Gain factor for ambiguity threshold.",
+                    getImage()->getCCUtil()->params()),
+      double_MEMBER(stopper_ambiguity_threshold_offset, 1.5,
+                    "Certainty offset for ambiguity threshold.",
+                    getImage()->getCCUtil()->params()),
       BOOL_MEMBER(save_raw_choices, false, "Save all explored raw choices",
                   getImage()->getCCUtil()->params()),
       INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
@@ -130,6 +146,11 @@ Dict::Dict(Image* image_ptr)
       BOOL_MEMBER(segment_segcost_rating, 0,
                   "incorporate segmentation cost in word rating?",
                   getImage()->getCCUtil()->params()),
+      BOOL_MEMBER(segment_nonalphabetic_script, false,
+                  "Don't use any alphabetic-specific tricks. "
+                  "Set to true in the traineddata config file for"
+                  " scripts that are cursive or inherently fixed-pitch",
+                  getImage()->getCCUtil()->params()),
       double_MEMBER(segment_reward_script, 0.95,
                     "Score multipler for script consistency within a word. "
                     "Being a 'reward' factor, it should be <= 1. "
@@ -144,10 +165,10 @@ Dict::Dict(Image* image_ptr)
       double_MEMBER(segment_reward_chartype, 0.97,
                     "Score multipler for char type consistency within a word. ",
                     getImage()->getCCUtil()->params()),
-     double_MEMBER(segment_reward_ngram_best_choice, 0.99,
-                   "Score multipler for ngram permuter's best choice"
-                   " (only used in the Han script path).",
-                   getImage()->getCCUtil()->params()),
+      double_MEMBER(segment_reward_ngram_best_choice, 0.99,
+                    "Score multipler for ngram permuter's best choice"
+                    " (only used in the Han script path).",
+                    getImage()->getCCUtil()->params()),
       BOOL_MEMBER(save_doc_words, 0, "Save Document Words",
                   getImage()->getCCUtil()->params()),
       BOOL_MEMBER(doc_dict_enable, 1, "Enable Document Dictionary ",
@@ -182,14 +203,17 @@ Dict::Dict(Image* image_ptr)
   hyphen_unichar_id_ = INVALID_UNICHAR_ID;
   document_words_ = NULL;
   pending_words_ = NULL;
+  bigram_dawg_ = unambig_dawg_ = NULL;  // Load() may leave either one unset.
   freq_dawg_ = NULL;
   punc_dawg_ = NULL;
   max_fixed_length_dawgs_wdlen_ = -1;
   wordseg_rating_adjust_factor_ = -1.0f;
+  output_ambig_words_file_ = NULL;
 }
 
 Dict::~Dict() {
   if (hyphen_word_ != NULL) delete hyphen_word_;
+  if (output_ambig_words_file_ != NULL) fclose(output_ambig_words_file_);
 }
 
 void Dict::Load() {
@@ -199,6 +223,10 @@ void Dict::Load() {
   if (dawgs_.length() != 0) this->End();
 
   hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
+  equivalent_symbols_.clear();  // Load() can run again: rebuild, don't append.
+  LoadEquivalenceList(kHyphenLikeUTF8);
+  LoadEquivalenceList(kApostropheLikeUTF8);
+
   TessdataManager &tessdata_manager =
     getImage()->getCCUtil()->tessdata_manager;
 
@@ -219,12 +247,26 @@ void Dict::Load() {
       new SquishedDawg(tessdata_manager.GetDataFilePtr(),
                        DAWG_TYPE_NUMBER, lang, NUMBER_PERM, dawg_debug_level);
   }
-  if (tessdata_manager.SeekToStart(TESSDATA_FREQ_DAWG)) {
+  if (load_bigram_dawg && tessdata_manager.SeekToStart(TESSDATA_BIGRAM_DAWG)) {
+    bigram_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
+                                    DAWG_TYPE_WORD, // doesn't actually matter.
+                                    lang,
+                                    COMPOUND_PERM,  // doesn't actually matter.
+                                    dawg_debug_level);
+  }
+  if (load_freq_dawg && tessdata_manager.SeekToStart(TESSDATA_FREQ_DAWG)) {
     freq_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
                                   DAWG_TYPE_WORD, lang, FREQ_DAWG_PERM,
                                   dawg_debug_level);
     dawgs_ += freq_dawg_;
   }
+  if (load_unambig_dawg &&
+      tessdata_manager.SeekToStart(TESSDATA_UNAMBIG_DAWG)) {
+    unambig_dawg_ = new SquishedDawg(tessdata_manager.GetDataFilePtr(),
+                                     DAWG_TYPE_WORD, lang, SYSTEM_DAWG_PERM,
+                                     dawg_debug_level);
+    dawgs_ += unambig_dawg_;
+  }
 
   if (((STRING &)user_words_suffix).length() > 0) {
     Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
@@ -232,7 +274,8 @@ void Dict::Load() {
                               dawg_debug_level);
     name = getImage()->getCCUtil()->language_data_path_prefix;
     name += user_words_suffix;
-    if (!trie_ptr->read_word_list(name.string(), getUnicharset())) {
+    if (!trie_ptr->read_word_list(name.string(), getUnicharset(),
+                                  Trie::RRP_REVERSE_IF_HAS_RTL)) {
       tprintf("Error: failed to load %s\n", name.string());
       exit(1);
     }
@@ -295,6 +338,7 @@ void Dict::End() {
   dawgs_.delete_data_pointers();
   successors_.delete_data_pointers();
   dawgs_.clear();
+  delete bigram_dawg_; bigram_dawg_ = NULL;  // Load() may not reassign it.
   successors_.clear();
   document_words_ = NULL;
   max_fixed_length_dawgs_wdlen_ = -1;
@@ -304,12 +348,38 @@ void Dict::End() {
   }
 }
 
+// Create unicharset adaptations of known, short lists of UTF-8 equivalent
+// characters (think all hyphen-like symbols).  The first version of the
+// list is taken as equivalent for matching against the dictionary.
+void Dict::LoadEquivalenceList(const char *unichar_strings[]) {
+  // Append a fresh list, then fill it in place through a reference.
+  equivalent_symbols_.push_back(GenericVectorEqEq<UNICHAR_ID>());
+  GenericVectorEqEq<UNICHAR_ID> &ids = equivalent_symbols_.back();
+  const UNICHARSET &charset = getUnicharset();
+  // unichar_strings is terminated by a null entry.
+  for (const char **str = unichar_strings; *str != 0; ++str) {
+    const UNICHAR_ID id = charset.unichar_to_id(*str);
+    if (id != INVALID_UNICHAR_ID) ids.push_back(id);  // skip unknown symbols
+  }
+}
+
+// Normalize all hyphen and apostrophes to the canonicalized one for
+// matching; pass everything else through as is.
+UNICHAR_ID Dict::NormalizeUnicharIdForMatch(UNICHAR_ID unichar_id) const {
+  // The first list that holds unichar_id decides; its entry 0 is canonical.
+  for (int list = 0; list < equivalent_symbols_.size(); ++list) {
+    const GenericVectorEqEq<UNICHAR_ID> &equiv = equivalent_symbols_[list];
+    if (equiv.contains(unichar_id)) return equiv[0];
+  }
+  return unichar_id;
+}
+
 // Returns true if in light of the current state unichar_id is allowed
 // according to at least one of the dawgs in the dawgs_ vector.
 // See more extensive comments in dict.h where this function is declared.
 int Dict::def_letter_is_okay(void* void_dawg_args,
                              UNICHAR_ID unichar_id,
-                             bool word_end) {
+                             bool word_end) const {
   DawgArgs *dawg_args = reinterpret_cast<DawgArgs*>(void_dawg_args);
 
   if (dawg_debug_level >= 3) {
@@ -484,7 +554,7 @@ int Dict::def_letter_is_okay(void* void_dawg_args,
 void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info,
                                UNICHAR_ID unichar_id, bool word_end,
                                DawgArgs *dawg_args,
-                               PermuterType *curr_perm) {
+                               PermuterType *curr_perm) const {
   NODE_REF node = GetStartingNode(dawg, info.ref);
   // Try to find the edge corresponding to the exact unichar_id and to all the
   // edges corresponding to the character class of unichar_id.
@@ -572,7 +642,7 @@ void Dict::WriteFixedLengthDawgs(
 // from hyphen_active_dawgs_ instead.
 void Dict::init_active_dawgs(int sought_word_length,
                              DawgInfoVector *active_dawgs,
-                             bool ambigs_mode) {
+                             bool ambigs_mode) const {
   int i;
   if (sought_word_length != kAnyWordLength) {
     // Only search one fixed word length dawg.
@@ -604,7 +674,7 @@ void Dict::init_active_dawgs(int sought_word_length,
 
 // If hyphenated() returns true, copy the entries from hyphen_constraints_
 // into the given constraints vector.
-void Dict::init_constraints(DawgInfoVector *constraints) {
+void Dict::init_constraints(DawgInfoVector *constraints) const {
   if (hyphenated()) {
     *constraints = hyphen_constraints_;
     if (dawg_debug_level >= 3) {
@@ -670,7 +740,7 @@ void Dict::add_document_word(const WERD_CHOICE &best_choice) {
     strcat(filename, ".doc");
     doc_word_file = open_file (filename, "a");
     fprintf(doc_word_file, "%s\n",
-            best_choice.debug_string(getUnicharset()).string());
+            best_choice.debug_string().string());
     fclose(doc_word_file);
   }
   document_words_->add_word_to_dawg(best_choice);
@@ -693,7 +763,7 @@ void Dict::adjust_word(WERD_CHOICE *word,
   float new_rating = word->rating();
   if (debug) {
     tprintf("%sWord: %s %4.2f ", nonword ? "Non-" : "",
-            word->debug_string(getUnicharset()).string(), word->rating());
+            word->debug_string().string(), word->rating());
   }
   new_rating += kRatingPad;
   if (nonword) {  // non-dictionary word
@@ -733,9 +803,9 @@ void Dict::adjust_word(WERD_CHOICE *word,
   LogNewChoice(adjust_factor, certainty_array, false, word);
 }
 
-int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) {
+int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {
   const WERD_CHOICE *word_ptr = &word;
-  WERD_CHOICE temp_word;
+  WERD_CHOICE temp_word(word.unicharset());
   if (hyphenated()) {
     copy_hyphen_info(&temp_word);
     temp_word += word;
@@ -775,10 +845,40 @@ int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) {
     dawg_args.permuter : NO_PERM;
 }
 
+bool Dict::valid_bigram(const WERD_CHOICE &word1,
+                        const WERD_CHOICE &word2) const {
+  if (bigram_dawg_ == NULL) return false;  // No bigram dawg was loaded.
+
+  // Find the core span of each word (punctuation stripped from both ends).
+  int core1_begin, core1_end, core2_begin, core2_end;
+  word1.punct_stripped(&core1_begin, &core1_end);
+  word2.punct_stripped(&core2_begin, &core2_end);
+
+  // The bigram list holds no punctuation, so a word with an empty core
+  // (a lone guillemet, hyphen, etc.) is accepted only if shorter than 3.
+  if (core1_begin >= core1_end) return word1.length() < 3;
+  if (core2_begin >= core2_end) return word2.length() < 3;
+
+  // Build the lookup key "core1 core2", writing every digit as '?'.
+  const UNICHARSET &charset = getUnicharset();
+  STRING key;
+  for (int pos = core1_begin; pos < core1_end; ++pos) {
+    const UNICHAR_ID id = NormalizeUnicharIdForMatch(word1.unichar_id(pos));
+    key += charset.get_isdigit(id) ? "?" : charset.id_to_unichar(id);
+  }
+  key += " ";
+  for (int pos = core2_begin; pos < core2_end; ++pos) {
+    const UNICHAR_ID id = NormalizeUnicharIdForMatch(word2.unichar_id(pos));
+    key += charset.get_isdigit(id) ? "?" : charset.id_to_unichar(id);
+  }
+  WERD_CHOICE bigram(key.string(), charset);
+  return bigram_dawg_->word_in_dawg(bigram);
+}
+
 bool Dict::valid_punctuation(const WERD_CHOICE &word) {
   if (word.length() == 0) return NO_PERM;
   int i;
-  WERD_CHOICE new_word;
+  WERD_CHOICE new_word(word.unicharset());
   int last_index = word.length() - 1;
   int new_len = 0;
   for (i = 0; i <= last_index; ++i) {
diff --git a/dict/dict.h b/dict/dict.h
index 45cafd9c7..9ced54ee0 100644
--- a/dict/dict.h
+++ b/dict/dict.h
@@ -89,16 +89,17 @@ struct DawgArgs {
 
 class Dict {
  public:
-  // Gain factor for ambiguity threshold.
-  static const float kStopperAmbiguityThresholdGain;
-  // Certainty offset for ambiguity threshold.
-  static const float kStopperAmbiguityThresholdOffset;
-
   Dict(Image* image_ptr);
   ~Dict();
+  const Image* getImage() const {  // const overload: read-only Image access
+    return image_ptr_;
+  }
   Image* getImage() {
     return image_ptr_;
   }
+  const UNICHARSET& getUnicharset() const {  // const overload, read-only
+    return getImage()->getCCUtil()->unicharset;
+  }
   UNICHARSET& getUnicharset() {
     return getImage()->getCCUtil()->unicharset;
   }
@@ -114,17 +115,17 @@ class Dict {
   /* hyphen.cpp ************************************************************/
 
   /// Returns true if we've recorded the beginning of a hyphenated word.
-  inline bool hyphenated() { return
+  inline bool hyphenated() const { return
     !last_word_on_line_ && hyphen_word_ && GetMaxFixedLengthDawgIndex() < 0;
   }
   /// Size of the base word (the part on the line before) of a hyphenated word.
-  inline int hyphen_base_size() {
+  inline int hyphen_base_size() const {
     return this->hyphenated() ? hyphen_word_->length() : 0;
   }
   /// If this word is hyphenated copy the base word (the part on
   /// the line before) of a hyphenated word into the given word.
   /// This function assumes that word is not NULL.
-  inline void copy_hyphen_info(WERD_CHOICE *word) {
+  inline void copy_hyphen_info(WERD_CHOICE *word) const {
     if (this->hyphenated()) {
       *word = *hyphen_word_;
       if (hyphen_debug_level) word->print("copy_hyphen_info: ");
@@ -133,19 +134,19 @@ class Dict {
   /// Erase the unichar ids corresponding to the portion of the word
   /// from the previous line. The word is not changed if it is not
   /// split between lines and hyphenated.
-  inline void remove_hyphen_head(WERD_CHOICE *word) {
+  inline void remove_hyphen_head(WERD_CHOICE *word) const {
     if (this->hyphenated()) {
       word->remove_unichar_ids(0, hyphen_word_->length());
       if (hyphen_debug_level) hyphen_word_->print("remove_hyphen_head: ");
     }
   }
   /// Check whether the word has a hyphen at the end.
-  inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) {
+  inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const {
     return (last_word_on_line_ && !first_pos &&
             unichar_id == hyphen_unichar_id_);
   }
   /// Same as above, but check the unichar at the end of the word.
-  inline bool has_hyphen_end(const WERD_CHOICE &word) {
+  inline bool has_hyphen_end(const WERD_CHOICE &word) const {
     int word_index = word.length() - 1;
     return has_hyphen_end(word.unichar_id(word_index), word_index == 0);
   }
@@ -171,12 +172,14 @@ class Dict {
   /// from hyphen_active_dawgs_ instead.
   void init_active_dawgs(int sought_word_length,
                          DawgInfoVector *active_dawgs,
-                         bool ambigs_mode);
+                         bool ambigs_mode) const;
   /// If hyphenated() returns true, copy the entries from hyphen_constraints_
   /// into the given constraints vector.
-  void init_constraints(DawgInfoVector *constraints);
+  void init_constraints(DawgInfoVector *constraints) const;
   /// Returns true if we are operating in ambigs mode.
-  inline bool ambigs_mode(float rating_limit) { return rating_limit <= 0.0; }
+  inline bool ambigs_mode(float rating_limit) {
+    return rating_limit <= 0.0;  // a non-positive limit means ambigs mode
+  }
   /// Recursively explore all the possible character combinations in
   /// the given char_choices. Use go_deeper_dawg_fxn() to explore all the
   /// dawgs in the dawgs_ vector in parallel and discard invalid words.
@@ -316,6 +319,15 @@ class Dict {
                         bool fix_replaceable,
                         BLOB_CHOICE_LIST_VECTOR *Choices,
                         bool *modified_blobs);
+  double StopperAmbigThreshold(double f1, double f2) {
+    return (f2 - f1) * stopper_ambiguity_threshold_gain -  // gain * difference
+        stopper_ambiguity_threshold_offset;                // less the offset
+  }
+  // If the certainty of any chunk in Choice (item1) is not ambiguous with the
+  // corresponding chunk in the best choice (item2), frees Choice and
+  // returns true.
+  int FreeBadChoice(void *item1,   // VIABLE_CHOICE Choice
+                    void *item2);  // EXPANDED_CHOICE *BestChoice
   /// Replaces the corresponding wrong ngram in werd_choice with the correct
   /// one. We indicate that this newly inserted ngram unichar is composed from
   /// several fragments and modify the corresponding entries in blob_choices to
@@ -401,7 +413,7 @@ class Dict {
   /// and Certainties.
   void FillViableChoice(const WERD_CHOICE &WordChoice,
                         FLOAT32 AdjustFactor, const float Certainties[],
-                        bool SameString, VIABLE_CHOICE ViableChoice);
+                        VIABLE_CHOICE ViableChoice);
   /// Returns true if there are no alternative choices for the current word
   /// or if all alternatives have an adjust factor worse than Threshold.
   bool AlternativeChoicesWorseThan(FLOAT32 Threshold);
@@ -467,6 +479,15 @@ class Dict {
       document_words_->clear();
   }
 
+  // Create unicharset adaptations of known, short lists of UTF-8 equivalent
+  // characters (think all hyphen-like symbols).  The first version of the
+  // list is taken as equivalent for matching against the dictionary.
+  void LoadEquivalenceList(const char *unichar_strings[]);
+
+  // Normalize all hyphen and apostrophes to the canonicalized one for
+  // matching; pass everything else through as is.  See LoadEquivalenceList().
+  UNICHAR_ID NormalizeUnicharIdForMatch(UNICHAR_ID unichar_id) const;
+
   /**
    * Returns the maximal permuter code (from ccstruct/ratngs.h) if in light
    * of the current state the letter at word_index in the given word
@@ -531,13 +552,13 @@ class Dict {
 
   //
   int def_letter_is_okay(void* void_dawg_args,
-                         UNICHAR_ID unichar_id, bool word_end);
+                         UNICHAR_ID unichar_id, bool word_end) const;
 
   int (Dict::*letter_is_okay_)(void* void_dawg_args,
-                               UNICHAR_ID unichar_id, bool word_end);
+                               UNICHAR_ID unichar_id, bool word_end) const;
   /// Calls letter_is_okay_ member function.
   int LetterIsOkay(void* void_dawg_args,
-                   UNICHAR_ID unichar_id, bool word_end) {
+                   UNICHAR_ID unichar_id, bool word_end) const {
     return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end);
   }
 
@@ -581,6 +602,8 @@ class Dict {
   inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
   /// Return the points to the punctuation dawg.
   inline const Dawg *GetPuncDawg() const { return punc_dawg_; }
+  /// Return the pointer to the unambiguous words dawg.
+  inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; }
   /// Return the pointer to the Dawg that contains words of length word_length.
   inline const Dawg *GetFixedLengthDawg(int word_length) const {
     if (word_length > max_fixed_length_dawgs_wdlen_) return NULL;
@@ -603,7 +626,7 @@ class Dict {
   /// leading punctuation is found this would ensure that we are not
   /// expecting any particular trailing punctuation after the word).
   inline bool ConstraintsOk(const DawgInfoVector &constraints,
-                            int word_end, DawgType current_dawg_type) {
+                            int word_end, DawgType current_dawg_type) const {
     if (!word_end) return true;
     if (current_dawg_type == DAWG_TYPE_PUNCTUATION) return true;
     for (int c = 0; c < constraints.length(); ++c) {
@@ -627,7 +650,8 @@ class Dict {
   /// edges were found.
   void ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info,
                            UNICHAR_ID unichar_id, bool word_end,
-                           DawgArgs *dawg_args, PermuterType *current_permuter);
+                           DawgArgs *dawg_args,
+                           PermuterType *current_permuter) const;
 
   /// Read/Write/Access special purpose dawgs which contain words
   /// only of a certain length (used for phrase search for
@@ -649,23 +673,25 @@ class Dict {
       int num_dawgs, int debug_level, FILE *output_file);
 
   /// Check all the DAWGs to see if this word is in any of them.
-  inline bool valid_word_permuter(uinT8 perm, bool numbers_ok) {
+  inline static bool valid_word_permuter(uinT8 perm, bool numbers_ok) {
     return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
             perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
             perm == USER_PATTERN_PERM || (numbers_ok && perm == NUMBER_PERM));
   }
-  int valid_word(const WERD_CHOICE &word, bool numbers_ok);
-  int valid_word(const WERD_CHOICE &word) {
+  int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
+  int valid_word(const WERD_CHOICE &word) const {
     return valid_word(word, false);  // return NO_PERM for words with digits
   }
-  int valid_word_or_number(const WERD_CHOICE &word) {
+  int valid_word_or_number(const WERD_CHOICE &word) const {
     return valid_word(word, true);  // return NUMBER_PERM for valid numbers
   }
   /// This function is used by api/tesseract_cube_combiner.cpp
-  int valid_word(const char *string) {
+  int valid_word(const char *string) const {
     WERD_CHOICE word(string, getUnicharset());
     return valid_word(word);
   }
+  // Do the two WERD_CHOICEs form a meaningful bigram?
+  bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
   /// Returns true if the word contains a valid punctuation pattern.
   /// Note: Since the domains of punctuation symbols and symblos
   /// used in numbers are not disjoint, a valid number might contain
@@ -691,6 +717,8 @@ class Dict {
   inline void SetWordsegRatingAdjustFactor(float f) {
     wordseg_rating_adjust_factor_ = f;
   }
+  // Read-only accessor: returns best_choices_ by const reference.
+  const LIST &getBestChoices() { return best_choices_; }
 
  private:
   /** Private member variables. */
@@ -723,15 +751,27 @@ class Dict {
   DawgInfoVector hyphen_active_dawgs_;
   DawgInfoVector hyphen_constraints_;
   bool last_word_on_line_;
+  // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
+  // matching.  The first member of each list is taken as canonical.  For
+  // example, the first list contains hyphens and dashes with the first symbol
+  // being the ASCII hyphen minus.
+  GenericVector<GenericVectorEqEq<UNICHAR_ID> > equivalent_symbols_;
   // Dawgs.
   DawgVector dawgs_;
   SuccessorListsVector successors_;
   Trie *pending_words_;
+  // bigram_dawg_ points to a dawg of two-word bigrams which always supersede if
+  // any of them are present on the best choices list for a word pair.
+  // The bigrams are stored as space-separated words where:
+  // (1) leading and trailing punctuation has been removed from each word and
+  // (2) any digits have been replaced with '?' marks.
+  Dawg *bigram_dawg_;
   /// The following pointers are only cached for convenience.
   /// The dawgs will be deleted when dawgs_ vector is destroyed.
   // TODO(daria): need to support multiple languages in the future,
   // so maybe will need to maintain a list of dawgs of each kind.
   Dawg *freq_dawg_;
+  Dawg *unambig_dawg_;
   Dawg *punc_dawg_;
   Trie *document_words_;
   /// Maximum word length of fixed-length word dawgs.
@@ -740,6 +780,8 @@ class Dict {
   /// Current segmentation cost adjust factor for word rating.
   /// See comments in incorporate_segcost.
   float wordseg_rating_adjust_factor_;
+  // File for recording ambiguities discovered during dictionary search.
+  FILE *output_ambig_words_file_;
 
  public:
   /// Variable members.
@@ -750,11 +792,14 @@ class Dict {
                "A list of user-provided patterns.");
   BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
   BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
+  BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg.");
   BOOL_VAR_H(load_punc_dawg, true,
              "Load dawg with punctuation patterns.");
   BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns.");
   BOOL_VAR_H(load_fixed_length_dawgs, true,  "Load fixed length"
              " dawgs (e.g. for non-space delimited languages)");
+  BOOL_VAR_H(load_bigram_dawg, false,
+             "Load dawg with special word bigrams.");
   double_VAR_H(segment_penalty_dict_frequent_word, 1.0,
                "Score multiplier for word matches which have good case and"
                "are frequent in the given language (lower is better).");
@@ -779,6 +824,8 @@ class Dict {
                "Score multiplier for poorly cased strings that are not in"
                " the dictionary and generally look like garbage (lower is"
                " better).");
+  STRING_VAR_H(output_ambig_words_file, "",
+               "Output file for ambiguities found in the dictionary");
   INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info"
             ", to 2 for more details, to 3 to see all the debug messages");
   INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");
@@ -801,6 +848,10 @@ class Dict {
   BOOL_VAR_H(stopper_no_acceptable_choices, false,
              "Make AcceptableChoice() always return false. Useful"
              " when there is a need to explore all segmentations");
+  double_VAR_H(stopper_ambiguity_threshold_gain, 8.0,
+               "Gain factor for ambiguity threshold.");
+  double_VAR_H(stopper_ambiguity_threshold_offset, 1.5,
+               "Certainty offset for ambiguity threshold.");
   BOOL_VAR_H(save_raw_choices, false, "Save all explored raw choices");
   INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
   STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information"
@@ -816,6 +867,10 @@ class Dict {
              "Turn on word script consistency permuter");
   BOOL_VAR_H(segment_segcost_rating, 0,
              "incorporate segmentation cost in word rating?");
+  BOOL_VAR_H(segment_nonalphabetic_script, false,
+             "Don't use any alphabetic-specific tricks. "
+             "Set to true in the traineddata config file for"
+             " scripts that are cursive or inherently fixed-pitch");
   double_VAR_H(segment_reward_script, 0.95,
                "Score multipler for script consistency within a word. "
                "Being a 'reward' factor, it should be <= 1. "
diff --git a/dict/hyphen.cpp b/dict/hyphen.cpp
index 6b5d5fba0..1f39afdc5 100644
--- a/dict/hyphen.cpp
+++ b/dict/hyphen.cpp
@@ -51,7 +51,7 @@ void Dict::set_hyphen_word(const WERD_CHOICE &word,
                            const DawgInfoVector &active_dawgs,
                            const DawgInfoVector &constraints) {
   if (hyphen_word_ == NULL) {
-    hyphen_word_ = new WERD_CHOICE();
+    hyphen_word_ = new WERD_CHOICE(word.unicharset());
     hyphen_word_->make_bad();
   }
   if (hyphen_word_->rating() > word.rating()) {
diff --git a/dict/matchdefs.h b/dict/matchdefs.h
index c2b321fe5..bfab1e6be 100644
--- a/dict/matchdefs.h
+++ b/dict/matchdefs.h
@@ -28,7 +28,7 @@
 /* define the maximum number of classes defined for any matcher
   and the maximum class id for any matcher. This must be changed
   if more different classes need to be classified */
-#define MAX_NUM_CLASSES   8192
+#define MAX_NUM_CLASSES   12288
 #define MAX_CLASS_ID    (MAX_NUM_CLASSES - 1)
 
 /** a CLASS_ID is the ascii character to be associated with a class */
diff --git a/dict/permdawg.cpp b/dict/permdawg.cpp
index 1852c2e0d..20fb5792f 100644
--- a/dict/permdawg.cpp
+++ b/dict/permdawg.cpp
@@ -86,7 +86,7 @@ void Dict::go_deeper_dawg_fxn(
         if (permute_debug && dawg_debug_level) {
           tprintf("early pruned word rating=%4.2f,"
                   " permdawg_limit=%4.2f, word=%s\n", word->rating(),
-                  permdawg_limit, word->debug_string(getUnicharset()).string());
+                  permdawg_limit, word->debug_string().string());
         }
         return;
       }
@@ -106,8 +106,7 @@ void Dict::go_deeper_dawg_fxn(
     }
     if (clean_active_dawgs.size() > 0) {
       if (permute_debug && dawg_debug_level)
-        tprintf("new hyphen choice = %s\n",
-                word->debug_string(getUnicharset()).string());
+        tprintf("new hyphen choice = %s\n", word->debug_string().string());
       word->set_permuter(more_args->permuter);
       adjust_word(word, certainties, permute_debug);
       set_hyphen_word(*word, *(more_args->active_dawgs),
@@ -190,11 +189,26 @@ void Dict::go_deeper_dawg_fxn(
       // Add a new word choice
       if (word_ending) {
         if (permute_debug && dawg_debug_level) {
-          tprintf("found word = %s\n",
-                  word->debug_string(getUnicharset()).string());
+          tprintf("found word = %s\n", word->debug_string().string());
+        }
+        if (ambigs_mode(*limit) &&
+            strcmp(output_ambig_words_file.string(), "") != 0) {
+          if (output_ambig_words_file_ == NULL) {
+            output_ambig_words_file_ =
+                fopen(output_ambig_words_file.string(), "w+");  // First use.
+            if (output_ambig_words_file_ == NULL) {
+              tprintf("Failed to open output_ambig_words_file %s\n",
+                      output_ambig_words_file.string());
+              exit(1);
+            }
+          }
+          STRING word_str;
+          word->string_and_lengths(&word_str, NULL);
+          // Literal format: the word may contain '%' and must print verbatim.
+          fprintf(output_ambig_words_file_, "%s ", word_str.string());
+        }
         WERD_CHOICE *adjusted_word = word;
-        WERD_CHOICE hyphen_tail_word;
+        WERD_CHOICE hyphen_tail_word(&getUnicharset());
         if (hyphen_base_size() > 0) {
           hyphen_tail_word = *word;
           remove_hyphen_head(&hyphen_tail_word);
@@ -226,7 +240,7 @@ void Dict::go_deeper_dawg_fxn(
     } else {
       if (permute_debug && dawg_debug_level) {
         tprintf("last unichar not OK at index %d in %s\n",
-                word_index, word->debug_string(getUnicharset()).string());
+                word_index, word->debug_string().string());
       }
     }
   }
@@ -249,7 +263,7 @@ void Dict::go_deeper_dawg_fxn(
 WERD_CHOICE *Dict::dawg_permute_and_select(
     const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit,
     int sought_word_length, int start_char_choice_index) {
-  WERD_CHOICE *best_choice = new WERD_CHOICE();
+  WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
   best_choice->make_bad();
   best_choice->set_rating(rating_limit);
   if (char_choices.length() == 0) return best_choice;
@@ -272,7 +286,7 @@ WERD_CHOICE *Dict::dawg_permute_and_select(
                      (segment_penalty_dict_case_bad /
                       segment_penalty_dict_case_ok),
                      NO_PERM, sought_word_length, end_char_choice_index);
-  WERD_CHOICE word(MAX_WERD_LENGTH);
+  WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH);
   copy_hyphen_info(&word);
   // Discard rating and certainty of the hyphen base (if any).
   word.set_rating(0.0);
diff --git a/dict/permute.cpp b/dict/permute.cpp
index 2fa0cce93..e41116698 100644
--- a/dict/permute.cpp
+++ b/dict/permute.cpp
@@ -126,12 +126,13 @@ int find_choice_by_uid(BLOB_CHOICE_LIST *blob_list, UNICHAR_ID target_uid) {
  * 1st choice of char 3, 2nd choice of char 4, 3rd choice of char 5, 2nd choice
  * of char 6.  If n > number of choice, the closest (last) one is used.
  */
-WERD_CHOICE* get_choice_from_posstr(const BLOB_CHOICE_LIST_VECTOR &char_choices,
+WERD_CHOICE* get_choice_from_posstr(const UNICHARSET *unicharset,
+                                    const BLOB_CHOICE_LIST_VECTOR &char_choices,
                                     int start_pos,
                                     const char* pos_str,
                                     float *certainties) {
   int pos_str_len = strlen(pos_str);
-  WERD_CHOICE* wchoice = new WERD_CHOICE();
+  WERD_CHOICE* wchoice = new WERD_CHOICE(unicharset);
   if (start_pos + pos_str_len > char_choices.length()) {
     wchoice->make_bad();
     return wchoice;
@@ -228,6 +229,7 @@ BLOB_CHOICE* find_choice_by_script(
 
 
 PermuterState::PermuterState() {
+  unicharset_ = NULL;
   char_choices_ = NULL;
   adjust_factor_ = 1.0f;
   allow_collision_ = false;
@@ -240,6 +242,7 @@ void PermuterState::Init(const BLOB_CHOICE_LIST_VECTOR& char_choices,
                          float default_bias,
                          bool debug) {
   ASSERT_HOST(char_choices.length() < MAX_PERM_LENGTH);
+  unicharset_ = &unicharset;
   char_choices_ = &char_choices;
   word_length_ = char_choices.length();
   for (int i = 0; i < word_length_; ++i)
@@ -300,9 +303,8 @@ void PermuterState::AddPreference(int char_pos, BLOB_CHOICE* blob_choice,
 WERD_CHOICE* PermuterState::GetPermutedWord(float *certainties,
                                             float *adjust_factor) {
   ASSERT_HOST(char_choices_ != NULL);
-  WERD_CHOICE *word_choice = get_choice_from_posstr(*char_choices_,
-                                                    0, perm_state_,
-                                                    certainties);
+  WERD_CHOICE *word_choice = get_choice_from_posstr(
+      unicharset_, *char_choices_, 0, perm_state_, certainties);
   float rating = word_choice->rating() * adjust_factor_;
   word_choice->set_rating(rating);
   *adjust_factor = adjust_factor_;
@@ -431,7 +433,8 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
   if (permute_debug)
     print_char_choices_list("\n\nPermute FixedLength Word",
                             char_choices, getUnicharset(), false);
-  WERD_CHOICE* best_choice = new WERD_CHOICE(char_choices.length());
+  WERD_CHOICE* best_choice =
+      new WERD_CHOICE(&getUnicharset(), char_choices.length());
   const int max_dict_len = max_fixed_length_dawgs_wdlen_;
   const int min_dict_len = 2;
   char posstr[256];
@@ -461,7 +464,7 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
      }
 
      if (part_choice && step > 1) {   // found lexicon match
-       part_choice->populate_unichars(getUnicharset());
+       part_choice->populate_unichars();
        get_posstr_from_choice(char_choices, part_choice, anchor_pos, posstr);
        float adjust_factor = pow(0.95, 1.0 + step*2.0/char_choices.length());
        if (permuter_state)
@@ -472,8 +475,8 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
                  part_choice->unichar_string().string());
      } else {     // no lexicon match
        step = 1;
-       part_choice =
-         get_choice_from_posstr(char_choices, anchor_pos, "0", NULL);
+       part_choice = get_choice_from_posstr(&getUnicharset(), char_choices,
+                                            anchor_pos, "0", NULL);
        if (permute_debug)
          tprintf("Single char %d %s\n", anchor_pos,
                  part_choice->unichar_string().string());
@@ -493,7 +496,7 @@ WERD_CHOICE* Dict::permute_fixed_length_words(
               best_choice->rating(), match_score, adjusted_score);
     best_choice->set_rating(adjusted_score);
   }
-  best_choice->populate_unichars(getUnicharset());
+  best_choice->populate_unichars();
   if (permute_debug)
     tprintf("Found Best CJK word %f: %s\n",
             best_choice->rating(), best_choice->unichar_string().string());
@@ -554,11 +557,12 @@ WERD_CHOICE* Dict::permute_chartype_words(
     print_char_choices_list("", char_choices, getUnicharset(), true);
   }
 
-  WERD_CHOICE *current_word = new WERD_CHOICE();
+  WERD_CHOICE *current_word = new WERD_CHOICE(&getUnicharset());
   BLOB_CHOICE_IT blob_choice_it;
   const UNICHARSET& unicharset = getUnicharset();
   bool replaced = false;        // has any character choice been replaced
   int prev_unambig_type = 0;    // the last chartype of an unambiguous char
+  float certainties[MAX_PERM_LENGTH + 1];
   for (int x = 0; x < char_choices.length(); ++x) {
     BLOB_CHOICE_LIST* pos_choice = char_choices.get(x);
     UNICHAR_ID unichar_id = get_top_choice_uid(pos_choice);
@@ -640,12 +644,12 @@ WERD_CHOICE* Dict::permute_chartype_words(
     current_word->append_unichar_id(first_choice->unichar_id(), 1,
                                     first_choice->rating(),
                                     first_choice->certainty());
+    certainties[x] = first_choice->certainty();
   }
   // All permuter choices should go through adjust_non_word so the choice
   // rating would be adjusted on the same scale.
-  float certainties[MAX_PERM_LENGTH + 1];
   adjust_non_word(current_word, certainties, permute_debug);
-  current_word->populate_unichars(unicharset);
+  current_word->populate_unichars();
   if (replaced) {
     // Apply a reward multiplier on rating if an chartype permutation is made.
     float rating = current_word->rating();
@@ -682,10 +686,11 @@ WERD_CHOICE* Dict::permute_script_words(
                             permute_debug > 1);
   }
 
-  WERD_CHOICE *current_word = new WERD_CHOICE();
+  WERD_CHOICE *current_word = new WERD_CHOICE(&getUnicharset());
   BLOB_CHOICE_IT blob_choice_it;
   bool replaced = false;
   bool prev_is_consistent = false;
+  float certainties[MAX_PERM_LENGTH + 1];
   for (int x = 0; x < char_choices.length(); ++x) {
     blob_choice_it.set_to_list(char_choices.get(x));
     BLOB_CHOICE *first_choice = blob_choice_it.data();
@@ -737,13 +742,13 @@ WERD_CHOICE* Dict::permute_script_words(
     current_word->append_unichar_id(first_choice->unichar_id(), 1,
                                     first_choice->rating(),
                                     first_choice->certainty());
+    certainties[x] = first_choice->certainty();
     prev_is_consistent = sid_consistent;
   }
   // All permuter choices should go through adjust_non_word so the choice
   // rating would be adjusted on the same scale.
-  float certainties[MAX_PERM_LENGTH + 1];
   adjust_non_word(current_word, certainties, permute_debug);
-  current_word->populate_unichars(getUnicharset());
+  current_word->populate_unichars();
   if (replaced) {
     // Apply a reward multiplier on rating if an script permutation is made.
     float rating = current_word->rating();
@@ -780,19 +785,19 @@ bool Dict::permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
     // Populate unichars_ and unichar_lengths_ of raw_choice. This is
     // needed for various components that still work with unichars rather
     // than unichar ids (e.g. LearnWord).
-    raw_choice->populate_unichars(getUnicharset());
+    raw_choice->populate_unichars();
   }
   if (this_choice && this_choice->rating() < best_choice->rating()) {
     *best_choice = *this_choice;
     // Populate unichars_ and unichar_lengths_ of best_choice. This is
     // needed for various components that still work with unichars rather
     // than unichar ids (dawg, *_ok functions, various hard-coded hacks).
-    best_choice->populate_unichars(getUnicharset());
+    best_choice->populate_unichars();
 
     if (permute_debug) {
       best_choice->print("\n**** Populate BestChoice");
       cprintf("populate best_choice\n\t%s\n",
-              best_choice->debug_string(getUnicharset()).string());
+              best_choice->debug_string().string());
     }
     delete this_choice;
     return true;
@@ -811,13 +816,13 @@ WERD_CHOICE *Dict::permute_compound_words(
     float rating_limit) {
   BLOB_CHOICE *first_choice;
   WERD_CHOICE *best_choice = NULL;
-  WERD_CHOICE current_word(MAX_WERD_LENGTH);
+  WERD_CHOICE current_word(&getUnicharset(), MAX_WERD_LENGTH);
   int first_index = 0;
   int x;
   BLOB_CHOICE_IT blob_choice_it;
 
   if (char_choices.length() > MAX_WERD_LENGTH) {
-    WERD_CHOICE *bad_word_choice = new WERD_CHOICE();
+    WERD_CHOICE *bad_word_choice = new WERD_CHOICE(&getUnicharset());
     bad_word_choice->make_bad();
     return bad_word_choice;
   }
@@ -874,7 +879,7 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
   int x;
   BLOB_CHOICE_LIST_VECTOR subchoices;
   WERD_CHOICE *best_choice = NULL;
-  WERD_CHOICE raw_choice;
+  WERD_CHOICE raw_choice(&getUnicharset());
   raw_choice.make_bad();
 
   DisableChoiceAccum();
@@ -886,7 +891,7 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
   }
 
   if (!subchoices.empty()) {
-    WERD_CHOICE initial_choice;
+    WERD_CHOICE initial_choice(&getUnicharset());
     initial_choice.make_bad();
     initial_choice.set_rating(rating_limit);
 
@@ -906,10 +911,10 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
 
   if (segment_debug && current_word->rating() < MAX_FLOAT32) {
     cprintf ("Subword permuted = %s, %5.2f, %5.2f\n\n",
-             current_word->debug_string(getUnicharset()).string(),
+             current_word->debug_string().string(),
              current_word->rating(), current_word->certainty());
   }
-  current_word->populate_unichars(getUnicharset());
+  current_word->populate_unichars();
 
   EnableChoiceAccum();
 }
@@ -919,7 +924,7 @@ void Dict::permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
  */
 WERD_CHOICE *Dict::get_top_choice_word(
     const BLOB_CHOICE_LIST_VECTOR &char_choices) {
-  WERD_CHOICE *top_word = new WERD_CHOICE(MAX_PERM_LENGTH);
+  WERD_CHOICE *top_word = new WERD_CHOICE(&getUnicharset(), MAX_PERM_LENGTH);
   float certainties[MAX_PERM_LENGTH];
   top_word->set_permuter(TOP_CHOICE_PERM);
   for (int x = 0; x < char_choices.length(); x++) {
@@ -956,11 +961,11 @@ WERD_CHOICE *Dict::permute_top_choice(
   const char *next_char = "";         //next in word
   const char *next_next_char = "";    //after next next in word
 
-  WERD_CHOICE word(MAX_PERM_LENGTH);
+  WERD_CHOICE word(&getUnicharset(), MAX_PERM_LENGTH);
   word.set_permuter(TOP_CHOICE_PERM);
-  WERD_CHOICE capital_word(MAX_PERM_LENGTH);
+  WERD_CHOICE capital_word(&getUnicharset(), MAX_PERM_LENGTH);
   capital_word.set_permuter(UPPER_CASE_PERM);
-  WERD_CHOICE lower_word(MAX_PERM_LENGTH);
+  WERD_CHOICE lower_word(&getUnicharset(), MAX_PERM_LENGTH);
   lower_word.set_permuter(LOWER_CASE_PERM);
 
   int x;
@@ -1023,7 +1028,7 @@ WERD_CHOICE *Dict::permute_top_choice(
     if (first_choice == NULL) {
       cprintf("Permuter found only fragments for"
               " character at position %d; word=%s\n",
-              x, word.debug_string(getUnicharset()).string());
+              x, word.debug_string().string());
     }
     ASSERT_HOST(first_choice != NULL);
 
@@ -1132,7 +1137,7 @@ WERD_CHOICE *Dict::permute_top_choice(
     }
   }
 
-  if (word.rating() < raw_choice->rating()) {
+  if (raw_choice != NULL && word.rating() < raw_choice->rating()) {
     *raw_choice = word;
     LogNewChoice(1.0, certainties, true, raw_choice);
   }
@@ -1423,9 +1428,9 @@ WERD_CHOICE *Dict::top_fragments_permute_and_select(
     frag_char_choices += frag_choices;
   }
 
-  WERD_CHOICE *best_choice = new WERD_CHOICE();
+  WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
   best_choice->make_bad();
-  WERD_CHOICE word(MAX_PERM_LENGTH);
+  WERD_CHOICE word(&getUnicharset(), MAX_PERM_LENGTH);
   word.set_permuter(TOP_CHOICE_PERM);
   float certainties[MAX_PERM_LENGTH];
   this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_top_fragments_fxn;
@@ -1459,7 +1464,7 @@ void Dict::permute_choices(
     tprintf("%s permute_choices: char_choice_index=%d"
             " limit=%g rating=%g, certainty=%g word=%s\n",
             debug, char_choice_index, *limit, word->rating(),
-            word->certainty(), word->debug_string(getUnicharset()).string());
+            word->certainty(), word->debug_string().string());
   }
   if (char_choice_index < char_choices.length()) {
     BLOB_CHOICE_IT blob_choice_it;
@@ -1554,7 +1559,7 @@ void Dict::go_deeper_top_fragments_fxn(
     if (word_ending) {
       if (fragments_debug > 1) {
         tprintf("fragments_debug new choice = %s\n",
-                word->debug_string(getUnicharset()).string());
+                word->debug_string().string());
       }
       *limit = word->rating();
       adjust_non_word(word, certainties, permute_debug);
@@ -1567,8 +1572,7 @@ void Dict::go_deeper_top_fragments_fxn(
   } else {
     if (fragments_debug > 1) {
       tprintf("fragments_debug pruned word (%s, rating=%4.2f, limit=%4.2f)\n",
-              word->debug_string(getUnicharset()).string(),
-              word->rating(), *limit);
+              word->debug_string().string(), word->rating(), *limit);
     }
   }
 }
diff --git a/dict/permute.h b/dict/permute.h
index ca66d6748..f7ff6cad9 100644
--- a/dict/permute.h
+++ b/dict/permute.h
@@ -133,6 +133,8 @@ class PermuterState {
  private:
   static const char kPosFree = '.';
 
+  const UNICHARSET *unicharset_;
+
   const BLOB_CHOICE_LIST_VECTOR *char_choices_;   // reference pointer only
                             // does not need to be allocated or freed
   char perm_state_[MAX_PERM_LENGTH];   // handles upto MAX_PERM_LENGTH-1 states
diff --git a/dict/states.cpp b/dict/states.cpp
index 35a06477e..0a5393f80 100644
--- a/dict/states.cpp
+++ b/dict/states.cpp
@@ -241,6 +241,19 @@ void print_state(const char *label, STATE *state, int num_joints) {
   new_line();
 }
 
+// Prints out the number of fragments in each segment in a state to
+// toappend.
+void print_state(STATE *state, int num_joints, STRING *toappend) {
+  PIECES_STATE pieces;
+  bin_to_pieces(state, num_joints, pieces);
+  // Every entry after the first is preceded by a single space; the walk
+  // stops at the first non-positive entry of pieces.
+  const char *separator = "";
+  for (int i = 0; pieces[i] > 0; i++) {
+    toappend->add_str_int(separator, pieces[i]);
+    separator = " ";
+  }
+}
 
 /**
  * set_n_ones
diff --git a/dict/states.h b/dict/states.h
index a478c39ba..ef0640171 100644
--- a/dict/states.h
+++ b/dict/states.h
@@ -29,6 +29,7 @@
               I n c l u d e s
 ----------------------------------------------------------------------*/
 #include "host.h"
+#include "strngs.h"
 
 /*----------------------------------------------------------------------
               T y p e s
@@ -64,6 +65,8 @@ int ones_in_state(STATE *state, int num_joints);
 
 void print_state(const char *label, STATE *state, int num_joints);
 
+void print_state(STATE *state, int num_joints, STRING *toappend);
+
 void set_n_ones(STATE *state, int n);
 
 extern void free_state(STATE *);
diff --git a/dict/stopper.cpp b/dict/stopper.cpp
index 319da6712..01d99f09d 100644
--- a/dict/stopper.cpp
+++ b/dict/stopper.cpp
@@ -17,13 +17,11 @@
  ******************************************************************************/
 
 #include "stopper.h"
-#include "emalloc.h"
 #include "matchdefs.h"
 #include "callcpp.h"
 #include "permute.h"
 #include "danerror.h"
 #include "const.h"
-#include "freelist.h"
 #include "efio.h"
 #include "scanutils.h"
 #include "unichar.h"
@@ -58,6 +56,10 @@ typedef struct
   UNICHAR_ID ChunkClass[MAX_NUM_CHUNKS];
 } EXPANDED_CHOICE;
 
+void DeleteViableChoiceStruct(void *vcs) {  // Callback for destroy_nodes().
+  delete (static_cast<VIABLE_CHOICE_STRUCT *>(vcs));
+}
+
 #define BestCertainty(Choices) \
   (((VIABLE_CHOICE) first_node (Choices))->Certainty)
 
@@ -66,10 +68,6 @@ typedef struct
 #define BestFactor(Choices) \
   (((VIABLE_CHOICE) first_node (Choices))->AdjustFactor)
 
-#define AmbigThreshold(F1,F2) \
-  (((F2) - (F1)) * tesseract::Dict::kStopperAmbiguityThresholdGain - \
-   tesseract::Dict::kStopperAmbiguityThresholdOffset)
-
 /**----------------------------------------------------------------------------
               Private Code
 ----------------------------------------------------------------------------**/
@@ -100,23 +98,72 @@ static void ExpandChoice(VIABLE_CHOICE Choice,
   }
 }
 
+VIABLE_CHOICE_STRUCT::VIABLE_CHOICE_STRUCT(int length)
+    : Length(length) {
+  Blob = new CHAR_CHOICE[length];  // One entry per character of the choice.
+  segmentation_state = new uinT8[length];  // Chunk count per character.
+}
+
+VIABLE_CHOICE_STRUCT::VIABLE_CHOICE_STRUCT() : Length(0) {
+  Blob = NULL;  // Empty choice: no arrays are allocated.
+  segmentation_state = NULL;
+}
+
+VIABLE_CHOICE_STRUCT::~VIABLE_CHOICE_STRUCT() {
+  delete []Blob;  // Constructors set these via new[] or to NULL.
+  delete []segmentation_state;
+}
+
+void VIABLE_CHOICE_STRUCT::Init(
+    const WERD_CHOICE &word_choice,
+    const PIECES_STATE &pieces_state,
+    const float certainties[],
+    FLOAT32 adjust_factor) {
+  Rating = word_choice.rating();
+  Certainty = word_choice.certainty();
+  AdjustFactor = adjust_factor;
+  ComposedFromCharFragments = false;
+  ASSERT_HOST(Length == word_choice.length());
+
+  // A multi-fragment character consumes one pieces_state entry per fragment.
+  for (int ch = 0, piece = 0; ch < word_choice.length(); ++ch, ++piece) {
+    CHAR_CHOICE &entry = Blob[ch];
+    entry.Class = word_choice.unichar_id(ch);
+    entry.NumChunks = pieces_state[piece];
+    entry.Certainty = certainties[ch];
+    for (int f = 1; f < word_choice.fragment_length(ch); ++f) {
+      const int blob_width = pieces_state[++piece];
+      assert(blob_width > 0);
+      entry.NumChunks += blob_width;
+      ComposedFromCharFragments = true;
+    }
+    segmentation_state[ch] = entry.NumChunks;
+  }
+}
+
+
+namespace tesseract {
+
 // If the certainty of any chunk in Choice (item1) is not ambiguous with the
 // corresponding chunk in the best choice (item2), frees Choice and
 // returns true.
-static int FreeBadChoice(void *item1,    // VIABLE_CHOICE Choice,
-                         void *item2) {  // EXPANDED_CHOICE *BestChoice
+int Dict::FreeBadChoice(
+    void *item1,    // VIABLE_CHOICE Choice,
+    void *item2) {  // EXPANDED_CHOICE *BestChoice
   int i, j, Chunk;
   FLOAT32 Threshold;
   VIABLE_CHOICE Choice = reinterpret_cast<VIABLE_CHOICE>(item1);
   EXPANDED_CHOICE *BestChoice = reinterpret_cast<EXPANDED_CHOICE *>(item2);
-  Threshold = AmbigThreshold(BestChoice->Choice->AdjustFactor,
-                             Choice->AdjustFactor);
+  Threshold = StopperAmbigThreshold(BestChoice->Choice->AdjustFactor,
+                                    Choice->AdjustFactor);
   for (i = 0, Chunk = 0; i < Choice->Length; i++) {
-    for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++){
+    for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) {
       if (Choice->Blob[i].Class != BestChoice->ChunkClass[Chunk] &&
           Choice->Blob[i].Certainty - BestChoice->ChunkCertainty[Chunk] <
           Threshold) {
-        memfree(Choice);
+        if (stopper_debug_level >= 2)
+          PrintViableChoice(stderr, "\nDiscarding bad choice:  ", Choice);
+        delete Choice;
         return true;
       }
     }
@@ -124,11 +171,6 @@ static int FreeBadChoice(void *item1,    // VIABLE_CHOICE Choice,
   return false;
 }
 
-namespace tesseract {
-
-const float Dict::kStopperAmbiguityThresholdGain = 8.0;
-const float Dict::kStopperAmbiguityThresholdOffset = 1.5;
-
 bool Dict::AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices,
                             WERD_CHOICE *BestChoice,
                             DANGERR *fixpt,
@@ -158,7 +200,7 @@ bool Dict::AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices,
 
   if (stopper_debug_level >= 1)
     tprintf("\nStopper:  %s (word=%c, case=%c)\n",
-            BestChoice->debug_string(getUnicharset()).string(),
+            BestChoice->debug_string().string(),
             (is_valid_word ? 'y' : 'n'),
             (is_case_ok ? 'y' : 'n'));
 
@@ -198,7 +240,7 @@ bool Dict::AcceptableResult(const WERD_CHOICE &BestChoice) {
 
   if (stopper_debug_level >= 1) {
     tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c)\n",
-            BestChoice.debug_string(getUnicharset()).string(),
+            BestChoice.debug_string().string(),
             (valid_word(BestChoice) ? 'y' : 'n'),
             (case_ok(BestChoice, getUnicharset()) ? 'y' : 'n'),
             ((list_rest (best_choices_) != NIL_LIST) ? 'n' : 'y'));
@@ -320,10 +362,16 @@ void Dict::FilterWordChoices() {
     return;
 
   // Compute certainties and class for each chunk in best choice.
-  ExpandChoice((VIABLE_CHOICE_STRUCT *)first_node(best_choices_), &BestChoice);
-
-  set_rest (best_choices_, delete_d(list_rest (best_choices_),
-                                    &BestChoice, FreeBadChoice));
+  VIABLE_CHOICE_STRUCT *best_choice =
+      (VIABLE_CHOICE_STRUCT *)first_node(best_choices_);
+  ExpandChoice(best_choice, &BestChoice);
+  if (stopper_debug_level >= 2)
+    PrintViableChoice(stderr, "\nFiltering against best choice: ", best_choice);
+  TessResultCallback2<int, void*, void*>* is_bad =
+      NewPermanentTessCallback(this, &Dict::FreeBadChoice);
+  set_rest(best_choices_, delete_d(list_rest(best_choices_),
+                                   &BestChoice, is_bad));
+  delete is_bad;
 }
 
 void Dict::FindClassifierErrors(FLOAT32 MinRating,
@@ -371,15 +419,15 @@ void Dict::InitChoiceAccum() {
   BLOB_WIDTH *BlobWidth, *End;
 
   if (best_raw_choice_)
-    memfree(best_raw_choice_);
+    delete best_raw_choice_;
   best_raw_choice_ = NULL;
 
   if (best_choices_)
-    destroy_nodes(best_choices_, memfree);
+    destroy_nodes(best_choices_, DeleteViableChoiceStruct);
   best_choices_ = NIL_LIST;
 
   if (raw_choices_)
-    destroy_nodes(raw_choices_, memfree);
+    destroy_nodes(raw_choices_, DeleteViableChoiceStruct);
   raw_choices_ = NIL_LIST;
 
   EnableChoiceAccum();
@@ -391,7 +439,7 @@ void Dict::InitChoiceAccum() {
 }
 
 void Dict::ClearBestChoiceAccum() {
-  if (best_choices_) destroy_nodes(best_choices_, memfree);
+  if (best_choices_) destroy_nodes(best_choices_, DeleteViableChoiceStruct);
   best_choices_ = NIL_LIST;
 }
 
@@ -420,7 +468,6 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
                         const float Certainties[],
                         bool raw_choice,
                         WERD_CHOICE *WordChoice) {
-  VIABLE_CHOICE NewChoice;
   LIST ChoicesList;
   LIST Choices;
   FLOAT32 Threshold;
@@ -429,14 +476,15 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
     return;
 
   if (raw_choice) {
-    if (!best_raw_choice_)
-      best_raw_choice_ = NewViableChoice(*WordChoice, AdjustFactor, Certainties);
-    else if (WordChoice->rating() < best_raw_choice_->Rating) {
-      if (ChoiceSameAs(*WordChoice, best_raw_choice_))
-        FillViableChoice(*WordChoice, AdjustFactor, Certainties, true,
+    if (!best_raw_choice_) {
+      best_raw_choice_ =
+          NewViableChoice(*WordChoice, AdjustFactor, Certainties);
+    } else if (WordChoice->rating() < best_raw_choice_->Rating) {
+      if (ChoiceSameAs(*WordChoice, best_raw_choice_)) {
+        FillViableChoice(*WordChoice, AdjustFactor, Certainties,
                          best_raw_choice_);
-      else {
-        memfree(best_raw_choice_);
+      } else {
+        delete best_raw_choice_;
         best_raw_choice_ =
           NewViableChoice(*WordChoice, AdjustFactor, Certainties);
       }
@@ -449,16 +497,20 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
 
   // Throw out obviously bad choices to save some work.
   if (ChoicesList != NIL_LIST) {
-    Threshold = AmbigThreshold (BestFactor (ChoicesList), AdjustFactor);
-    if (Threshold > -kStopperAmbiguityThresholdOffset)
-      Threshold = -kStopperAmbiguityThresholdOffset;
+    Threshold = StopperAmbigThreshold(BestFactor(ChoicesList), AdjustFactor);
+    if (Threshold > -stopper_ambiguity_threshold_offset)
+      Threshold = -stopper_ambiguity_threshold_offset;
     if (WordChoice->certainty() - BestCertainty (ChoicesList) < Threshold) {
       // Set the rating of the word to be terrible, so that it does not
       // get chosen as the best choice.
       if (stopper_debug_level >= 2) {
-        tprintf("Discarding a choice with an overly low certainty"
-                " %.4f vs best choice certainty %.4f\n",
-                WordChoice->certainty(), BestCertainty(ChoicesList));
+        STRING bad_string;
+        WordChoice->string_and_lengths(&bad_string, NULL);
+        tprintf("Discarding choice \"%s\" with an overly low certainty"
+                " %.4f vs best choice certainty %.4f (Threshold: %.4f)\n",
+                bad_string.string(), WordChoice->certainty(),
+                BestCertainty(ChoicesList),
+                Threshold + BestCertainty(ChoicesList));
       }
       WordChoice->set_rating(WERD_CHOICE::kBadRating);
       return;
@@ -466,7 +518,7 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
   }
 
   // See if a choice with the same text string has already been found.
-  NewChoice = NULL;
+  VIABLE_CHOICE NewChoice = NULL;
   Choices = ChoicesList;
 
   iterate(Choices) {
@@ -480,11 +532,10 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
   }
 
   if (NewChoice) {
-    FillViableChoice(*WordChoice, AdjustFactor, Certainties, true, NewChoice);
+    FillViableChoice(*WordChoice, AdjustFactor, Certainties, NewChoice);
     ChoicesList = delete_d(ChoicesList, NewChoice, is_same_node);
-  }
-  else {
-    NewChoice = NewViableChoice (*WordChoice, AdjustFactor, Certainties);
+  } else {
+    NewChoice = NewViableChoice(*WordChoice, AdjustFactor, Certainties);
   }
 
   ChoicesList = s_adjoin (ChoicesList, NewChoice, CmpChoiceRatings);
@@ -494,7 +545,7 @@ void Dict::LogNewChoice(FLOAT32 AdjustFactor,
   if (count (ChoicesList) > tessedit_truncate_wordchoice_log) {
     Choices =
       (LIST) nth_cell (ChoicesList, tessedit_truncate_wordchoice_log);
-    destroy_nodes (list_rest (Choices), Efree);
+    destroy_nodes(list_rest (Choices), DeleteViableChoiceStruct);
     set_rest(Choices, NIL_LIST);
   }
 
@@ -513,7 +564,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
                             bool *modified_blobs) {
   if (stopper_debug_level > 2) {
     tprintf("\nRunning NoDangerousAmbig() for %s\n",
-            best_choice->debug_string(getUnicharset()).string());
+            best_choice->debug_string().string());
   }
 
   // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
@@ -549,8 +600,10 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
       for (i = 0; i < best_choice->length(); ++i) {
         BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST();
         BLOB_CHOICE_IT lst_it(lst);
+        // TODO(rays/antonova) Should these BLOB_CHOICEs use real xheights
+        // or are these fake ones good enough?
         lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
-                                          0.0, 0.0, -1, -1, -1));
+                                          0.0, 0.0, -1, -1, -1, 0, 1, false));
         ambig_blob_choices.push_back(lst);
       }
     }
@@ -630,7 +683,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
               BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
               bc_it.add_to_end(new BLOB_CHOICE(
                   ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
-                  -1, -1, -1));
+                  -1, -1, -1, 0, 1, false));
             }
           }
           spec_it.forward();
@@ -650,7 +703,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
       }  // end searching AmbigSpec_LIST
     }  // end searching best_choice
   }  // end searching replace and dangerous ambigs
-  if (modified_best_choice) best_choice->populate_unichars(getUnicharset());
+  if (modified_best_choice) best_choice->populate_unichars();
   // If any ambiguities were found permute the constructed ambig_blob_choices
   // to see if an alternative dictionary word can be found.
   if (ambigs_found) {
@@ -666,7 +719,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
     if (ambigs_found) {
       if (stopper_debug_level >= 1) {
         tprintf ("Stopper: Possible ambiguous word = %s\n",
-                 alt_word->debug_string(getUnicharset()).string());
+                 alt_word->debug_string().string());
       }
       if (fixpt != NULL) {
         // Note: Currently character choices combined from fragments can only
@@ -691,6 +744,10 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice,
     }
     delete alt_word;
   }
+  if (output_ambig_words_file_ != NULL) {
+    fprintf(output_ambig_words_file_, "\n");
+  }
+
   ambig_blob_choices.delete_data_pointers();
   return !ambigs_found;
 }
@@ -714,7 +771,6 @@ void Dict::AddNewChunk(VIABLE_CHOICE Choice, int Blob) {
       return;
     }
   }
-  mem_tidy (1);
   cprintf ("AddNewChunk failed:Choice->Length=%d, LastChunk=%d, Blob=%d\n",
            Choice->Length, LastChunk, Blob);
   assert(false);  // this should never get executed
@@ -748,7 +804,7 @@ void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
       for (i = 0; i < fraglen; ++i) {
         if (fraglen > 1) {
           STRING frag_str =
-            CHAR_FRAGMENT::to_string(temp_uch, i, fraglen);
+            CHAR_FRAGMENT::to_string(temp_uch, i, fraglen, false);
           getUnicharset().unichar_insert(frag_str.string());
           uch_id = getUnicharset().unichar_to_id(frag_str.string());
         }
@@ -756,7 +812,7 @@ void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
         STRING correct_frag_uch =
           CHAR_FRAGMENT::to_string(correct_ngram_str,
                                    temp_blob_index - begin_blob_index,
-                                   num_blobs_to_replace);
+                                   num_blobs_to_replace, false);
         getUnicharset().unichar_insert(correct_frag_uch.string());
         UNICHAR_ID correct_frag_uch_id =
           getUnicharset().unichar_to_id(correct_frag_uch.string());
@@ -825,10 +881,9 @@ VIABLE_CHOICE Dict::NewViableChoice(const WERD_CHOICE &WordChoice,
                                     const float Certainties[]) {
   int Length = WordChoice.length();
   assert (Length <= MAX_NUM_CHUNKS && Length > 0);
-  VIABLE_CHOICE NewChoice = (VIABLE_CHOICE) Emalloc (
-      sizeof (VIABLE_CHOICE_STRUCT) + (Length - 1) * sizeof (CHAR_CHOICE));
-  FillViableChoice(WordChoice, AdjustFactor, Certainties, false, NewChoice);
-  return (NewChoice);
+  VIABLE_CHOICE NewChoice = new VIABLE_CHOICE_STRUCT(Length);
+  FillViableChoice(WordChoice, AdjustFactor, Certainties, NewChoice);
+  return NewChoice;
 }
 
 void Dict::PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice) {
@@ -864,35 +919,10 @@ void Dict::PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice
 
 void Dict::FillViableChoice(const WERD_CHOICE &WordChoice,
                             FLOAT32 AdjustFactor, const float Certainties[],
-                            bool SameString, VIABLE_CHOICE ViableChoice) {
-  CHAR_CHOICE *NewChar;
-  BLOB_WIDTH *BlobWidth;
-  int x;
+                            VIABLE_CHOICE ViableChoice) {
+  ViableChoice->Init(WordChoice, current_segmentation_, Certainties,
+                     AdjustFactor);
 
-  ViableChoice->Rating = WordChoice.rating();
-  ViableChoice->Certainty = WordChoice.certainty();
-  ViableChoice->AdjustFactor = AdjustFactor;
-  ViableChoice->ComposedFromCharFragments = false;
-  if (!SameString) {
-    ViableChoice->Length = WordChoice.length();
-  }
-  for (x = 0,
-       NewChar = &(ViableChoice->Blob[0]),
-       BlobWidth = current_segmentation_;
-       x < WordChoice.length();
-       x++, NewChar++, Certainties++, BlobWidth++) {
-    if (!SameString) {
-      NewChar->Class = WordChoice.unichar_id(x);
-    }
-    NewChar->NumChunks = *BlobWidth;
-    NewChar->Certainty = *Certainties;
-    for (int i = 1; i < WordChoice.fragment_length(x); ++i) {
-      BlobWidth++;
-      assert(*BlobWidth > 0);
-      NewChar->NumChunks += *BlobWidth;
-      ViableChoice->ComposedFromCharFragments = true;
-    }
-  }
 }
 
 bool Dict::StringSameAs(const WERD_CHOICE &WordChoice,
diff --git a/dict/stopper.h b/dict/stopper.h
index d9993c4be..6ff597be9 100644
--- a/dict/stopper.h
+++ b/dict/stopper.h
@@ -27,6 +27,8 @@
 #include "states.h"
 #include "unichar.h"
 
+class WERD_CHOICE;
+
 typedef uinT8 BLOB_WIDTH;
 
 struct DANGERR_INFO {
@@ -50,13 +52,36 @@ struct CHAR_CHOICE {
   float Certainty;
 };
 
-struct VIABLE_CHOICE_STRUCT {
+class VIABLE_CHOICE_STRUCT {
+ public:
+  VIABLE_CHOICE_STRUCT();
+  explicit VIABLE_CHOICE_STRUCT(int length);
+  ~VIABLE_CHOICE_STRUCT();
+
+  // Fill in the data with these values.
+  void Init(const WERD_CHOICE& word_choice,
+            const PIECES_STATE& pieces_state,
+            const float certainties[],
+            FLOAT32 adjust_factor);
+
+  int Length;
   float Rating;
   float Certainty;
   FLOAT32 AdjustFactor;
-  int Length;
   bool ComposedFromCharFragments;
-  CHAR_CHOICE Blob[1];
+  CHAR_CHOICE *Blob;
+
+  // segmentation_state: for each choice, how many consecutive blobs
+  //     does it use?
+  uinT8 *segmentation_state;
+
+ private:
+  // Disallow assignment and copy construction
+  VIABLE_CHOICE_STRUCT(const VIABLE_CHOICE_STRUCT &other)
+      : Length(0), Blob(NULL), segmentation_state(NULL) {}
+  VIABLE_CHOICE_STRUCT &operator=(const VIABLE_CHOICE_STRUCT &other) {
+    return *this;
+  }
 };
 
 typedef VIABLE_CHOICE_STRUCT *VIABLE_CHOICE;
diff --git a/dict/trie.cpp b/dict/trie.cpp
index a981d7e95..ededbaf12 100644
--- a/dict/trie.cpp
+++ b/dict/trie.cpp
@@ -40,6 +40,16 @@
 
 namespace tesseract {
 
+const char kDoNotReverse[] = "RRP_DO_NO_REVERSE";
+const char kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL";
+const char kForceReverse[] = "RRP_FORCE_REVERSE";
+
+const char * const RTLReversePolicyNames[] = {
+  kDoNotReverse,
+  kReverseIfHasRTL,
+  kForceReverse
+};
+
 const char Trie::kAlphaPatternUnicode[] = "\u2000";
 const char Trie::kDigitPatternUnicode[] = "\u2001";
 const char Trie::kAlphanumPatternUnicode[] = "\u2002";
@@ -47,6 +57,10 @@ const char Trie::kPuncPatternUnicode[] = "\u2003";
 const char Trie::kLowerPatternUnicode[] = "\u2004";
 const char Trie::kUpperPatternUnicode[] = "\u2005";
 
+const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) {
+  return RTLReversePolicyNames[reverse_policy];
+}
+
 // Reset the Trie to empty.
 void Trie::clear() {
   nodes_.delete_data_pointers();
@@ -156,10 +170,15 @@ void Trie::add_word_ending(EDGE_RECORD *edge_ptr,
   *edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
 }
 
-void Trie::add_word_to_dawg(const WERD_CHOICE &word,
+bool Trie::add_word_to_dawg(const WERD_CHOICE &word,
                             const GenericVector<bool> *repetitions) {
-  if (word.length() <= 0) return;  // can't add empty words
+  if (word.length() <= 0) return false;  // can't add empty words
   if (repetitions != NULL) ASSERT_HOST(repetitions->size() == word.length());
+  // Make sure the word does not contain invalid unichar ids.
+  for (int i = 0; i < word.length(); ++i) {
+    if (word.unichar_id(i) < 0 ||
+        word.unichar_id(i) >= unicharset_size_) return false;
+  }
 
   EDGE_RECORD *edge_ptr;
   NODE_REF last_node = 0;
@@ -233,6 +252,9 @@ void Trie::add_word_to_dawg(const WERD_CHOICE &word,
   if (add_failed) {
     tprintf("Re-initializing document dictionary...\n");
     clear();
+    return false;
+  } else {
+    return true;
   }
 }
 
@@ -244,7 +266,8 @@ NODE_REF Trie::new_dawg_node() {
 }
 
 bool Trie::read_word_list(const char *filename,
-                          const UNICHARSET &unicharset) {
+                          const UNICHARSET &unicharset,
+                          Trie::RTLReversePolicy reverse_policy) {
   FILE *word_file;
   char string[CHARS_PER_LINE];
   int  word_count = 0;
@@ -254,6 +277,11 @@ bool Trie::read_word_list(const char *filename,
   while (fgets(string, CHARS_PER_LINE, word_file) != NULL) {
     chomp_string(string);  // remove newline
     WERD_CHOICE word(string, unicharset);
+    if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL &&
+        word.has_rtl_unichar_id()) ||
+        reverse_policy == RRP_FORCE_REVERSE) {
+      word.reverse_and_mirror_unichar_ids();
+    }
     ++word_count;
     if (debug_level_ && word_count % 10000 == 0)
       tprintf("Read %d words so far\n", word_count);
@@ -290,6 +318,7 @@ void Trie::initialize_patterns(UNICHARSET *unicharset) {
   unicharset->unichar_insert(kUpperPatternUnicode);
   upper_pattern_ = unicharset->unichar_to_id(kUpperPatternUnicode);
   initialized_patterns_ = true;
+  unicharset_size_ = unicharset->size();
 }
 
 void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id,
@@ -351,7 +380,7 @@ bool Trie::read_pattern_list(const char *filename,
     chomp_string(string);  // remove newline
     // Parse the pattern and construct a unichar id vector.
     // Record the number of repetitions of each unichar in the parallel vector.
-    WERD_CHOICE word;
+    WERD_CHOICE word(&unicharset);
     GenericVector<bool> repetitions_vec;
     const char *str_ptr = string;
     int step = unicharset.step(str_ptr);
@@ -397,7 +426,7 @@ bool Trie::read_pattern_list(const char *filename,
     // Insert the pattern into the trie.
     if (debug_level_ > 2) {
       tprintf("Inserting expanded user pattern %s\n",
-              word.debug_string(unicharset).string());
+              word.debug_string().string());
     }
     if (!this->word_in_dawg(word)) {
       this->add_word_to_dawg(word, &repetitions_vec);
diff --git a/dict/trie.h b/dict/trie.h
index 2196e28d1..bf1bfb83b 100644
--- a/dict/trie.h
+++ b/dict/trie.h
@@ -61,6 +61,12 @@ namespace tesseract {
  */
 class Trie : public Dawg {
  public:
+  enum RTLReversePolicy {
+    RRP_DO_NO_REVERSE,
+    RRP_REVERSE_IF_HAS_RTL,
+    RRP_FORCE_REVERSE,
+  };
+
   // Minimum number of concrete characters at the beginning of user patterns.
   static const int kSaneNumConcreteChars = 4;
   // Various unicode whitespace characters are used to denote unichar patterns,
@@ -73,6 +79,9 @@ class Trie : public Dawg {
   static const char kLowerPatternUnicode[];
   static const char kUpperPatternUnicode[];
 
+  static const char *get_reverse_policy_name(
+      RTLReversePolicy reverse_policy);
+
   // max_num_edges argument allows limiting the amount of memory this
   // Trie can consume (if a new word insert would cause the Trie to
   // contain more edges than max_num_edges, all the edges are cleared
@@ -86,7 +95,7 @@ class Trie : public Dawg {
     new_dawg_node();  // need to allocate node 0
     initialized_patterns_ = false;
   }
-  ~Trie() { nodes_.delete_data_pointers(); }
+  virtual ~Trie() { nodes_.delete_data_pointers(); }
 
   // Reset the Trie to empty.
   void clear();
@@ -149,8 +158,11 @@ class Trie : public Dawg {
   SquishedDawg *trie_to_dawg();
 
   // Inserts the list of words from the given file into the Trie.
+  // The reverse policy decides whether reverse_and_mirror_unichar_ids()
+  // is called on each word before inserting it into the Trie.
   bool read_word_list(const char *filename,
-                      const UNICHARSET &unicharset);
+                      const UNICHARSET &unicharset,
+                      Trie::RTLReversePolicy reverse);
 
   // Inserts the list of patterns from the given file into the Trie.
   // The pattern list file should contain one pattern per line in UTF-8 format.
@@ -225,10 +237,13 @@ class Trie : public Dawg {
   // whether the unichar id with the corresponding index in the word is allowed
   // to repeat an unlimited number of times. For each entry that is true, MARKER
   // flag of the corresponding edge created for this unichar id is set to true).
-  void add_word_to_dawg(const WERD_CHOICE &word,
+  //
+  // Return true if add succeeded, false otherwise (e.g. when a word contained
+  // an invalid unichar id or the trie was getting too large and was cleared).
+  bool add_word_to_dawg(const WERD_CHOICE &word,
                         const GenericVector<bool> *repetitions);
-  void add_word_to_dawg(const WERD_CHOICE &word) {
-    add_word_to_dawg(word, NULL);
+  bool add_word_to_dawg(const WERD_CHOICE &word) {
+    return add_word_to_dawg(word, NULL);
   }
 
  protected:
@@ -377,11 +392,11 @@ class Trie : public Dawg {
   UNICHAR_ID character_class_to_pattern(char ch);
 
   // Member variables
-  TRIE_NODES nodes_;              ///< vector of nodes in the Trie
-  uinT64 num_edges_;              ///< sum of all edges (forward and backward)
-  uinT64 max_num_edges_;          ///< maximum number of edges allowed
-  uinT64 deref_direction_mask_;   ///< mask for EDGE_REF to extract direction
-  uinT64 deref_node_index_mask_;  ///< mask for EDGE_REF to extract node index
+  TRIE_NODES nodes_;              // vector of nodes in the Trie
+  uinT64 num_edges_;              // sum of all edges (forward and backward)
+  uinT64 max_num_edges_;          // maximum number of edges allowed
+  uinT64 deref_direction_mask_;   // mask for EDGE_REF to extract direction
+  uinT64 deref_node_index_mask_;  // mask for EDGE_REF to extract node index
   // Variables for translating character class codes denoted in user patterns
   // file to the unichar ids used to represent them in a Trie.
   bool initialized_patterns_;