Move training to src.

Egor Pugin 2018-04-25 11:35:26 +03:00
parent ca5c15e6a8
commit 104fe7931c
65 changed files with 1664 additions and 1664 deletions


@@ -308,7 +308,7 @@ if (BUILD_TESTS AND EXISTS ${PROJECT_SOURCE_DIR}/googletest/CMakeLists.txt)
endif()
if (BUILD_TRAINING_TOOLS)
-add_subdirectory(training)
+add_subdirectory(src/training)
endif()
get_target_property(tesseract_NAME libtesseract NAME)


@@ -502,7 +502,7 @@ AC_CONFIG_FILES([java/com/google/scrollview/Makefile])
AC_CONFIG_FILES([java/com/google/scrollview/events/Makefile])
AC_CONFIG_FILES([java/com/google/scrollview/ui/Makefile])
AC_CONFIG_FILES([doc/Makefile])
-AM_COND_IF([ENABLE_TRAINING], [AC_CONFIG_FILES(training/Makefile)])
+AM_COND_IF([ENABLE_TRAINING], [AC_CONFIG_FILES(src/training/Makefile)])
AC_OUTPUT
# Final message


@@ -172,7 +172,7 @@ projects:
tessopt:
type: lib
static_only: true
-files: training/tessopt.*
+files: src/training/tessopt.*
include_directories: training
dependencies: libtesseract
@@ -180,104 +180,104 @@ projects:
type: lib
static_only: true
files:
-- training/commandlineflags.cpp
-- training/commandlineflags.h
-- training/commontraining.cpp
-- training/commontraining.h
+- src/training/commandlineflags.cpp
+- src/training/commandlineflags.h
+- src/training/commontraining.cpp
+- src/training/commontraining.h
include_directories: training
dependencies:
- tessopt
ambiguous_words:
-files: training/ambiguous_words.cpp
+files: src/training/ambiguous_words.cpp
dependencies:
- libtesseract
classifier_tester:
-files: training/classifier_tester.cpp
+files: src/training/classifier_tester.cpp
dependencies: common_training
combine_lang_model:
-files: training/combine_lang_model.cpp
+files: src/training/combine_lang_model.cpp
dependencies: unicharset_training
combine_tessdata:
-files: training/combine_tessdata.cpp
+files: src/training/combine_tessdata.cpp
dependencies: libtesseract
cntraining:
-files: training/cntraining.cpp
+files: src/training/cntraining.cpp
dependencies: common_training
dawg2wordlist:
-files: training/dawg2wordlist.cpp
+files: src/training/dawg2wordlist.cpp
dependencies: libtesseract
mftraining:
files:
-- training/mftraining.cpp
-- training/mergenf.*
+- src/training/mftraining.cpp
+- src/training/mergenf.*
dependencies: common_training
shapeclustering:
-files: training/shapeclustering.cpp
+files: src/training/shapeclustering.cpp
dependencies: common_training
unicharset_extractor:
-files: training/unicharset_extractor.cpp
+files: src/training/unicharset_extractor.cpp
dependencies: unicharset_training
wordlist2dawg:
-files: training/wordlist2dawg.cpp
+files: src/training/wordlist2dawg.cpp
dependencies: libtesseract
unicharset_training:
type: lib
static_only: true
files:
-- training/fileio.*
-- training/icuerrorcode.h
-- training/lang_model_helpers.*
-- training/lstmtester.*
-- training/normstrngs.*
-- training/unicharset_training_utils.*
-- training/validat.*
+- src/training/fileio.*
+- src/training/icuerrorcode.h
+- src/training/lang_model_helpers.*
+- src/training/lstmtester.*
+- src/training/normstrngs.*
+- src/training/unicharset_training_utils.*
+- src/training/validat.*
include_directories: training
dependencies:
- common_training
- pvt.cppan.demo.unicode.icu.i18n
lstmeval:
-files: training/lstmeval.cpp
+files: src/training/lstmeval.cpp
dependencies: unicharset_training
lstmtraining:
-files: training/lstmtraining.cpp
+files: src/training/lstmtraining.cpp
dependencies: unicharset_training
set_unicharset_properties:
-files: training/set_unicharset_properties.cpp
+files: src/training/set_unicharset_properties.cpp
dependencies: unicharset_training
text2image:
files:
-- training/text2image.cpp
-- training/boxchar.cpp
-- training/boxchar.h
-- training/degradeimage.cpp
-- training/degradeimage.h
-- training/ligature_table.cpp
-- training/ligature_table.h
-- training/normstrngs.cpp
-- training/normstrngs.h
-- training/pango_font_info.cpp
-- training/pango_font_info.h
-- training/stringrenderer.cpp
-- training/stringrenderer.h
-- training/tlog.cpp
-- training/tlog.h
-- training/util.h
-- training/icuerrorcode.h
+- src/training/text2image.cpp
+- src/training/boxchar.cpp
+- src/training/boxchar.h
+- src/training/degradeimage.cpp
+- src/training/degradeimage.h
+- src/training/ligature_table.cpp
+- src/training/ligature_table.h
+- src/training/normstrngs.cpp
+- src/training/normstrngs.h
+- src/training/pango_font_info.cpp
+- src/training/pango_font_info.h
+- src/training/stringrenderer.cpp
+- src/training/stringrenderer.h
+- src/training/tlog.cpp
+- src/training/tlog.h
+- src/training/util.h
+- src/training/icuerrorcode.h
dependencies:
- unicharset_training


@@ -1,310 +1,310 @@
/**********************************************************************
* File: degradeimage.cpp
* Description: Function to degrade an image (usually of text) as if it
* has been printed and then scanned.
* Authors: Ray Smith
* Created: Tue Nov 19 2013
*
* (C) Copyright 2013, Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#include "degradeimage.h"
#include <stdlib.h>
#include "allheaders.h" // from leptonica
#include "genericvector.h"
#include "helpers.h" // For TRand.
#include "rect.h"
namespace tesseract {
// A randomized perspective distortion can be applied to synthetic input.
// The perspective distortion comes from leptonica, which uses 2 sets of 4
// corners to determine the distortion. There are random values for each of
// the x numbers x0..x3 and y0..y3, except for x2 and x3 which are instead
// defined in terms of a single shear value. This reduces the degrees of
// freedom enough to make the distortion more realistic than it would otherwise
// be if all 8 coordinates could move independently.
// One additional factor is used for the color of the pixels that don't exist
// in the source image.
// Name for each of the randomizing factors.
enum FactorNames {
FN_INCOLOR,
FN_Y0,
FN_Y1,
FN_Y2,
FN_Y3,
FN_X0,
FN_X1,
FN_SHEAR,
// x2 = x1 - shear
// x3 = x0 + shear
FN_NUM_FACTORS
};
// Rotation is +/- kRotationRange radians.
const float kRotationRange = 0.02f;
// Number of grey levels to shift by for each exposure step.
const int kExposureFactor = 16;
// Salt and pepper noise is +/- kSaltnPepper.
const int kSaltnPepper = 5;
// Min sum of width + height on which to operate the ramp.
const int kMinRampSize = 1000;
// Degrade the pix as if by a print/copy/scan cycle with exposure > 0
// corresponding to darkening on the copier and <0 lighter and 0 not copied.
// Exposures in [-2,2] are most useful, with -3 and 3 being extreme.
// If rotation is nullptr, rotation is skipped. If *rotation is non-zero, the
// pix is rotated by *rotation else it is randomly rotated and *rotation is
// modified.
//
// HOW IT WORKS:
// Most of the process is really dictated by the fact that the minimum
// available convolution is 3X3, which is too big really to simulate a
// good quality print/scan process. (2X2 would be better.)
// 1 pixel wide inputs are heavily smeared by the 3X3 convolution, making the
// images generally biased to being too light, so most of the work is to make
// them darker. 3 levels of thickening/darkening are achieved with 2 dilations,
// (using a greyscale erosion) one heavy (by being before convolution) and one
// light (after convolution).
// With no dilation, after convolution, the images are so light that a heavy
// constant offset is required to make the 0 image look reasonable. A simple
// constant offset multiple of exposure to undo this value is enough to achieve
// all the required lightening. This gives the advantage that exposure level 1
// with a single dilation gives a good impression of the broken-yet-too-dark
// problem that is often seen in scans.
// A small random rotation gives some varying greyscale values on the edges,
// and some random salt and pepper noise on top helps to realistically jaggy-up
// the edges.
// Finally a greyscale ramp provides a continuum of effects between exposure
// levels.
Pix* DegradeImage(Pix* input, int exposure, TRand* randomizer,
float* rotation) {
Pix* pix = pixConvertTo8(input, false);
pixDestroy(&input);
input = pix;
int width = pixGetWidth(input);
int height = pixGetHeight(input);
if (exposure >= 2) {
// An erosion simulates the spreading darkening of a dark copy.
// This is backwards to binary morphology,
// see http://www.leptonica.com/grayscale-morphology.html
pix = input;
input = pixErodeGray(pix, 3, 3);
pixDestroy(&pix);
}
// A convolution is essential to any mode as no scanner produces an
// image as sharp as the electronic image.
pix = pixBlockconv(input, 1, 1);
pixDestroy(&input);
// A small random rotation helps to make the edges jaggy in a realistic way.
if (rotation != nullptr) {
float radians_clockwise = 0.0f;
if (*rotation) {
radians_clockwise = *rotation;
} else if (randomizer != nullptr) {
radians_clockwise = randomizer->SignedRand(kRotationRange);
}
input = pixRotate(pix, radians_clockwise,
L_ROTATE_AREA_MAP, L_BRING_IN_WHITE,
0, 0);
// Rotate the boxes to match.
*rotation = radians_clockwise;
pixDestroy(&pix);
} else {
input = pix;
}
if (exposure >= 3 || exposure == 1) {
// Erosion after the convolution is not as heavy as before, so it is
// good for level 1 and in addition as a level 3.
// This is backwards to binary morphology,
// see http://www.leptonica.com/grayscale-morphology.html
pix = input;
input = pixErodeGray(pix, 3, 3);
pixDestroy(&pix);
}
// The convolution really needed to be 2x2 to be realistic enough, but
// we only have 3x3, so we have to bias the image darker or lose thin
// strokes.
int erosion_offset = 0;
// For light and 0 exposure, there is no dilation, so compensate for the
// convolution with a big darkening bias which is undone for lighter
// exposures.
if (exposure <= 0)
erosion_offset = -3 * kExposureFactor;
// Add in a general offset of the greyscales for the exposure level so
// a threshold of 128 gives a reasonable binary result.
erosion_offset -= exposure * kExposureFactor;
// Add a gradual fade over the page and a small amount of salt and pepper
// noise to simulate noise in the sensor/paper fibres and varying
// illumination.
l_uint32* data = pixGetData(input);
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
int pixel = GET_DATA_BYTE(data, x);
if (randomizer != nullptr)
pixel += randomizer->IntRand() % (kSaltnPepper*2 + 1) - kSaltnPepper;
if (height + width > kMinRampSize)
pixel -= (2*x + y) * 32 / (height + width);
pixel += erosion_offset;
if (pixel < 0)
pixel = 0;
if (pixel > 255)
pixel = 255;
SET_DATA_BYTE(data, x, pixel);
}
data += input->wpl;
}
return input;
}
// Creates and returns a Pix distorted by various means according to the bool
// flags. If boxes is not nullptr, the boxes are resized/positioned according to
// any spatial distortion and also by the integer reduction factor box_scale
// so they will match what the network will output.
// Returns nullptr on error. The returned Pix must be pixDestroyed.
Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert,
bool white_noise, bool smooth_noise, bool blur,
int box_reduction, TRand* randomizer,
GenericVector<TBOX>* boxes) {
Pix* distorted = pixCopy(nullptr, const_cast<Pix*>(pix));
// Things to do to synthetic training data.
if (invert && randomizer->SignedRand(1.0) < 0)
pixInvert(distorted, distorted);
if ((white_noise || smooth_noise) && randomizer->SignedRand(1.0) > 0.0) {
// TODO(rays) Cook noise in a more thread-safe manner than rand().
// Attempt to make the sequences reproducible.
srand(randomizer->IntRand());
Pix* pixn = pixAddGaussianNoise(distorted, 8.0);
pixDestroy(&distorted);
if (smooth_noise) {
distorted = pixBlockconv(pixn, 1, 1);
pixDestroy(&pixn);
} else {
distorted = pixn;
}
}
if (blur && randomizer->SignedRand(1.0) > 0.0) {
Pix* blurred = pixBlockconv(distorted, 1, 1);
pixDestroy(&distorted);
distorted = blurred;
}
if (perspective)
GeneratePerspectiveDistortion(0, 0, randomizer, &distorted, boxes);
if (boxes != nullptr) {
for (int b = 0; b < boxes->size(); ++b) {
(*boxes)[b].scale(1.0f / box_reduction);
if ((*boxes)[b].width() <= 0)
(*boxes)[b].set_right((*boxes)[b].left() + 1);
}
}
return distorted;
}
// Distorts anything that has a non-null pointer with the same pseudo-random
// perspective distortion. Width and height only need to be set if there
// is no pix. If there is a pix, then they will be taken from there.
void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer,
Pix** pix, GenericVector<TBOX>* boxes) {
if (pix != nullptr && *pix != nullptr) {
width = pixGetWidth(*pix);
height = pixGetHeight(*pix);
}
float* im_coeffs = nullptr;
float* box_coeffs = nullptr;
l_int32 incolor =
ProjectiveCoeffs(width, height, randomizer, &im_coeffs, &box_coeffs);
if (pix != nullptr && *pix != nullptr) {
// Transform the image.
Pix* transformed = pixProjective(*pix, im_coeffs, incolor);
if (transformed == nullptr) {
tprintf("Projective transformation failed!!\n");
return;
}
pixDestroy(pix);
*pix = transformed;
}
if (boxes != nullptr) {
// Transform the boxes.
for (int b = 0; b < boxes->size(); ++b) {
int x1, y1, x2, y2;
const TBOX& box = (*boxes)[b];
projectiveXformSampledPt(box_coeffs, box.left(), height - box.top(), &x1,
&y1);
projectiveXformSampledPt(box_coeffs, box.right(), height - box.bottom(),
&x2, &y2);
TBOX new_box1(x1, height - y2, x2, height - y1);
projectiveXformSampledPt(box_coeffs, box.left(), height - box.bottom(),
&x1, &y1);
projectiveXformSampledPt(box_coeffs, box.right(), height - box.top(), &x2,
&y2);
TBOX new_box2(x1, height - y1, x2, height - y2);
(*boxes)[b] = new_box1.bounding_union(new_box2);
}
}
free(im_coeffs);
free(box_coeffs);
}
// Computes the coefficients of a randomized projective transformation.
// The image transform requires backward transformation coefficient, and the
// box transform the forward coefficients.
// Returns the incolor arg to pixProjective.
int ProjectiveCoeffs(int width, int height, TRand* randomizer,
float** im_coeffs, float** box_coeffs) {
// Setup "from" points.
Pta* src_pts = ptaCreate(4);
ptaAddPt(src_pts, 0.0f, 0.0f);
ptaAddPt(src_pts, width, 0.0f);
ptaAddPt(src_pts, width, height);
ptaAddPt(src_pts, 0.0f, height);
// Extract factors from pseudo-random sequence.
float factors[FN_NUM_FACTORS];
float shear = 0.0f; // Shear is signed.
for (int i = 0; i < FN_NUM_FACTORS; ++i) {
// Everything is squared to make wild values rarer.
if (i == FN_SHEAR) {
// Shear is signed.
shear = randomizer->SignedRand(0.5 / 3.0);
shear = shear >= 0.0 ? shear * shear : -shear * shear;
// Keep the sheared points within the original rectangle.
if (shear < -factors[FN_X0]) shear = -factors[FN_X0];
if (shear > factors[FN_X1]) shear = factors[FN_X1];
factors[i] = shear;
} else if (i != FN_INCOLOR) {
factors[i] = fabs(randomizer->SignedRand(1.0));
if (i <= FN_Y3)
factors[i] *= 5.0 / 8.0;
else
factors[i] *= 0.5;
factors[i] *= factors[i];
}
}
// Setup "to" points.
Pta* dest_pts = ptaCreate(4);
ptaAddPt(dest_pts, factors[FN_X0] * width, factors[FN_Y0] * height);
ptaAddPt(dest_pts, (1.0f - factors[FN_X1]) * width, factors[FN_Y1] * height);
ptaAddPt(dest_pts, (1.0f - factors[FN_X1] + shear) * width,
(1 - factors[FN_Y2]) * height);
ptaAddPt(dest_pts, (factors[FN_X0] + shear) * width,
(1 - factors[FN_Y3]) * height);
getProjectiveXformCoeffs(dest_pts, src_pts, im_coeffs);
getProjectiveXformCoeffs(src_pts, dest_pts, box_coeffs);
ptaDestroy(&src_pts);
ptaDestroy(&dest_pts);
return factors[FN_INCOLOR] > 0.5f ? L_BRING_IN_WHITE : L_BRING_IN_BLACK;
}
} // namespace tesseract
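For orientation, here is a minimal usage sketch of the two entry points above. It is not code from this commit: the input and output file names are hypothetical, and it assumes only the DegradeImage/PrepareDistortedPix signatures shown here plus standard leptonica calls (pixRead, pixWrite, pixDestroy).
#include "allheaders.h"    // leptonica: pixRead, pixWrite, pixDestroy
#include "degradeimage.h"  // tesseract::DegradeImage, tesseract::PrepareDistortedPix
#include "helpers.h"       // tesseract::TRand
int main() {
  Pix* src = pixRead("line.tif");  // hypothetical 8-bit grey (or 0/255 binary) input
  if (src == nullptr) return 1;
  tesseract::TRand randomizer;     // pseudo-random source for all distortions
  // Simulate a print/copy/scan cycle at exposure +1. DegradeImage consumes src
  // and reports the random rotation it applied through the last argument.
  float rotation = 0.0f;
  Pix* degraded =
      tesseract::DegradeImage(src, /*exposure=*/1, &randomizer, &rotation);
  // Layer noise, blur and a perspective warp on top. With boxes == nullptr no
  // box coordinates are tracked, so box_reduction is effectively unused.
  Pix* distorted = tesseract::PrepareDistortedPix(
      degraded, /*perspective=*/true, /*invert=*/false,
      /*white_noise=*/true, /*smooth_noise=*/true, /*blur=*/true,
      /*box_reduction=*/1, &randomizer, /*boxes=*/nullptr);
  pixWrite("line_degraded.png", distorted, IFF_PNG);  // hypothetical output name
  pixDestroy(&degraded);
  pixDestroy(&distorted);
  return 0;
}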


@@ -1,61 +1,61 @@
/**********************************************************************
* File: degradeimage.h
* Description: Function to degrade an image (usually of text) as if it
* has been printed and then scanned.
* Authors: Ray Smith
* Created: Tue Nov 19 2013
*
* (C) Copyright 2013, Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_TRAINING_DEGRADEIMAGE_H_
#define TESSERACT_TRAINING_DEGRADEIMAGE_H_
#include "allheaders.h"
#include "genericvector.h"
#include "helpers.h" // For TRand.
#include "rect.h"
namespace tesseract {
// Degrade the pix as if by a print/copy/scan cycle with exposure > 0
// corresponding to darkening on the copier and <0 lighter and 0 not copied.
// If rotation is not nullptr, the clockwise rotation in radians is saved there.
// The input pix must be 8 bit grey. (Binary with values 0 and 255 is OK.)
// The input image is destroyed and a different image returned.
struct Pix* DegradeImage(struct Pix* input, int exposure, TRand* randomizer,
float* rotation);
// Creates and returns a Pix distorted by various means according to the bool
// flags. If boxes is not nullptr, the boxes are resized/positioned according to
// any spatial distortion and also by the integer reduction factor box_scale
// so they will match what the network will output.
// Returns nullptr on error. The returned Pix must be pixDestroyed.
Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert,
bool white_noise, bool smooth_noise, bool blur,
int box_reduction, TRand* randomizer,
GenericVector<TBOX>* boxes);
// Distorts anything that has a non-null pointer with the same pseudo-random
// perspective distortion. Width and height only need to be set if there
// is no pix. If there is a pix, then they will be taken from there.
void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer,
Pix** pix, GenericVector<TBOX>* boxes);
// Computes the coefficients of a randomized projective transformation.
// The image transform requires backward transformation coefficient, and the
// box transform the forward coefficients.
// Returns the incolor arg to pixProjective.
int ProjectiveCoeffs(int width, int height, TRand* randomizer,
float** im_coeffs, float** box_coeffs);
} // namespace tesseract
#endif // TESSERACT_TRAINING_DEGRADEIMAGE_H_


@@ -1,66 +1,66 @@
/**********************************************************************
* File: icuerrorcode.h
* Description: Wrapper class for UErrorCode, with conversion operators for
* direct use in ICU C and C++ APIs.
* Author: Fredrik Roubert
* Created: Thu July 4 2013
*
* Features:
* - The constructor initializes the internal UErrorCode to U_ZERO_ERROR,
* removing one common source of errors.
* - Same use in C APIs taking a UErrorCode* (pointer) and C++ taking
* UErrorCode& (reference), via conversion operators.
* - Automatic checking for success when it goes out of scope. On failure,
* the destructor will log an error message and exit.
*
* Most of ICU will handle errors gracefully and provide sensible fallbacks.
* Using IcuErrorCode, it is therefore possible to write very compact code
* that does sensible things on failure and provides logging for debugging.
*
* Example:
* IcuErrorCode icuerrorcode;
* return collator.compareUTF8(a, b, icuerrorcode) == UCOL_EQUAL;
*
* (C) Copyright 2013, Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_CCUTIL_ICUERRORCODE_H_
#define TESSERACT_CCUTIL_ICUERRORCODE_H_
#include "tprintf.h"
#include "unicode/errorcode.h" // From libicu
namespace tesseract {
class IcuErrorCode : public icu::ErrorCode {
public:
IcuErrorCode() {}
virtual ~IcuErrorCode() {
if (isFailure()) {
handleFailure();
}
}
protected:
virtual void handleFailure() const {
tprintf("ICU ERROR: %s", errorName());
exit(errorCode);
}
private:
// Disallow implicit copying of object.
IcuErrorCode(const IcuErrorCode&);
void operator=(const IcuErrorCode&);
};
} // namespace tesseract
#endif // TESSERACT_CCUTIL_ICUERRORCODE_H_
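The pattern described in the header comment can be made concrete with a short sketch. This is not repository code; it assumes ICU's icu::Collator::createInstance and Collator::compareUTF8 APIs and relies on the IcuErrorCode destructor to log and exit if any ICU call failed.
#include <memory>
#include "icuerrorcode.h"
#include "unicode/coll.h"  // icu::Collator, UCOL_EQUAL
// Returns true if two UTF-8 strings collate as equal in the default locale.
bool CollateEqualUtf8(const char* a, const char* b) {
  tesseract::IcuErrorCode status;  // starts out as U_ZERO_ERROR
  std::unique_ptr<icu::Collator> collator(
      icu::Collator::createInstance(status));
  if (collator == nullptr) return false;  // createInstance failed: the status
                                          // destructor will log and exit.
  // No explicit error check needed here either: on failure the IcuErrorCode
  // destructor prints the ICU error name via tprintf and exits.
  return collator->compareUTF8(a, b, status) == UCOL_EQUAL;
}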


@@ -1,353 +1,353 @@
/******************************************************************************
** Filename: MergeNF.c
** Purpose: Program for merging similar nano-feature protos
** Author: Dan Johnson
** History: Wed Nov 21 09:55:23 1990, DSJ, Created.
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
******************************************************************************/
#include "mergenf.h"
#include "host.h"
#include "efio.h"
#include "clusttool.h"
#include "cluster.h"
#include "oldlist.h"
#include "protos.h"
#include "ndminx.h"
#include "ocrfeatures.h"
#include "const.h"
#include "featdefs.h"
#include "intproto.h"
#include "params.h"
#include <stdio.h>
#include <string.h>
#include <math.h>
/*-------------------once in subfeat---------------------------------*/
double_VAR(training_angle_match_scale, 1.0, "Angle Match Scale ...");
double_VAR(training_similarity_midpoint, 0.0075, "Similarity Midpoint ...");
double_VAR(training_similarity_curl, 2.0, "Similarity Curl ...");
/*-----------------------------once in fasttrain----------------------------------*/
double_VAR(training_tangent_bbox_pad, 0.5, "Tangent bounding box pad ...");
double_VAR(training_orthogonal_bbox_pad, 2.5, "Orthogonal bounding box pad ...");
double_VAR(training_angle_pad, 45.0, "Angle pad ...");
/**
* Compare protos p1 and p2 and return an estimate of the
* worst evidence rating that will result for any part of p1
* that is compared to p2. In other words, if p1 were broken
* into pico-features and each pico-feature was matched to p2,
* what is the worst evidence rating that will be achieved for
* any pico-feature.
*
* @param p1, p2 protos to be compared
*
* Globals: none
*
* @return Worst possible result when matching p1 to p2.
* @note Exceptions: none
* @note History: Mon Nov 26 08:27:53 1990, DSJ, Created.
*/
FLOAT32 CompareProtos(PROTO p1, PROTO p2) {
FEATURE Feature;
FLOAT32 WorstEvidence = WORST_EVIDENCE;
FLOAT32 Evidence;
FLOAT32 Angle, Length;
/* if p1 and p2 are not close in length, don't let them match */
Length = fabs (p1->Length - p2->Length);
if (Length > MAX_LENGTH_MISMATCH)
return (0.0);
/* create a dummy pico-feature to be used for comparisons */
Feature = NewFeature (&PicoFeatDesc);
Feature->Params[PicoFeatDir] = p1->Angle;
/* convert angle to radians */
Angle = p1->Angle * 2.0 * PI;
/* find distance from center of p1 to 1/2 picofeat from end */
Length = p1->Length / 2.0 - GetPicoFeatureLength () / 2.0;
if (Length < 0) Length = 0;
/* set the dummy pico-feature at one end of p1 and match it to p2 */
Feature->Params[PicoFeatX] = p1->X + cos (Angle) * Length;
Feature->Params[PicoFeatY] = p1->Y + sin (Angle) * Length;
if (DummyFastMatch (Feature, p2)) {
Evidence = SubfeatureEvidence (Feature, p2);
if (Evidence < WorstEvidence)
WorstEvidence = Evidence;
} else {
FreeFeature(Feature);
return 0.0;
}
/* set the dummy pico-feature at the other end of p1 and match it to p2 */
Feature->Params[PicoFeatX] = p1->X - cos (Angle) * Length;
Feature->Params[PicoFeatY] = p1->Y - sin (Angle) * Length;
if (DummyFastMatch (Feature, p2)) {
Evidence = SubfeatureEvidence (Feature, p2);
if (Evidence < WorstEvidence)
WorstEvidence = Evidence;
} else {
FreeFeature(Feature);
return 0.0;
}
FreeFeature (Feature);
return (WorstEvidence);
} /* CompareProtos */
/**
* This routine computes a proto which is the weighted
* average of protos p1 and p2. The new proto is returned
* in MergedProto.
*
* @param p1, p2 protos to be merged
* @param w1, w2 weight of each proto
* @param MergedProto place to put resulting merged proto
*
* Globals: none
*
* @return none (results are returned in MergedProto)
* @note Exceptions: none
* @note History: Mon Nov 26 08:15:08 1990, DSJ, Created.
*/
void ComputeMergedProto (PROTO p1,
PROTO p2,
FLOAT32 w1,
FLOAT32 w2,
PROTO MergedProto) {
FLOAT32 TotalWeight;
TotalWeight = w1 + w2;
w1 /= TotalWeight;
w2 /= TotalWeight;
MergedProto->X = p1->X * w1 + p2->X * w2;
MergedProto->Y = p1->Y * w1 + p2->Y * w2;
MergedProto->Length = p1->Length * w1 + p2->Length * w2;
MergedProto->Angle = p1->Angle * w1 + p2->Angle * w2;
FillABC(MergedProto);
} /* ComputeMergedProto */
/**
* This routine searches through all of the prototypes in
* Class and returns the id of the proto which would provide
* the best approximation of Prototype. If no close
* approximation can be found, NO_PROTO is returned.
*
* @param Class class to search for matching old proto in
* @param NumMerged # of protos merged into each proto of Class
* @param Prototype new proto to find match for
*
* Globals: none
*
* @return Id of closest proto in Class or NO_PROTO.
* @note Exceptions: none
* @note History: Sat Nov 24 11:42:58 1990, DSJ, Created.
*/
int FindClosestExistingProto(CLASS_TYPE Class, int NumMerged[],
PROTOTYPE *Prototype) {
PROTO_STRUCT NewProto;
PROTO_STRUCT MergedProto;
int Pid;
PROTO Proto;
int BestProto;
FLOAT32 BestMatch;
FLOAT32 Match, OldMatch, NewMatch;
MakeNewFromOld (&NewProto, Prototype);
BestProto = NO_PROTO;
BestMatch = WORST_MATCH_ALLOWED;
for (Pid = 0; Pid < Class->NumProtos; Pid++) {
Proto = ProtoIn(Class, Pid);
ComputeMergedProto(Proto, &NewProto,
(FLOAT32) NumMerged[Pid], 1.0, &MergedProto);
OldMatch = CompareProtos(Proto, &MergedProto);
NewMatch = CompareProtos(&NewProto, &MergedProto);
Match = MIN(OldMatch, NewMatch);
if (Match > BestMatch) {
BestProto = Pid;
BestMatch = Match;
}
}
return BestProto;
} /* FindClosestExistingProto */
/**
* This fills in the fields of the New proto based on the
* fields of the Old proto.
*
* @param New new proto to be filled in
* @param Old old proto to be converted
*
* Globals: none
*
* Exceptions: none
* History: Mon Nov 26 09:45:39 1990, DSJ, Created.
*/
void MakeNewFromOld(PROTO New, PROTOTYPE *Old) {
New->X = CenterX(Old->Mean);
New->Y = CenterY(Old->Mean);
New->Length = LengthOf(Old->Mean);
New->Angle = OrientationOf(Old->Mean);
FillABC(New);
} /* MakeNewFromOld */
/*-------------------once in subfeat---------------------------------*/
/**
* @name SubfeatureEvidence
*
* Compare a feature to a prototype. Print the result.
*/
FLOAT32 SubfeatureEvidence(FEATURE Feature, PROTO Proto) {
float Distance;
float Dangle;
Dangle = Proto->Angle - Feature->Params[PicoFeatDir];
if (Dangle < -0.5) Dangle += 1.0;
if (Dangle > 0.5) Dangle -= 1.0;
Dangle *= training_angle_match_scale;
Distance = Proto->A * Feature->Params[PicoFeatX] +
Proto->B * Feature->Params[PicoFeatY] +
Proto->C;
return (EvidenceOf (Distance * Distance + Dangle * Dangle));
}
/**
* @name EvidenceOf
*
* Return the new type of evidence number corresponding to this
* distance value. This number is no longer based on the chi squared
* approximation. The equation that represents the transform is:
* 1 / (1 + (sim / midpoint) ^ curl)
*/
double EvidenceOf (double Similarity) {
Similarity /= training_similarity_midpoint;
if (training_similarity_curl == 3)
Similarity = Similarity * Similarity * Similarity;
else if (training_similarity_curl == 2)
Similarity = Similarity * Similarity;
else
Similarity = pow (Similarity, training_similarity_curl);
return (1.0 / (1.0 + Similarity));
}
/**
* This routine returns TRUE if Feature would be matched
* by a fast match table built from Proto.
*
* @param Feature feature to be "fast matched" to proto
* @param Proto proto being "fast matched" against
*
* Globals:
* - training_tangent_bbox_pad bounding box pad tangent to proto
* - training_orthogonal_bbox_pad bounding box pad orthogonal to proto
*
* @return TRUE if feature could match Proto.
* @note Exceptions: none
* @note History: Wed Nov 14 17:19:58 1990, DSJ, Created.
*/
BOOL8 DummyFastMatch (
FEATURE Feature,
PROTO Proto)
{
FRECT BoundingBox;
FLOAT32 MaxAngleError;
FLOAT32 AngleError;
MaxAngleError = training_angle_pad / 360.0;
AngleError = fabs (Proto->Angle - Feature->Params[PicoFeatDir]);
if (AngleError > 0.5)
AngleError = 1.0 - AngleError;
if (AngleError > MaxAngleError)
return (FALSE);
ComputePaddedBoundingBox (Proto,
training_tangent_bbox_pad * GetPicoFeatureLength (),
training_orthogonal_bbox_pad * GetPicoFeatureLength (),
&BoundingBox);
return PointInside(&BoundingBox, Feature->Params[PicoFeatX],
Feature->Params[PicoFeatY]);
} /* DummyFastMatch */
/**
* This routine computes a bounding box that encloses the
* specified proto along with some padding. The
* amount of padding is specified as separate distances
* in the tangential and orthogonal directions.
*
* @param Proto proto to compute bounding box for
* @param TangentPad amount of pad to add in direction of segment
* @param OrthogonalPad amount of pad to add orthogonal to segment
* @param[out] BoundingBox place to put results
*
* Globals: none
*
* @return none (results are returned in BoundingBox)
* @note Exceptions: none
* @note History: Wed Nov 14 14:55:30 1990, DSJ, Created.
*/
void ComputePaddedBoundingBox (PROTO Proto, FLOAT32 TangentPad,
FLOAT32 OrthogonalPad, FRECT *BoundingBox) {
FLOAT32 Pad, Length, Angle;
FLOAT32 CosOfAngle, SinOfAngle;
Length = Proto->Length / 2.0 + TangentPad;
Angle = Proto->Angle * 2.0 * PI;
CosOfAngle = fabs(cos(Angle));
SinOfAngle = fabs(sin(Angle));
Pad = MAX (CosOfAngle * Length, SinOfAngle * OrthogonalPad);
BoundingBox->MinX = Proto->X - Pad;
BoundingBox->MaxX = Proto->X + Pad;
Pad = MAX(SinOfAngle * Length, CosOfAngle * OrthogonalPad);
BoundingBox->MinY = Proto->Y - Pad;
BoundingBox->MaxY = Proto->Y + Pad;
} /* ComputePaddedBoundingBox */
/**
* Return TRUE if point (X,Y) is inside of Rectangle.
*
* Globals: none
*
* @return TRUE if point (X,Y) is inside of Rectangle.
* @note Exceptions: none
* @note History: Wed Nov 14 17:26:35 1990, DSJ, Created.
*/
BOOL8 PointInside(FRECT *Rectangle, FLOAT32 X, FLOAT32 Y) {
if (X < Rectangle->MinX) return (FALSE);
if (X > Rectangle->MaxX) return (FALSE);
if (Y < Rectangle->MinY) return (FALSE);
if (Y > Rectangle->MaxY) return (FALSE);
return (TRUE);
} /* PointInside */
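To make the EvidenceOf transform above easier to picture, here is a tiny standalone sketch (not repository code) that plugs a few squared-distance values into 1 / (1 + (sim / midpoint) ^ curl), using the defaults from the double_VAR declarations above (midpoint 0.0075, curl 2).
#include <cmath>
#include <cstdio>
// Mirrors EvidenceOf(): evidence is 1 for a perfect match and decays toward 0
// as the squared distance grows past the midpoint.
static double Evidence(double similarity, double midpoint = 0.0075,
                       double curl = 2.0) {
  return 1.0 / (1.0 + std::pow(similarity / midpoint, curl));
}
int main() {
  std::printf("%.4f %.4f %.4f\n",
              Evidence(0.0), Evidence(0.0075), Evidence(0.03));
  // Prints roughly: 1.0000 0.5000 0.0588
  return 0;
}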
/******************************************************************************
** Filename: MergeNF.c
** Purpose: Program for merging similar nano-feature protos
** Author: Dan Johnson
** History: Wed Nov 21 09:55:23 1990, DSJ, Created.
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
******************************************************************************/
#include "mergenf.h"
#include "host.h"
#include "efio.h"
#include "clusttool.h"
#include "cluster.h"
#include "oldlist.h"
#include "protos.h"
#include "ndminx.h"
#include "ocrfeatures.h"
#include "const.h"
#include "featdefs.h"
#include "intproto.h"
#include "params.h"
#include <stdio.h>
#include <string.h>
#include <math.h>
/*-------------------once in subfeat---------------------------------*/
double_VAR(training_angle_match_scale, 1.0, "Angle Match Scale ...");
double_VAR(training_similarity_midpoint, 0.0075, "Similarity Midpoint ...");
double_VAR(training_similarity_curl, 2.0, "Similarity Curl ...");
/*-----------------------------once in fasttrain----------------------------------*/
double_VAR(training_tangent_bbox_pad, 0.5, "Tangent bounding box pad ...");
double_VAR(training_orthogonal_bbox_pad, 2.5, "Orthogonal bounding box pad ...");
double_VAR(training_angle_pad, 45.0, "Angle pad ...");
/**
* Compare protos p1 and p2 and return an estimate of the
* worst evidence rating that will result for any part of p1
* that is compared to p2. In other words, if p1 were broken
* into pico-features and each pico-feature was matched to p2,
* what is the worst evidence rating that will be achieved for
* any pico-feature.
*
* @param p1, p2 protos to be compared
*
* Globals: none
*
* @return Worst possible result when matching p1 to p2.
* @note Exceptions: none
* @note History: Mon Nov 26 08:27:53 1990, DSJ, Created.
*/
FLOAT32 CompareProtos(PROTO p1, PROTO p2) {
FEATURE Feature;
FLOAT32 WorstEvidence = WORST_EVIDENCE;
FLOAT32 Evidence;
FLOAT32 Angle, Length;
/* if p1 and p2 are not close in length, don't let them match */
Length = fabs (p1->Length - p2->Length);
if (Length > MAX_LENGTH_MISMATCH)
return (0.0);
/* create a dummy pico-feature to be used for comparisons */
Feature = NewFeature (&PicoFeatDesc);
Feature->Params[PicoFeatDir] = p1->Angle;
/* convert angle to radians */
Angle = p1->Angle * 2.0 * PI;
/* find distance from center of p1 to 1/2 picofeat from end */
Length = p1->Length / 2.0 - GetPicoFeatureLength () / 2.0;
if (Length < 0) Length = 0;
/* set the dummy pico-feature at one end of p1 and match it to p2 */
Feature->Params[PicoFeatX] = p1->X + cos (Angle) * Length;
Feature->Params[PicoFeatY] = p1->Y + sin (Angle) * Length;
if (DummyFastMatch (Feature, p2)) {
Evidence = SubfeatureEvidence (Feature, p2);
if (Evidence < WorstEvidence)
WorstEvidence = Evidence;
} else {
FreeFeature(Feature);
return 0.0;
}
/* set the dummy pico-feature at the other end of p1 and match it to p2 */
Feature->Params[PicoFeatX] = p1->X - cos (Angle) * Length;
Feature->Params[PicoFeatY] = p1->Y - sin (Angle) * Length;
if (DummyFastMatch (Feature, p2)) {
Evidence = SubfeatureEvidence (Feature, p2);
if (Evidence < WorstEvidence)
WorstEvidence = Evidence;
} else {
FreeFeature(Feature);
return 0.0;
}
FreeFeature (Feature);
return (WorstEvidence);
} /* CompareProtos */
/**
* This routine computes a proto which is the weighted
* average of protos p1 and p2. The new proto is returned
* in MergedProto.
*
* @param p1, p2 protos to be merged
* @param w1, w2 weight of each proto
* @param MergedProto place to put resulting merged proto
*
* Globals: none
*
* @return none (results are returned in MergedProto)
* @note Exceptions: none
* @note History: Mon Nov 26 08:15:08 1990, DSJ, Created.
*/
void ComputeMergedProto (PROTO p1,
PROTO p2,
FLOAT32 w1,
FLOAT32 w2,
PROTO MergedProto) {
FLOAT32 TotalWeight;
TotalWeight = w1 + w2;
w1 /= TotalWeight;
w2 /= TotalWeight;
MergedProto->X = p1->X * w1 + p2->X * w2;
MergedProto->Y = p1->Y * w1 + p2->Y * w2;
MergedProto->Length = p1->Length * w1 + p2->Length * w2;
MergedProto->Angle = p1->Angle * w1 + p2->Angle * w2;
FillABC(MergedProto);
} /* ComputeMergedProto */
/**
* This routine searches through all of the prototypes in
* Class and returns the id of the proto which would provide
* the best approximation of Prototype. If no close
* approximation can be found, NO_PROTO is returned.
*
* @param Class class to search for matching old proto in
* @param NumMerged # of protos merged into each proto of Class
* @param Prototype new proto to find match for
*
* Globals: none
*
* @return Id of closest proto in Class or NO_PROTO.
* @note Exceptions: none
* @note History: Sat Nov 24 11:42:58 1990, DSJ, Created.
*/
int FindClosestExistingProto(CLASS_TYPE Class, int NumMerged[],
PROTOTYPE *Prototype) {
PROTO_STRUCT NewProto;
PROTO_STRUCT MergedProto;
int Pid;
PROTO Proto;
int BestProto;
FLOAT32 BestMatch;
FLOAT32 Match, OldMatch, NewMatch;
MakeNewFromOld (&NewProto, Prototype);
BestProto = NO_PROTO;
BestMatch = WORST_MATCH_ALLOWED;
for (Pid = 0; Pid < Class->NumProtos; Pid++) {
Proto = ProtoIn(Class, Pid);
ComputeMergedProto(Proto, &NewProto,
(FLOAT32) NumMerged[Pid], 1.0, &MergedProto);
OldMatch = CompareProtos(Proto, &MergedProto);
NewMatch = CompareProtos(&NewProto, &MergedProto);
Match = MIN(OldMatch, NewMatch);
if (Match > BestMatch) {
BestProto = Pid;
BestMatch = Match;
}
}
return BestProto;
} /* FindClosestExistingProto */
/**
* This fills in the fields of the New proto based on the
* fields of the Old proto.
*
* @param New new proto to be filled in
* @param Old old proto to be converted
*
* Globals: none
*
* Exceptions: none
* History: Mon Nov 26 09:45:39 1990, DSJ, Created.
*/
void MakeNewFromOld(PROTO New, PROTOTYPE *Old) {
New->X = CenterX(Old->Mean);
New->Y = CenterY(Old->Mean);
New->Length = LengthOf(Old->Mean);
New->Angle = OrientationOf(Old->Mean);
FillABC(New);
} /* MakeNewFromOld */
/*-------------------once in subfeat---------------------------------*/
/**
* @name SubfeatureEvidence
*
* Compare a feature to a prototype. Print the result.
*/
FLOAT32 SubfeatureEvidence(FEATURE Feature, PROTO Proto) {
float Distance;
float Dangle;
Dangle = Proto->Angle - Feature->Params[PicoFeatDir];
if (Dangle < -0.5) Dangle += 1.0;
if (Dangle > 0.5) Dangle -= 1.0;
Dangle *= training_angle_match_scale;
Distance = Proto->A * Feature->Params[PicoFeatX] +
Proto->B * Feature->Params[PicoFeatY] +
Proto->C;
return (EvidenceOf (Distance * Distance + Dangle * Dangle));
}
/**
* @name EvidenceOf
*
* Return the new type of evidence number corresponding to this
* distance value. This number is no longer based on the chi squared
* approximation. The equation that represents the transform is:
* 1 / (1 + (sim / midpoint) ^ curl)
*/
double EvidenceOf (double Similarity) {
Similarity /= training_similarity_midpoint;
if (training_similarity_curl == 3)
Similarity = Similarity * Similarity * Similarity;
else if (training_similarity_curl == 2)
Similarity = Similarity * Similarity;
else
Similarity = pow (Similarity, training_similarity_curl);
return (1.0 / (1.0 + Similarity));
}
/**
* This routine returns TRUE if Feature would be matched
* by a fast match table built from Proto.
*
* @param Feature feature to be "fast matched" to proto
* @param Proto proto being "fast matched" against
*
* Globals:
* - training_tangent_bbox_pad bounding box pad tangent to proto
* - training_orthogonal_bbox_pad bounding box pad orthogonal to proto
*
* @return TRUE if feature could match Proto.
* @note Exceptions: none
* @note History: Wed Nov 14 17:19:58 1990, DSJ, Created.
*/
BOOL8 DummyFastMatch (
FEATURE Feature,
PROTO Proto)
{
FRECT BoundingBox;
FLOAT32 MaxAngleError;
FLOAT32 AngleError;
MaxAngleError = training_angle_pad / 360.0;
AngleError = fabs (Proto->Angle - Feature->Params[PicoFeatDir]);
if (AngleError > 0.5)
AngleError = 1.0 - AngleError;
if (AngleError > MaxAngleError)
return (FALSE);
ComputePaddedBoundingBox (Proto,
training_tangent_bbox_pad * GetPicoFeatureLength (),
training_orthogonal_bbox_pad * GetPicoFeatureLength (),
&BoundingBox);
return PointInside(&BoundingBox, Feature->Params[PicoFeatX],
Feature->Params[PicoFeatY]);
} /* DummyFastMatch */
/**
* This routine computes a bounding box that encloses the
* specified proto along with some padding. The
* amount of padding is specified as separate distances
* in the tangential and orthogonal directions.
*
* @param Proto proto to compute bounding box for
* @param TangentPad amount of pad to add in direction of segment
* @param OrthogonalPad amount of pad to add orthogonal to segment
* @param[out] BoundingBox place to put results
*
* Globals: none
*
* @return none (results are returned in BoundingBox)
* @note Exceptions: none
* @note History: Wed Nov 14 14:55:30 1990, DSJ, Created.
*/
void ComputePaddedBoundingBox (PROTO Proto, FLOAT32 TangentPad,
FLOAT32 OrthogonalPad, FRECT *BoundingBox) {
FLOAT32 Pad, Length, Angle;
FLOAT32 CosOfAngle, SinOfAngle;
Length = Proto->Length / 2.0 + TangentPad;
Angle = Proto->Angle * 2.0 * PI;
CosOfAngle = fabs(cos(Angle));
SinOfAngle = fabs(sin(Angle));
Pad = MAX (CosOfAngle * Length, SinOfAngle * OrthogonalPad);
BoundingBox->MinX = Proto->X - Pad;
BoundingBox->MaxX = Proto->X + Pad;
Pad = MAX(SinOfAngle * Length, CosOfAngle * OrthogonalPad);
BoundingBox->MinY = Proto->Y - Pad;
BoundingBox->MaxY = Proto->Y + Pad;
} /* ComputePaddedBoundingBox */
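A small hypothetical example of the padding computed above (all numbers made up):
  PROTO_STRUCT P;
  P.X = 0.5;  P.Y = 0.25;  P.Length = 0.2;  P.Angle = 0.0;  /* horizontal segment */
  FRECT Box;
  ComputePaddedBoundingBox(&P, 0.05, 0.02, &Box);
  /* With Angle == 0, cos == 1 and sin == 0, so the box spans half the length plus
     the tangential pad on each side in x, and only the orthogonal pad in y:
     Box.MinX == 0.35, Box.MaxX == 0.65, Box.MinY == 0.23, Box.MaxY == 0.27. */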
/**
* Return TRUE if point (X,Y) is inside of Rectangle.
*
* Globals: none
*
* @return TRUE if point (X,Y) is inside of Rectangle.
* @note Exceptions: none
* @note History: Wed Nov 14 17:26:35 1990, DSJ, Created.
*/
BOOL8 PointInside(FRECT *Rectangle, FLOAT32 X, FLOAT32 Y) {
if (X < Rectangle->MinX) return (FALSE);
if (X > Rectangle->MaxX) return (FALSE);
if (Y < Rectangle->MinY) return (FALSE);
if (Y > Rectangle->MaxY) return (FALSE);
return (TRUE);
} /* PointInside */

View File

@@ -1,103 +1,103 @@
/******************************************************************************
** Filename: MergeNF.c
** Purpose: Program for merging similar nano-feature protos
** Author: Dan Johnson
** History: Wed Nov 21 09:55:23 1990, DSJ, Created.
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
******************************************************************************/
#ifndef TESSERACT_TRAINING_MERGENF_H_
#define TESSERACT_TRAINING_MERGENF_H_
/**----------------------------------------------------------------------------
Include Files and Type Defines
----------------------------------------------------------------------------**/
#include "protos.h"
#include "cluster.h"
#include "ocrfeatures.h"
#include "callcpp.h"
#include "picofeat.h"
#define WORST_MATCH_ALLOWED (0.9)
#define WORST_EVIDENCE (1.0)
#define MAX_LENGTH_MISMATCH (2.0 * GetPicoFeatureLength ())
#define PROTO_SUFFIX ".mf.p"
#define CONFIG_SUFFIX ".cl"
#define NO_PROTO (-1)
#define XPOSITION 0
#define YPOSITION 1
#define MFLENGTH 2
#define ORIENTATION 3
typedef struct
{
FLOAT32 MinX, MaxX, MinY, MaxY;
} FRECT;
/**----------------------------------------------------------------------------
Public Macros
----------------------------------------------------------------------------**/
#define CenterX(M) ( (M)[XPOSITION] )
#define CenterY(M) ( (M)[YPOSITION] )
#define LengthOf(M) ( (M)[MFLENGTH] )
#define OrientationOf(M) ( (M)[ORIENTATION] )
/**----------------------------------------------------------------------------
Public Function Prototypes
----------------------------------------------------------------------------**/
FLOAT32 CompareProtos (
PROTO p1,
PROTO p2);
void ComputeMergedProto (
PROTO p1,
PROTO p2,
FLOAT32 w1,
FLOAT32 w2,
PROTO MergedProto);
int FindClosestExistingProto (
CLASS_TYPE Class,
int NumMerged[],
PROTOTYPE *Prototype);
void MakeNewFromOld (
PROTO New,
PROTOTYPE *Old);
FLOAT32 SubfeatureEvidence (
FEATURE Feature,
PROTO Proto);
double EvidenceOf (
register double Similarity);
BOOL8 DummyFastMatch (
FEATURE Feature,
PROTO Proto);
void ComputePaddedBoundingBox (
PROTO Proto,
FLOAT32 TangentPad,
FLOAT32 OrthogonalPad,
FRECT *BoundingBox);
BOOL8 PointInside (
FRECT *Rectangle,
FLOAT32 X,
FLOAT32 Y);
#endif // TESSERACT_TRAINING_MERGENF_H_
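A hypothetical sketch (not part of the original sources) of how these declarations fit together when merging a clustered prototype into an existing class, mirroring the code above; the helper name and the copy-back step are assumptions:
  #include "mergenf.h"

  void MergeIntoClass(CLASS_TYPE Class, int NumMerged[], PROTOTYPE *Prototype) {
    int Pid = FindClosestExistingProto(Class, NumMerged, Prototype);
    if (Pid == NO_PROTO) return;  /* caller would install a brand-new proto instead */
    PROTO_STRUCT New, Merged;
    MakeNewFromOld(&New, Prototype);
    ComputeMergedProto(ProtoIn(Class, Pid), &New,
                       (FLOAT32) NumMerged[Pid], 1.0, &Merged);
    *ProtoIn(Class, Pid) = Merged;  /* overwrite the existing proto with the merge */
    NumMerged[Pid]++;
  }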

training/tesstrain.sh → src/training/tesstrain.sh Executable file → Normal file
View File

View File

View File

@@ -1,23 +1,23 @@
/**********************************************************************
* File: tlog.cpp
* Description: Variant of printf with logging level controllable by a
* commandline flag.
* Author: Ranjith Unnikrishnan
* Created: Wed Nov 20 2013
*
* (C) Copyright 2013, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "tlog.h"
INT_PARAM_FLAG(tlog_level, 0, "Minimum logging level for tlog() output");

View File

@@ -1,41 +1,41 @@
/**********************************************************************
* File: tlog.h
* Description: Variant of printf with logging level controllable by a
* commandline flag.
* Author: Ranjith Unnikrishnan
* Created: Wed Nov 20 2013
*
* (C) Copyright 2013, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_TRAINING_TLOG_H_
#define TESSERACT_TRAINING_TLOG_H_
#include "commandlineflags.h"
#include "errcode.h"
#include "tprintf.h"
DECLARE_INT_PARAM_FLAG(tlog_level);
// Variant guarded by the numeric logging level parameter FLAGS_tlog_level
// (default 0). Code using ParseCommandLineFlags() can control its value using
// the --tlog_level commandline argument. Otherwise it must be specified in a
// config file like other params.
#define tlog(level, ...) { \
if (FLAGS_tlog_level >= level) { \
tprintf_internal(__VA_ARGS__); \
} \
}
#define TLOG_IS_ON(level) (FLAGS_tlog_level >= level)
#endif // TESSERACT_TRAINING_TLOG_H_
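A minimal usage sketch (hypothetical tool; assumes ParseCommandLineFlags() from commandlineflags.h with its usual (usage, &argc, &argv, remove_flags) signature):
  #include "commandlineflags.h"
  #include "tlog.h"

  int main(int argc, char** argv) {
    tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
    tlog(1, "Loaded %d training samples\n", 42);  // printed only when --tlog_level >= 1
    if (TLOG_IS_ON(2)) {
      tlog(2, "Extra per-sample detail\n");
    }
    return 0;
  }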

View File

@@ -1,35 +1,35 @@
#ifndef TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
#define TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
#include "validator.h"
namespace tesseract {
// Subclass of Validator that validates and segments generic unicode into
// grapheme clusters, including Latin with diacritics.
class ValidateGrapheme : public Validator {
public:
ValidateGrapheme(ViramaScript script, bool report_errors)
: Validator(script, report_errors) {}
~ValidateGrapheme() {}
protected:
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
// parts_ and output_. Returns true if a valid Grapheme was consumed,
// otherwise does not increment codes_used_.
bool ConsumeGraphemeIfValid() override;
// Returns the CharClass corresponding to the given Unicode ch.
CharClass UnicodeToCharClass(char32 ch) const override;
private:
// Helper returns true if the sequence prev_ch,ch is invalid.
bool IsBadlyFormed(char32 prev_ch, char32 ch);
// Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel.
static bool IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch);
// Helper returns true if the sequence prev_ch,ch is invalid Thai.
static bool IsBadlyFormedThai(char32 prev_ch, char32 ch);
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_

View File

@@ -1,44 +1,44 @@
#ifndef TESSERACT_TRAINING_VALIDATE_INDIC_H_
#define TESSERACT_TRAINING_VALIDATE_INDIC_H_
#include "validator.h"
namespace tesseract {
// Subclass of Validator that validates and segments Indic scripts in the
// unicode range 0x900-0xdff (Devanagari-Sinhala).
class ValidateIndic : public Validator {
public:
ValidateIndic(ViramaScript script, bool report_errors)
: Validator(script, report_errors) {}
~ValidateIndic() {}
protected:
// Returns whether codes matches the pattern for an Indic Grapheme.
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
// parts_ and output_. Returns true if a valid Grapheme was consumed,
// otherwise does not increment codes_used_.
bool ConsumeGraphemeIfValid() override;
// Returns the CharClass corresponding to the given Unicode ch.
Validator::CharClass UnicodeToCharClass(char32 ch) const override;
private:
// Helper consumes/copies a virama and any associated post-virama joiners.
bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra);
// Helper consumes/copies a series of consonants separated by viramas while
// valid, but not any vowel or other modifiers.
bool ConsumeConsonantHeadIfValid();
// Helper consumes/copies a tail part of a consonant, comprising optional
// matra/piece, vowel modifier, vedic mark, terminating virama.
bool ConsumeConsonantTailIfValid();
// Helper consumes/copies a vowel and optional modifiers.
bool ConsumeVowelIfValid();
// Some special unicodes used only for Indic processing.
static const char32 kYayana = 0xdba; // Sinhala Ya
static const char32 kRayana = 0xdbb; // Sinhala Ra
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_INDIC_H_

View File

@@ -1,106 +1,106 @@
#include "validate_khmer.h"
#include "errcode.h"
#include "tprintf.h"
namespace tesseract {
// Returns whether codes matches the pattern for a Khmer Grapheme.
// Taken from unicode standard:
// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf.
// where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation
// to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf.
// Translated to the codes used by the CharClass enum:
// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}
// Where R is a new symbol (Robat) and N is repurposed as a consonant shifter.
// Also the Consonant class here includes independent vowels, as they are
// treated the same anyway.
// In the split grapheme mode, the only characters that get grouped are the
// HC and the {Z|z}M. The unicode chapter on Khmer only mentions the joiners in
// the BNF syntax, so who knows what they do.
bool ValidateKhmer::ConsumeGraphemeIfValid() {
int num_codes = codes_.size();
if (codes_used_ == num_codes) return false;
if (codes_[codes_used_].first == CharClass::kOther) {
UseMultiCode(1);
return true;
}
if (codes_[codes_used_].first != CharClass::kConsonant) {
if (report_errors_) {
tprintf("Invalid start of Khmer syllable:0x%x\n",
codes_[codes_used_].second);
}
return false;
}
if (UseMultiCode(1)) return true;
if (codes_[codes_used_].first == CharClass::kRobat ||
codes_[codes_used_].first == CharClass::kNukta) {
if (UseMultiCode(1)) return true;
}
while (codes_used_ + 1 < num_codes &&
codes_[codes_used_].first == CharClass::kVirama &&
codes_[codes_used_ + 1].first == CharClass::kConsonant) {
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true;
if (codes_[codes_used_].first == CharClass::kRobat) {
if (UseMultiCode(1)) return true;
}
}
int num_matra_parts = 0;
if (codes_[codes_used_].second == kZeroWidthJoiner ||
codes_[codes_used_].second == kZeroWidthNonJoiner) {
if (CodeOnlyToOutput()) {
if (report_errors_) {
tprintf("Unterminated joiner: 0x%x\n", output_.back());
}
return false;
}
++num_matra_parts;
}
// Not quite as shown by the BNF, the matra piece is allowed as a matra on its
// own or as an addition to other matras.
if (codes_[codes_used_].first == CharClass::kMatra ||
codes_[codes_used_].first == CharClass::kMatraPiece) {
++num_matra_parts;
if (UseMultiCode(num_matra_parts)) return true;
} else if (num_matra_parts) {
if (report_errors_) {
tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
output_.back(), codes_[codes_used_].second);
}
return false;
}
if (codes_[codes_used_].first == CharClass::kMatraPiece &&
codes_[codes_used_ - 1].first != CharClass::kMatraPiece) {
if (UseMultiCode(1)) return true;
}
if (codes_[codes_used_].first == CharClass::kVowelModifier) {
if (UseMultiCode(1)) return true;
}
if (codes_used_ + 1 < num_codes &&
codes_[codes_used_].first == CharClass::kVirama &&
codes_[codes_used_ + 1].first == CharClass::kConsonant) {
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true;
}
return true;
}
Validator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const {
if (IsVedicAccent(ch)) return CharClass::kVedicMark;
if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
// Offset from the start of the relevant unicode code block aka code page.
int off = ch - static_cast<char32>(script_);
// Anything in another code block is other.
if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther;
if (off <= 0x33) return CharClass::kConsonant;
if (off <= 0x45) return CharClass::kMatra;
if (off == 0x46) return CharClass::kMatraPiece;
if (off == 0x4c) return CharClass::kRobat;
if (off == 0x49 || off == 0x4a) return CharClass::kNukta;
if (off <= 0x51) return CharClass::kVowelModifier;
if (off == 0x52) return CharClass::kVirama;
return CharClass::kOther;
}
} // namespace tesseract
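An illustrative trace of the classes assigned by UnicodeToCharClass above (codepoints chosen for the example):
  // U+1780 KHMER LETTER KA      -> kConsonant (C)
  // U+17D2 KHMER SIGN COENG     -> kVirama    (H)
  // U+1780 KHMER LETTER KA      -> kConsonant (C)
  // U+17B6 KHMER VOWEL SIGN AA  -> kMatra     (M)
  // The class sequence C H C M matches the pattern in the comment above, so
  // ConsumeGraphemeIfValid() consumes all four codes as a single grapheme.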
#include "validate_khmer.h"
#include "errcode.h"
#include "tprintf.h"
namespace tesseract {
// Returns whether codes matches the pattern for a Khmer Grapheme.
// Taken from unicode standard:
// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf.
// where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation
// to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf.
// Translated to the codes used by the CharClass enum:
// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}
// Where R is a new symbol (Robat) and N is repurposed as a consonant shifter.
// Also the Consonant class here includes independent vowels, as they are
// treated the same anyway.
// In the split grapheme mode, the only characters that get grouped are the
// HC and the {Z|z}M The unicode chapter on Khmer only mentions the joiners in
// the BNF syntax, so who knows what they do.
bool ValidateKhmer::ConsumeGraphemeIfValid() {
int num_codes = codes_.size();
if (codes_used_ == num_codes) return false;
if (codes_[codes_used_].first == CharClass::kOther) {
UseMultiCode(1);
return true;
}
if (codes_[codes_used_].first != CharClass::kConsonant) {
if (report_errors_) {
tprintf("Invalid start of Khmer syllable:0x%x\n",
codes_[codes_used_].second);
}
return false;
}
if (UseMultiCode(1)) return true;
if (codes_[codes_used_].first == CharClass::kRobat ||
codes_[codes_used_].first == CharClass::kNukta) {
if (UseMultiCode(1)) return true;
}
while (codes_used_ + 1 < num_codes &&
codes_[codes_used_].first == CharClass::kVirama &&
codes_[codes_used_ + 1].first == CharClass::kConsonant) {
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true;
if (codes_[codes_used_].first == CharClass::kRobat) {
if (UseMultiCode(1)) return true;
}
}
int num_matra_parts = 0;
if (codes_[codes_used_].second == kZeroWidthJoiner ||
codes_[codes_used_].second == kZeroWidthNonJoiner) {
if (CodeOnlyToOutput()) {
if (report_errors_) {
tprintf("Unterminated joiner: 0x%x\n", output_.back());
}
return false;
}
++num_matra_parts;
}
// Not quite as shown by the BNF, the matra piece is allowed as a matra on its
// own or as an addition to other matras.
if (codes_[codes_used_].first == CharClass::kMatra ||
codes_[codes_used_].first == CharClass::kMatraPiece) {
++num_matra_parts;
if (UseMultiCode(num_matra_parts)) return true;
} else if (num_matra_parts) {
if (report_errors_) {
tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
output_.back(), codes_[codes_used_].second);
}
return false;
}
if (codes_[codes_used_].first == CharClass::kMatraPiece &&
codes_[codes_used_ - 1].first != CharClass::kMatraPiece) {
if (UseMultiCode(1)) return true;
}
if (codes_[codes_used_].first == CharClass::kVowelModifier) {
if (UseMultiCode(1)) return true;
}
if (codes_used_ + 1 < num_codes &&
codes_[codes_used_].first == CharClass::kVirama &&
codes_[codes_used_ + 1].first == CharClass::kConsonant) {
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true;
}
return true;
}
Validator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const {
if (IsVedicAccent(ch)) return CharClass::kVedicMark;
if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
// Offset from the start of the relevant unicode code block aka code page.
int off = ch - static_cast<char32>(script_);
// Anything in another code block is other.
if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther;
if (off <= 0x33) return CharClass::kConsonant;
if (off <= 0x45) return CharClass::kMatra;
if (off == 0x46) return CharClass::kMatraPiece;
if (off == 0x4c) return CharClass::kRobat;
if (off == 0x49 || off == 0x4a) return CharClass::kNukta;
if (off <= 0x51) return CharClass::kVowelModifier;
if (off == 0x52) return CharClass::kVirama;
return CharClass::kOther;
}
} // namespace tesseract

View File

@@ -1,27 +1,27 @@
#ifndef TESSERACT_TRAINING_VALIDATE_KHMER_H_
#define TESSERACT_TRAINING_VALIDATE_KHMER_H_
#include "validator.h"
namespace tesseract {
// Subclass of Validator that validates and segments Khmer.
class ValidateKhmer : public Validator {
public:
ValidateKhmer(ViramaScript script, bool report_errors)
: Validator(script, report_errors) {}
~ValidateKhmer() {}
protected:
// Returns whether codes matches the pattern for a Khmer Grapheme.
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
// parts_ and output_. Returns true if a valid Grapheme was consumed,
// otherwise does not increment codes_used_.
bool ConsumeGraphemeIfValid() override;
// Returns the CharClass corresponding to the given Unicode ch.
CharClass UnicodeToCharClass(char32 ch) const override;
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_KHMER_H_

View File

@@ -1,160 +1,160 @@
#include "validate_myanmar.h"
#include "errcode.h"
#include "icuerrorcode.h"
#include "tprintf.h"
#include "unicode/uchar.h" // From libicu
#include "unicode/uscript.h" // From libicu
namespace tesseract {
// Returns whether codes matches the pattern for a Myanmar Grapheme.
// Taken directly from the unicode table 16-3.
// See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
bool ValidateMyanmar::ConsumeGraphemeIfValid() {
int num_codes = codes_.size();
if (codes_used_ == num_codes) return true;
// Other.
if (IsMyanmarOther(codes_[codes_used_].second)) {
UseMultiCode(1);
return true;
}
// Kinzi.
if (codes_used_ + 2 < num_codes && codes_[codes_used_].second == 0x1004 &&
codes_[codes_used_ + 1].second == kMyanmarAsat &&
codes_[codes_used_ + 2].second == kMyanmarVirama) {
ASSERT_HOST(!CodeOnlyToOutput());
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(3)) return true;
}
// Base consonant/vowel. NOTE that since everything in Myanmar appears to be
// optional, except the base, this is the only place where invalid input can
// be detected and false returned.
if (IsMyanmarLetter(codes_[codes_used_].second)) {
if (UseMultiCode(1)) return true;
} else {
if (report_errors_) {
tprintf("Invalid start of Myanmar syllable:0x%x\n",
codes_[codes_used_].second);
}
return false; // One of these is required.
}
if (ConsumeSubscriptIfPresent()) return true;
ConsumeOptionalSignsIfPresent();
// What we have consumed so far is a valid syllable.
return true;
}
// TODO(rays) Doesn't use intermediate coding like the other scripts, as there
// is little correspondence between the content of table 16-3 and the char
// classes of the Indic languages. (Experts may disagree and improve!)
// In unicode table 16-3 there is basically a long list of optional characters,
// which can be coded quite easily.
// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!
// The table also allows sequences that still result in dotted circles!!
// So with a lot of guesswork the rest have been added in a reasonable place.
Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const {
if (IsMyanmarLetter(ch)) return CharClass::kConsonant;
return CharClass::kOther;
}
// Helper consumes/copies a virama and any subscript consonant.
// Returns true if the end of input is reached.
bool ValidateMyanmar::ConsumeSubscriptIfPresent() {
// Subscript consonant. It appears there can be only one.
int num_codes = codes_.size();
if (codes_used_ + 1 < num_codes &&
codes_[codes_used_].second == kMyanmarVirama) {
if (IsMyanmarLetter(codes_[codes_used_ + 1].second)) {
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true;
}
}
return false;
}
// Helper consumes/copies a series of optional signs.
// Returns true if the end of input is reached.
bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
// The following characters are allowed, all optional, and in sequence.
// An exception is kMyanmarMedialYa, which can include kMyanmarAsat.
const std::vector<char32> kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c,
0x103d, 0x103e, 0x105e, 0x105f, 0x1060,
0x1081, 0x1031});
for (char32 ch : kMedials) {
if (codes_[codes_used_].second == ch) {
if (UseMultiCode(1)) return true;
if (ch == kMyanmarMedialYa &&
codes_[codes_used_].second == kMyanmarAsat) {
if (UseMultiCode(1)) return true;
}
}
}
// Vowel sign i, ii, ai.
char32 ch = codes_[codes_used_].second;
if (ch == 0x102d || ch == 0x102e || ch == 0x1032) {
if (UseMultiCode(1)) return true;
}
// Vowel sign u, uu, and extensions.
ch = codes_[codes_used_].second;
if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) ||
ch == 0x1062 || ch == 0x1067 || ch == 0x1068 ||
(0x1071 <= ch && ch <= 0x1074) || (0x1083 <= ch && ch <= 0x1086) ||
ch == 0x109c || ch == 0x109d) {
if (UseMultiCode(1)) return true;
}
// Tall aa, aa with optional asat.
if (codes_[codes_used_].second == 0x102b ||
codes_[codes_used_].second == 0x102c) {
if (UseMultiCode(1)) return true;
if (codes_[codes_used_].second == kMyanmarAsat) {
if (UseMultiCode(1)) return true;
}
}
// The following characters are allowed, all optional, and in sequence.
const std::vector<char32> kSigns({0x1036, 0x1037});
for (char32 ch : kSigns) {
if (codes_[codes_used_].second == ch) {
if (UseMultiCode(1)) return true;
}
}
// Tone mark extensions.
ch = codes_[codes_used_].second;
if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 ||
(0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) ||
ch == 0x108f || ch == 0x109a || ch == 0x109b ||
(0xaa7b <= ch && ch <= 0xaa7d)) {
if (UseMultiCode(1)) return true;
}
return false;
}
// Returns true if the unicode is a Myanmar "letter" including consonants
// and independent vowels. Although table 16-3 distinguishes between some
// base consonants and vowels, the extensions make no such distinction, so we
// put them all into a single bucket.
/* static */
bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
return (0x1000 <= ch && ch <= 0x102a) || ch == 0x103f ||
(0x1050 <= ch && ch <= 0x1055) || (0x105a <= ch && ch <= 0x105d) ||
ch == 0x1061 || ch == 0x1065 || ch == 0x1066 ||
(0x106e <= ch && ch <= 0x1070) || (0x1075 <= ch && ch <= 0x1080) ||
ch == 0x108e || (0xa9e0 <= ch && ch <= 0xa9ef) ||
(0xa9fa <= ch && ch <= 0xa9ff) || (0xaa60 <= ch && ch <= 0xaa73) ||
ch == 0xaa7a || ch == 0xaa7e || ch == 0xaa7f;
}
// Returns true if ch is a Myanmar digit or other symbol that does not take
// part in being a syllable.
/* static */
bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
IcuErrorCode err;
UScriptCode script_code = uscript_getScript(ch, err);
if (script_code != USCRIPT_MYANMAR && ch != Validator::kZeroWidthJoiner &&
ch != Validator::kZeroWidthNonJoiner)
return true;
return (0x1040 <= ch && ch <= 0x1049) || (0x1090 <= ch && ch <= 0x1099) ||
(0x109c <= ch && ch <= 0x109d) || (0xa9f0 <= ch && ch <= 0xa9f9) ||
(0xaa74 <= ch && ch <= 0xaa79);
}
} // namespace tesseract
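An illustrative trace through ConsumeGraphemeIfValid above (codepoints chosen for the example):
  // U+1000 MYANMAR LETTER KA                     -> base letter (IsMyanmarLetter)
  // U+1039 virama followed by U+1001 LETTER KHA  -> subscript pair, taken by
  //                                                 ConsumeSubscriptIfPresent()
  // U+102D MYANMAR VOWEL SIGN I                  -> taken by ConsumeOptionalSignsIfPresent()
  // so KA + VIRAMA + KHA + VOWEL SIGN I is accepted as one valid syllable.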
#include "validate_myanmar.h"
#include "errcode.h"
#include "icuerrorcode.h"
#include "tprintf.h"
#include "unicode/uchar.h" // From libicu
#include "unicode/uscript.h" // From libicu
namespace tesseract {
// Returns whether codes matches the pattern for a Myanmar Grapheme.
// Taken directly from the unicode table 16-3.
// See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
bool ValidateMyanmar::ConsumeGraphemeIfValid() {
int num_codes = codes_.size();
if (codes_used_ == num_codes) return true;
// Other.
if (IsMyanmarOther(codes_[codes_used_].second)) {
UseMultiCode(1);
return true;
}
// Kinzi.
if (codes_used_ + 2 < num_codes && codes_[codes_used_].second == 0x1004 &&
codes_[codes_used_ + 1].second == kMyanmarAsat &&
codes_[codes_used_ + 2].second == kMyanmarVirama) {
ASSERT_HOST(!CodeOnlyToOutput());
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(3)) return true;
}
// Base consonant/vowel. NOTE that since everything in Myanmar appears to be
// optional, except the base, this is the only place where invalid input can
// be detected and false returned.
if (IsMyanmarLetter(codes_[codes_used_].second)) {
if (UseMultiCode(1)) return true;
} else {
if (report_errors_) {
tprintf("Invalid start of Myanmar syllable:0x%x\n",
codes_[codes_used_].second);
}
return false; // One of these is required.
}
if (ConsumeSubscriptIfPresent()) return true;
ConsumeOptionalSignsIfPresent();
// What we have consumed so far is a valid syllable.
return true;
}
// TODO(rays) Doesn't use intermediate coding like the other scripts, as there
// is little correspondence between the content of table 16-3 and the char
// classes of the Indic languages. (Experts may disagree and improve!)
// In unicode table 16-3 there is basically a long list of optional characters,
// which can be coded quite easily.
// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!
// The table also allows sequences that still result in dotted circles!!
// So with a lot of guesswork the rest have been added in a reasonable place.
Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const {
if (IsMyanmarLetter(ch)) return CharClass::kConsonant;
return CharClass::kOther;
}
// Helper consumes/copies a virama and any subscript consonant.
// Returns true if the end of input is reached.
bool ValidateMyanmar::ConsumeSubscriptIfPresent() {
// Subscript consonant. It appears there can be only one.
int num_codes = codes_.size();
if (codes_used_ + 1 < num_codes &&
codes_[codes_used_].second == kMyanmarVirama) {
if (IsMyanmarLetter(codes_[codes_used_ + 1].second)) {
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true;
}
}
return false;
}
// Helper consumes/copies a series of optional signs.
// Returns true if the end of input is reached.
bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
// The following characters are allowed, all optional, and in sequence.
// An exception is kMyanmarMedialYa, which can include kMyanmarAsat.
const std::vector<char32> kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c,
0x103d, 0x103e, 0x105e, 0x105f, 0x1060,
0x1081, 0x1031});
for (char32 ch : kMedials) {
if (codes_[codes_used_].second == ch) {
if (UseMultiCode(1)) return true;
if (ch == kMyanmarMedialYa &&
codes_[codes_used_].second == kMyanmarAsat) {
if (UseMultiCode(1)) return true;
}
}
}
// Vowel sign i, ii, ai.
char32 ch = codes_[codes_used_].second;
if (ch == 0x102d || ch == 0x102e || ch == 0x1032) {
if (UseMultiCode(1)) return true;
}
// Vowel sign u, uu, and extensions.
ch = codes_[codes_used_].second;
if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) ||
ch == 0x1062 || ch == 0x1067 || ch == 0x1068 ||
(0x1071 <= ch && ch <= 0x1074) || (0x1083 <= ch && ch <= 0x1086) ||
ch == 0x109c || ch == 0x109d) {
if (UseMultiCode(1)) return true;
}
// Tall aa, aa with optional asat.
if (codes_[codes_used_].second == 0x102b ||
codes_[codes_used_].second == 0x102c) {
if (UseMultiCode(1)) return true;
if (codes_[codes_used_].second == kMyanmarAsat) {
if (UseMultiCode(1)) return true;
}
}
// The following characters are allowed, all optional, and in sequence.
const std::vector<char32> kSigns({0x1036, 0x1037});
for (char32 ch : kSigns) {
if (codes_[codes_used_].second == ch) {
if (UseMultiCode(1)) return true;
}
}
// Tone mark extensions.
ch = codes_[codes_used_].second;
if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 ||
(0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) ||
ch == 0x108f || ch == 0x109a || ch == 0x109b ||
(0xaa7b <= ch && ch <= 0xaa7d)) {
if (UseMultiCode(1)) return true;
}
return false;
}
// Returns true if the unicode is a Myanmar "letter" including consonants
// and independent vowels. Although table 16-3 distinguishes between some
// base consonants and vowels, the extensions make no such distinction, so we
// put them all into a single bucket.
/* static */
bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
return (0x1000 <= ch && ch <= 0x102a) || ch == 0x103f ||
(0x1050 <= ch && ch <= 0x1055) || (0x105a <= ch && ch <= 0x105d) ||
ch == 0x1061 || ch == 0x1065 || ch == 0x1066 ||
(0x106e <= ch && ch <= 0x1070) || (0x1075 <= ch && ch <= 0x1080) ||
ch == 0x108e || (0xa9e0 <= ch && ch <= 0xa9ef) ||
(0xa9fa <= ch && ch <= 0xa9ff) || (0xaa60 <= ch && ch <= 0xaa73) ||
ch == 0xaa7a || ch == 0xaa7e || ch == 0xaa7f;
}
// Returns true if ch is a Myanmar digit or other symbol that does not take
// part in being a syllable.
/* static */
bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
IcuErrorCode err;
UScriptCode script_code = uscript_getScript(ch, err);
if (script_code != USCRIPT_MYANMAR && ch != Validator::kZeroWidthJoiner &&
ch != Validator::kZeroWidthNonJoiner)
return true;
return (0x1040 <= ch && ch <= 0x1049) || (0x1090 <= ch && ch <= 0x1099) ||
(0x109c <= ch && ch <= 0x109d) || (0xa9f0 <= ch && ch <= 0xa9f9) ||
(0xaa74 <= ch && ch <= 0xaa79);
}
} // namespace tesseract

View File

@@ -1,47 +1,47 @@
#ifndef TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
#define TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
#include "validator.h"
namespace tesseract {
// Subclass of Validator that validates and segments Myanmar.
class ValidateMyanmar : public Validator {
public:
ValidateMyanmar(ViramaScript script, bool report_errors)
: Validator(script, report_errors) {}
~ValidateMyanmar() {}
protected:
// Returns whether codes matches the pattern for a Myanmar Grapheme.
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
// parts_ and output_. Returns true if a valid Grapheme was consumed,
// otherwise does not increment codes_used_.
bool ConsumeGraphemeIfValid() override;
// Returns the CharClass corresponding to the given Unicode ch.
Validator::CharClass UnicodeToCharClass(char32 ch) const override;
private:
// Helper consumes/copies a virama and any subscript consonant.
// Returns true if the end of input is reached.
bool ConsumeSubscriptIfPresent();
// Helper consumes/copies a series of optional signs.
// Returns true if the end of input is reached.
bool ConsumeOptionalSignsIfPresent();
// Returns true if the unicode is a Myanmar "letter" including consonants
// and independent vowels. Although table 16-3 distinguishes between some
// base consonants and vowels, the extensions make no such distinction, so we
// put them all into a single bucket.
static bool IsMyanmarLetter(char32 ch);
// Returns true if ch is a Myanmar digit or other symbol that does not take
// part in being a syllable.
static bool IsMyanmarOther(char32 ch);
// Some special unicodes used only for Myanmar processing.
static const char32 kMyanmarAsat = 0x103a;
static const char32 kMyanmarMedialYa = 0x103b;
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_MYANMAR_H_

View File

@@ -1,243 +1,243 @@
/**********************************************************************
* File: validator.h
* Description: Base class for various text validators. Intended mainly for
* scripts that use a virama character.
* Author: Ray Smith
* Created: Tue May 23 2017
*
* (C) Copyright 2017, Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_TRAINING_VALIDATOR_H_
#define TESSERACT_TRAINING_VALIDATOR_H_
#include <memory>
#include <vector>
#include "unichar.h"
namespace tesseract {
// Different kinds of grapheme normalization - not just for Indic!
// A grapheme is a syllable unit in Indic and can be several unicodes.
// In other scripts, a grapheme is a base character and accent/diacritic
// combination, as not all accented characters have a single composed form.
enum class GraphemeNormMode {
// Validation result is a single string, even if input is multi-word.
kSingleString,
// Standard unicode graphemes are validated and output as grapheme units.
kCombined,
// Graphemes are validated and sub-divided. For virama-using scripts, units
// that correspond to repeatable glyphs are generated. (Mostly single unicodes
// but viramas and joiners are paired with the most sensible neighbor.)
// For non-virama scripts, this means that base/accent pairs are separated,
// ie the output is individual unicodes.
kGlyphSplit,
// The output is always single unicodes, regardless of the script.
kIndividualUnicodes,
};
// An enum representing the scripts that use a virama character. It is
// guaranteed that the value of any element, (except kNonVirama) can be cast
// to a unicode (char32) value that represents the start of the unicode range
// of the corresponding script.
enum class ViramaScript : char32 {
kNonVirama = 0,
kDevanagari = 0x900,
kBengali = 0x980,
kGurmukhi = 0xa00,
kGujarati = 0xa80,
kOriya = 0xb00,
kTamil = 0xb80,
kTelugu = 0xc00,
kKannada = 0xc80,
kMalayalam = 0xd00,
kSinhala = 0xd80,
kMyanmar = 0x1000,
kKhmer = 0x1780,
};
// Base class offers a validation API and protected methods to allow subclasses
// to easily build the validated/segmented output.
class Validator {
public:
// Validates and cleans the src vector of unicodes to the *dest, according to
// g_mode. In the case of kSingleString, a single vector containing the whole
// result is added to *dest. With kCombined, multiple vectors are added to
// *dest with one grapheme in each. With kGlyphSplit, multiple vectors are
// added to *dest with a smaller unit representing a glyph in each.
// In case of validation error, returns false and as much as possible of the
// input, without discarding invalid text.
static bool ValidateCleanAndSegment(GraphemeNormMode g_mode,
bool report_errors,
const std::vector<char32>& src,
std::vector<std::vector<char32>>* dest);
// Returns true if the unicode ch is a non-printing zero-width mark of no
// significance to OCR training or evaluation.
static bool IsZeroWidthMark(char32 ch) {
return ch == kZeroWidthSpace || ch == kLeftToRightMark ||
ch == kRightToLeftMark || ch == kInvalid;
}
virtual ~Validator() {}
// Some specific but universally useful unicodes.
static const char32 kZeroWidthSpace;
static const char32 kZeroWidthNonJoiner;
static const char32 kZeroWidthJoiner;
static const char32 kLeftToRightMark;
static const char32 kRightToLeftMark;
static const char32 kInvalid;
protected:
// These are more or less the character class identifiers in the ISCII
// standard, section 8. They have been augmented with the Unicode meta
// characters Zero Width Joiner and Zero Width Non Joiner, and the
// Unicode Vedic Marks.
// The best sources of information on Unicode and Indic scripts are:
// http://varamozhi.sourceforge.net/iscii91.pdf
// http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
// http://unicode.org/faq/indic.html
// http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx
enum class CharClass {
// NOTE: The values of the enum members are meaningless and arbitrary, ie
// they are not used for sorting, or any other risky application.
// The reason they are what they are is they are a single character
// abbreviation that can be used in a regexp/BNF definition of a grammar,
// IN A COMMENT, and still not relied upon in the code.
kConsonant = 'C',
kVowel = 'V',
kVirama = 'H', // (aka Halant)
kMatra = 'M', // (aka Dependent Vowel)
kMatraPiece = 'P', // unicode provides pieces of Matras.
kVowelModifier = 'D', // (candrabindu, anusvara, visarga, other marks)
kZeroWidthNonJoiner = 'z', // Unicode Zero Width Non-Joiner U+200C
kZeroWidthJoiner = 'Z', // Unicode Zero Width Joiner U+200D
kVedicMark = 'v', // Modifiers that can modify any Indic syllable.
kNukta = 'N', // Occurs only immediately after consonants.
kRobat = 'R', // Khmer only.
kOther = 'O', // (digits, measures, non-Indic, etc)
// Additional classes used only by ValidateGrapheme.
kWhitespace = ' ',
kCombiner = 'c', // Combiners other than virama.
};
typedef std::pair<CharClass, char32> IndicPair;
Validator(ViramaScript script, bool report_errors)
: script_(script),
codes_used_(0),
output_used_(0),
report_errors_(report_errors) {}
// Factory method that understands how to map script to the right subclass.
static std::unique_ptr<Validator> ScriptValidator(ViramaScript script,
bool report_errors);
// Internal version of the public static ValidateCleanAndSegment.
// Validates and cleans the src vector of unicodes to the *dest, according to
// its type and the given g_mode.
// In case of validation error, returns false and returns as much as possible
// of the input, without discarding invalid text.
bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode,
const std::vector<char32>& src,
std::vector<std::vector<char32>>* dest);
// Moves the results from parts_ or output_ to dest according to g_mode.
void MoveResultsToDest(GraphemeNormMode g_mode,
std::vector<std::vector<char32>>* dest);
// Computes and returns the ViramaScript corresponding to the most frequent
// virama-using script in the input, or kNonVirama if none are present.
static ViramaScript MostFrequentViramaScript(
const std::vector<char32>& utf32);
// Returns true if the given UTF-32 unicode is a "virama" character.
static bool IsVirama(char32 unicode);
// Returns true if the given UTF-32 unicode is a vedic accent.
static bool IsVedicAccent(char32 unicode);
// Returns true if the script is one that uses subscripts for conjuncts.
bool IsSubscriptScript() const;
// Helper function appends the next element of codes_ only to output_,
// without touching parts_
// Returns true at the end of codes_.
bool CodeOnlyToOutput() {
output_.push_back(codes_[codes_used_].second);
return ++codes_used_ == codes_.size();
}
// Helper function adds a length-element vector to parts_ from the last length
// elements of output_. If there are more than length unused elements in
// output_, adds unicodes as single-element vectors to parts_ to catch
// output_used_ up to output_.size() - length before adding the length-element
// vector.
void MultiCodePart(int length) {
while (output_used_ + length < output_.size()) {
parts_.emplace_back(
std::initializer_list<char32>{output_[output_used_++]});
}
parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]});
while (++output_used_ < output_.size()) {
parts_.back().push_back(output_[output_used_]);
}
}
// Helper function appends the next element of codes_ to output_, and then
// calls MultiCodePart to add the appropriate components to parts_.
// Returns true at the end of codes_.
bool UseMultiCode(int length) {
output_.push_back(codes_[codes_used_].second);
MultiCodePart(length);
return ++codes_used_ == codes_.size();
}
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
// parts_ and output_. Returns true if a valid Grapheme was consumed,
// otherwise does not increment codes_used_.
virtual bool ConsumeGraphemeIfValid() = 0;
// Sets codes_ to the class codes for the given unicode text.
void ComputeClassCodes(const std::vector<char32>& text);
// Returns the CharClass corresponding to the given Unicode ch.
virtual CharClass UnicodeToCharClass(char32 ch) const = 0;
// Resets to the initial state.
void Clear();
// Number of unicodes in each Indic codepage.
static const int kIndicCodePageSize = 128;
// Lowest unicode value of any Indic script. (Devanagari).
static const char32 kMinIndicUnicode = 0x900;
// Highest unicode value of any consistent (ISCII-based) Indic script.
static const char32 kMaxSinhalaUnicode = 0xdff;
// Highest unicode value of any virama-using script. (Khmer).
static const char32 kMaxViramaScriptUnicode = 0x17ff;
// Some special unicodes.
static const char32 kSinhalaVirama = 0xdca;
static const char32 kMyanmarVirama = 0x1039;
static const char32 kKhmerVirama = 0x17d2;
// Script we are operating on.
ViramaScript script_;
// Input unicodes with assigned CharClass is the data to be validated.
std::vector<IndicPair> codes_;
// Glyph-like components of the input.
std::vector<std::vector<char32>> parts_;
// Copied validated unicodes from codes_ that are OK to output.
std::vector<char32> output_;
// The number of elements of codes_ that have been processed so far.
int codes_used_;
// The number of elements of output_ that have already been added to parts_.
int output_used_;
// Log error messages for reasons why text is invalid.
bool report_errors_;
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATOR_H_
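A minimal usage sketch of the public API above (hypothetical function name and sample codepoints):
  #include <vector>
  #include "validator.h"

  namespace tesseract {

  // Segments a short Devanagari sample: KA (U+0915) + vowel sign I (U+093F),
  // then the conjunct KA + virama (U+094D) + KA.
  bool DemoSegment(std::vector<std::vector<char32>>* graphemes) {
    const std::vector<char32> text = {0x915, 0x93f, 0x915, 0x94d, 0x915};
    // With kCombined, each inner vector of *graphemes should hold one grapheme
    // cluster: {KA, I} and {KA, VIRAMA, KA}.
    return Validator::ValidateCleanAndSegment(GraphemeNormMode::kCombined,
                                              /*report_errors=*/true, text,
                                              graphemes);
  }

  }  // namespace tesseract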
/**********************************************************************
* File: validator.h
* Description: Base class for various text validators. Intended mainly for
* scripts that use a virama character.
* Author: Ray Smith
* Created: Tue May 23 2017
*
* (C) Copyright 2017, Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_TRAINING_VALIDATOR_H_
#define TESSERACT_TRAINING_VALIDATOR_H_
#include <memory>
#include <vector>
#include "unichar.h"
namespace tesseract {
// Different kinds of grapheme normalization - not just for Indic!
// A grapheme is a syllable unit in Indic and can be several unicodes.
// In other scripts, a grapheme is a base character and accent/diacritic
// combination, as not all accented characters have a single composed form.
enum class GraphemeNormMode {
// Validation result is a single string, even if input is multi-word.
kSingleString,
// Standard unicode graphemes are validated and output as grapheme units.
kCombined,
// Graphemes are validated and sub-divided. For virama-using scripts, units
// that correspond to repeatable glyphs are generated. (Mostly single unicodes
// but viramas and joiners are paired with the most sensible neighbor.)
// For non-virama scripts, this means that base/accent pairs are separated,
// ie the output is individual unicodes.
kGlyphSplit,
// The output is always single unicodes, regardless of the script.
kIndividualUnicodes,
};
// An enum representing the scripts that use a virama character. It is
// guaranteed that the value of any element, (except kNonVirama) can be cast
// to a unicode (char32) value that represents the start of the unicode range
// of the corresponding script.
enum class ViramaScript : char32 {
kNonVirama = 0,
kDevanagari = 0x900,
kBengali = 0x980,
kGurmukhi = 0xa00,
kGujarati = 0xa80,
kOriya = 0xb00,
kTamil = 0xb80,
kTelugu = 0xc00,
kKannada = 0xc80,
kMalayalam = 0xd00,
kSinhala = 0xd80,
kMyanmar = 0x1000,
kKhmer = 0x1780,
};
// Base class offers a validation API and protected methods to allow subclasses
// to easily build the validated/segmented output.
class Validator {
public:
// Validates and cleans the src vector of unicodes to the *dest, according to
// g_mode. In the case of kSingleString, a single vector containing the whole
// result is added to *dest. With kCombined, multiple vectors are added to
// *dest with one grapheme in each. With kGlyphSplit, multiple vectors are
// added to *dest with a smaller unit representing a glyph in each.
// In case of validation error, returns false and as much as possible of the
// input, without discarding invalid text.
static bool ValidateCleanAndSegment(GraphemeNormMode g_mode,
bool report_errors,
const std::vector<char32>& src,
std::vector<std::vector<char32>>* dest);
// Returns true if the unicode ch is a non-printing zero-width mark of no
// significance to OCR training or evaluation.
static bool IsZeroWidthMark(char32 ch) {
return ch == kZeroWidthSpace || ch == kLeftToRightMark ||
ch == kRightToLeftMark || ch == kInvalid;
}
virtual ~Validator() {}
// Some specific but universally useful unicodes.
static const char32 kZeroWidthSpace;
static const char32 kZeroWidthNonJoiner;
static const char32 kZeroWidthJoiner;
static const char32 kLeftToRightMark;
static const char32 kRightToLeftMark;
static const char32 kInvalid;
protected:
// These are more or less the character class identifiers in the ISCII
// standard, section 8. They have been augmented with the Unicode meta
// characters Zero Width Joiner and Zero Width Non Joiner, and the
// Unicode Vedic Marks.
// The best sources of information on Unicode and Indic scripts are:
// http://varamozhi.sourceforge.net/iscii91.pdf
// http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
// http://unicode.org/faq/indic.html
// http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx
enum class CharClass {
// NOTE: The values of the enum members are arbitrary and carry no meaning,
// i.e. they are not used for sorting or any other risky application.
// They were chosen as single-character abbreviations so that they can be
// used in a regexp/BNF definition of a grammar IN A COMMENT, while never
// being relied upon in the code.
kConsonant = 'C',
kVowel = 'V',
kVirama = 'H', // (aka Halant)
kMatra = 'M', // (aka Dependent Vowel)
kMatraPiece = 'P', // Unicode provides pieces of Matras.
kVowelModifier = 'D', // (candrabindu, anusvara, visarga, other marks)
kZeroWidthNonJoiner = 'z', // Unicode Zero Width Non-Joiner U+200C
kZeroWidthJoiner = 'Z', // Unicode Zero Width Joiner U+200D
kVedicMark = 'v', // Marks that can modify any Indic syllable.
kNukta = 'N', // Occurs only immediately after consonants.
kRobat = 'R', // Khmer only.
kOther = 'O', // (digits, measures, non-Indic, etc)
// Additional classes used only by ValidateGrapheme.
kWhitespace = ' ',
kCombiner = 'c', // Combiners other than virama.
};
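// Illustrative sketch only (the real rules live in the script-specific
// subclasses): with these codes, a simple Devanagari syllable might be
// described in a grammar comment as
//   (C N? H)* C N? M? D? v*
// i.e. zero or more consonant(+nukta)+virama pairs, a final consonant with
// optional nukta, then optional matra, vowel modifier, and vedic marks.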
typedef std::pair<CharClass, char32> IndicPair;
Validator(ViramaScript script, bool report_errors)
: script_(script),
codes_used_(0),
output_used_(0),
report_errors_(report_errors) {}
// Factory method that understands how to map script to the right subclass.
static std::unique_ptr<Validator> ScriptValidator(ViramaScript script,
bool report_errors);
// Internal version of the public static ValidateCleanAndSegment.
// Validates and cleans the src vector of unicodes to the *dest, according to
// its type and the given g_mode.
// In case of a validation error, returns false and outputs as much of the
// input as possible, without discarding the invalid text.
bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode,
const std::vector<char32>& src,
std::vector<std::vector<char32>>* dest);
// Moves the results from parts_ or output_ to dest according to g_mode.
void MoveResultsToDest(GraphemeNormMode g_mode,
std::vector<std::vector<char32>>* dest);
// Computes and returns the ViramaScript corresponding to the most frequent
// virama-using script in the input, or kNonVirama if none are present.
static ViramaScript MostFrequentViramaScript(
const std::vector<char32>& utf32);
// Returns true if the given UTF-32 unicode is a "virama" character.
static bool IsVirama(char32 unicode);
// Returns true if the given UTF-32 unicode is a vedic accent.
static bool IsVedicAccent(char32 unicode);
// Returns true if the script is one that uses subscripts for conjuncts.
bool IsSubscriptScript() const;
// Helper function appends the next element of codes_ only to output_,
// without touching parts_.
// Returns true at the end of codes_.
bool CodeOnlyToOutput() {
output_.push_back(codes_[codes_used_].second);
return ++codes_used_ == codes_.size();
}
// Helper function adds a length-element vector to parts_ from the last length
// elements of output_. If there are more than length unused elements in
// output_, adds unicodes as single-element vectors to parts_ to catch
// output_used_ up to output_.size() - length before adding the length-element
// vector.
void MultiCodePart(int length) {
while (output_used_ + length < output_.size()) {
parts_.emplace_back(
std::initializer_list<char32>{output_[output_used_++]});
}
parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]});
while (++output_used_ < output_.size()) {
parts_.back().push_back(output_[output_used_]);
}
}
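// Worked example for MultiCodePart (a sketch): with output_ = {A, B, C, D},
// output_used_ = 0 and length = 2, the first loop adds {A} and {B} to parts_
// as single-element vectors, then {C, D} is added as the final length-element
// part, leaving output_used_ == output_.size().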
// Helper function appends the next element of codes_ to output_, and then
// calls MultiCodePart to add the appropriate components to parts_.
// Returns true at the end of codes_.
bool UseMultiCode(int length) {
output_.push_back(codes_[codes_used_].second);
MultiCodePart(length);
return ++codes_used_ == codes_.size();
}
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
// parts_ and output_. Returns true if a valid Grapheme was consumed,
// otherwise does not increment codes_used_.
virtual bool ConsumeGraphemeIfValid() = 0;
// Sets codes_ to the class codes for the given unicode text.
void ComputeClassCodes(const std::vector<char32>& text);
// Returns the CharClass corresponding to the given Unicode ch.
virtual CharClass UnicodeToCharClass(char32 ch) const = 0;
// Resets to the initial state.
void Clear();
// Number of unicodes in each Indic codepage.
static const int kIndicCodePageSize = 128;
// Lowest unicode value of any Indic script. (Devanagari).
static const char32 kMinIndicUnicode = 0x900;
// Highest unicode value of any consistent (ISCII-based) Indic script.
static const char32 kMaxSinhalaUnicode = 0xdff;
// Highest unicode value of any virama-using script. (Khmer).
static const char32 kMaxViramaScriptUnicode = 0x17ff;
// Some special unicodes.
static const char32 kSinhalaVirama = 0xdca;
static const char32 kMyanmarVirama = 0x1039;
static const char32 kKhmerVirama = 0x17d2;
// Script we are operating on.
ViramaScript script_;
// The input unicodes with assigned CharClass form the data to be validated.
std::vector<IndicPair> codes_;
// Glyph-like components of the input.
std::vector<std::vector<char32>> parts_;
// Copied validated unicodes from codes_ that are OK to output.
std::vector<char32> output_;
// The number of elements of codes_ that have been processed so far.
int codes_used_;
// The number of elements of output_ that have already been added to parts_.
int output_used_;
// Log error messages for reasons why text is invalid.
bool report_errors_;
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATOR_H_