Move training to src.

Egor Pugin 2018-04-25 11:35:26 +03:00
parent ca5c15e6a8
commit 104fe7931c
65 changed files with 1664 additions and 1664 deletions


@@ -308,7 +308,7 @@ if (BUILD_TESTS AND EXISTS ${PROJECT_SOURCE_DIR}/googletest/CMakeLists.txt)
endif()
if (BUILD_TRAINING_TOOLS)
-add_subdirectory(training)
+add_subdirectory(src/training)
endif()
get_target_property(tesseract_NAME libtesseract NAME)


@@ -502,7 +502,7 @@ AC_CONFIG_FILES([java/com/google/scrollview/Makefile])
AC_CONFIG_FILES([java/com/google/scrollview/events/Makefile])
AC_CONFIG_FILES([java/com/google/scrollview/ui/Makefile])
AC_CONFIG_FILES([doc/Makefile])
-AM_COND_IF([ENABLE_TRAINING], [AC_CONFIG_FILES(training/Makefile)])
+AM_COND_IF([ENABLE_TRAINING], [AC_CONFIG_FILES(src/training/Makefile)])
AC_OUTPUT
# Final message


@@ -172,7 +172,7 @@ projects:
tessopt:
type: lib
static_only: true
-files: training/tessopt.*
+files: src/training/tessopt.*
include_directories: training
dependencies: libtesseract
@@ -180,104 +180,104 @@ projects:
type: lib
static_only: true
files:
-- training/commandlineflags.cpp
-- training/commandlineflags.h
-- training/commontraining.cpp
-- training/commontraining.h
+- src/training/commandlineflags.cpp
+- src/training/commandlineflags.h
+- src/training/commontraining.cpp
+- src/training/commontraining.h
include_directories: training
dependencies:
- tessopt
ambiguous_words:
-files: training/ambiguous_words.cpp
+files: src/training/ambiguous_words.cpp
dependencies:
- libtesseract
classifier_tester:
-files: training/classifier_tester.cpp
+files: src/training/classifier_tester.cpp
dependencies: common_training
combine_lang_model:
-files: training/combine_lang_model.cpp
+files: src/training/combine_lang_model.cpp
dependencies: unicharset_training
combine_tessdata:
-files: training/combine_tessdata.cpp
+files: src/training/combine_tessdata.cpp
dependencies: libtesseract
cntraining:
-files: training/cntraining.cpp
+files: src/training/cntraining.cpp
dependencies: common_training
dawg2wordlist:
-files: training/dawg2wordlist.cpp
+files: src/training/dawg2wordlist.cpp
dependencies: libtesseract
mftraining:
files:
-- training/mftraining.cpp
-- training/mergenf.*
+- src/training/mftraining.cpp
+- src/training/mergenf.*
dependencies: common_training
shapeclustering:
-files: training/shapeclustering.cpp
+files: src/training/shapeclustering.cpp
dependencies: common_training
unicharset_extractor:
-files: training/unicharset_extractor.cpp
+files: src/training/unicharset_extractor.cpp
dependencies: unicharset_training
wordlist2dawg:
-files: training/wordlist2dawg.cpp
+files: src/training/wordlist2dawg.cpp
dependencies: libtesseract
unicharset_training:
type: lib
static_only: true
files:
-- training/fileio.*
-- training/icuerrorcode.h
-- training/lang_model_helpers.*
-- training/lstmtester.*
-- training/normstrngs.*
-- training/unicharset_training_utils.*
-- training/validat.*
+- src/training/fileio.*
+- src/training/icuerrorcode.h
+- src/training/lang_model_helpers.*
+- src/training/lstmtester.*
+- src/training/normstrngs.*
+- src/training/unicharset_training_utils.*
+- src/training/validat.*
include_directories: training
dependencies:
- common_training
- pvt.cppan.demo.unicode.icu.i18n
lstmeval:
-files: training/lstmeval.cpp
+files: src/training/lstmeval.cpp
dependencies: unicharset_training
lstmtraining:
-files: training/lstmtraining.cpp
+files: src/training/lstmtraining.cpp
dependencies: unicharset_training
set_unicharset_properties:
-files: training/set_unicharset_properties.cpp
+files: src/training/set_unicharset_properties.cpp
dependencies: unicharset_training
text2image:
files:
-- training/text2image.cpp
-- training/boxchar.cpp
-- training/boxchar.h
-- training/degradeimage.cpp
-- training/degradeimage.h
-- training/ligature_table.cpp
-- training/ligature_table.h
-- training/normstrngs.cpp
-- training/normstrngs.h
-- training/pango_font_info.cpp
-- training/pango_font_info.h
-- training/stringrenderer.cpp
-- training/stringrenderer.h
-- training/tlog.cpp
-- training/tlog.h
-- training/util.h
-- training/icuerrorcode.h
+- src/training/text2image.cpp
+- src/training/boxchar.cpp
+- src/training/boxchar.h
+- src/training/degradeimage.cpp
+- src/training/degradeimage.h
+- src/training/ligature_table.cpp
+- src/training/ligature_table.h
+- src/training/normstrngs.cpp
+- src/training/normstrngs.h
+- src/training/pango_font_info.cpp
+- src/training/pango_font_info.h
+- src/training/stringrenderer.cpp
+- src/training/stringrenderer.h
+- src/training/tlog.cpp
+- src/training/tlog.h
+- src/training/util.h
+- src/training/icuerrorcode.h
dependencies:
- unicharset_training


@@ -1,310 +1,310 @@
/**********************************************************************
* File: degradeimage.cpp
* Description: Function to degrade an image (usually of text) as if it
* has been printed and then scanned.
* Authors: Ray Smith
* Created: Tue Nov 19 2013
*
* (C) Copyright 2013, Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#include "degradeimage.h"
#include <stdlib.h>
#include "allheaders.h" // from leptonica
#include "genericvector.h"
#include "helpers.h" // For TRand.
#include "rect.h"
namespace tesseract {
// A randomized perspective distortion can be applied to synthetic input.
// The perspective distortion comes from leptonica, which uses 2 sets of 4
// corners to determine the distortion. There are random values for each of
// the x numbers x0..x3 and y0..y3, except for x2 and x3 which are instead
// defined in terms of a single shear value. This reduces the degrees of
// freedom enough to make the distortion more realistic than it would otherwise
// be if all 8 coordinates could move independently.
// One additional factor is used for the color of the pixels that don't exist
// in the source image.
// Name for each of the randomizing factors.
enum FactorNames {
FN_INCOLOR,
FN_Y0,
FN_Y1,
FN_Y2,
FN_Y3,
FN_X0,
FN_X1,
FN_SHEAR,
// x2 = x1 - shear
// x3 = x0 + shear
FN_NUM_FACTORS
};
// Rotation is +/- kRotationRange radians.
const float kRotationRange = 0.02f;
// Number of grey levels to shift by for each exposure step.
const int kExposureFactor = 16;
// Salt and pepper noise is +/- kSaltnPepper.
const int kSaltnPepper = 5;
// Min sum of width + height on which to operate the ramp.
const int kMinRampSize = 1000;
// Degrade the pix as if by a print/copy/scan cycle with exposure > 0
// corresponding to darkening on the copier and <0 lighter and 0 not copied.
// Exposures in [-2,2] are most useful, with -3 and 3 being extreme.
// If rotation is nullptr, rotation is skipped. If *rotation is non-zero, the
// pix is rotated by *rotation else it is randomly rotated and *rotation is
// modified.
//
// HOW IT WORKS:
// Most of the process is really dictated by the fact that the minimum
// available convolution is 3X3, which is too big really to simulate a
// good quality print/scan process. (2X2 would be better.)
// 1 pixel wide inputs are heavily smeared by the 3X3 convolution, making the
// images generally biased to being too light, so most of the work is to make
// them darker. 3 levels of thickening/darkening are achieved with 2 dilations,
// (using a greyscale erosion) one heavy (by being before convolution) and one
// light (after convolution).
// With no dilation, after convolution, the images are so light that a heavy
// constant offset is required to make the 0 image look reasonable. A simple
// constant offset multiple of exposure to undo this value is enough to achieve
// all the required lightening. This gives the advantage that exposure level 1
// with a single dilation gives a good impression of the broken-yet-too-dark
// problem that is often seen in scans.
// A small random rotation gives some varying greyscale values on the edges,
// and some random salt and pepper noise on top helps to realistically jaggy-up
// the edges.
// Finally a greyscale ramp provides a continuum of effects between exposure
// levels.
Pix* DegradeImage(Pix* input, int exposure, TRand* randomizer,
float* rotation) {
Pix* pix = pixConvertTo8(input, false);
pixDestroy(&input);
input = pix;
int width = pixGetWidth(input);
int height = pixGetHeight(input);
if (exposure >= 2) {
// An erosion simulates the spreading darkening of a dark copy.
// This is backwards to binary morphology,
// see http://www.leptonica.com/grayscale-morphology.html
pix = input;
input = pixErodeGray(pix, 3, 3);
pixDestroy(&pix);
}
// A convolution is essential to any mode as no scanner produces an
// image as sharp as the electronic image.
pix = pixBlockconv(input, 1, 1);
pixDestroy(&input);
// A small random rotation helps to make the edges jaggy in a realistic way.
if (rotation != nullptr) {
float radians_clockwise = 0.0f;
if (*rotation) {
radians_clockwise = *rotation;
} else if (randomizer != nullptr) {
radians_clockwise = randomizer->SignedRand(kRotationRange);
}
input = pixRotate(pix, radians_clockwise,
L_ROTATE_AREA_MAP, L_BRING_IN_WHITE,
0, 0);
// Rotate the boxes to match.
*rotation = radians_clockwise;
pixDestroy(&pix);
} else {
input = pix;
}
if (exposure >= 3 || exposure == 1) {
// Erosion after the convolution is not as heavy as before, so it is
// good for level 1 and in addition as a level 3.
// This is backwards to binary morphology,
// see http://www.leptonica.com/grayscale-morphology.html
pix = input;
input = pixErodeGray(pix, 3, 3);
pixDestroy(&pix);
}
// The convolution really needed to be 2x2 to be realistic enough, but
// we only have 3x3, so we have to bias the image darker or lose thin
// strokes.
int erosion_offset = 0;
// For light and 0 exposure, there is no dilation, so compensate for the
// convolution with a big darkening bias which is undone for lighter
// exposures.
if (exposure <= 0)
erosion_offset = -3 * kExposureFactor;
// Add in a general offset of the greyscales for the exposure level so
// a threshold of 128 gives a reasonable binary result.
erosion_offset -= exposure * kExposureFactor;
// Add a gradual fade over the page and a small amount of salt and pepper
// noise to simulate noise in the sensor/paper fibres and varying
// illumination.
l_uint32* data = pixGetData(input);
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
int pixel = GET_DATA_BYTE(data, x);
if (randomizer != nullptr)
pixel += randomizer->IntRand() % (kSaltnPepper*2 + 1) - kSaltnPepper;
if (height + width > kMinRampSize)
pixel -= (2*x + y) * 32 / (height + width);
pixel += erosion_offset;
if (pixel < 0)
pixel = 0;
if (pixel > 255)
pixel = 255;
SET_DATA_BYTE(data, x, pixel);
}
data += input->wpl;
}
return input;
}
// Creates and returns a Pix distorted by various means according to the bool
// flags. If boxes is not nullptr, the boxes are resized/positioned according to
// any spatial distortion and also by the integer reduction factor box_scale
// so they will match what the network will output.
// Returns nullptr on error. The returned Pix must be pixDestroyed.
Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert,
bool white_noise, bool smooth_noise, bool blur,
int box_reduction, TRand* randomizer,
GenericVector<TBOX>* boxes) {
Pix* distorted = pixCopy(nullptr, const_cast<Pix*>(pix));
// Things to do to synthetic training data.
if (invert && randomizer->SignedRand(1.0) < 0)
pixInvert(distorted, distorted);
if ((white_noise || smooth_noise) && randomizer->SignedRand(1.0) > 0.0) {
// TODO(rays) Cook noise in a more thread-safe manner than rand().
// Attempt to make the sequences reproducible.
srand(randomizer->IntRand());
Pix* pixn = pixAddGaussianNoise(distorted, 8.0);
pixDestroy(&distorted);
if (smooth_noise) {
distorted = pixBlockconv(pixn, 1, 1);
pixDestroy(&pixn);
} else {
distorted = pixn;
}
}
if (blur && randomizer->SignedRand(1.0) > 0.0) {
Pix* blurred = pixBlockconv(distorted, 1, 1);
pixDestroy(&distorted);
distorted = blurred;
}
if (perspective)
GeneratePerspectiveDistortion(0, 0, randomizer, &distorted, boxes);
if (boxes != nullptr) {
for (int b = 0; b < boxes->size(); ++b) {
(*boxes)[b].scale(1.0f / box_reduction);
if ((*boxes)[b].width() <= 0)
(*boxes)[b].set_right((*boxes)[b].left() + 1);
}
}
return distorted;
}
// Distorts anything that has a non-null pointer with the same pseudo-random
// perspective distortion. Width and height only need to be set if there
// is no pix. If there is a pix, then they will be taken from there.
void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer,
Pix** pix, GenericVector<TBOX>* boxes) {
if (pix != nullptr && *pix != nullptr) {
width = pixGetWidth(*pix);
height = pixGetHeight(*pix);
}
float* im_coeffs = nullptr;
float* box_coeffs = nullptr;
l_int32 incolor =
ProjectiveCoeffs(width, height, randomizer, &im_coeffs, &box_coeffs);
if (pix != nullptr && *pix != nullptr) {
// Transform the image.
Pix* transformed = pixProjective(*pix, im_coeffs, incolor);
if (transformed == nullptr) {
tprintf("Projective transformation failed!!\n");
return;
}
pixDestroy(pix);
*pix = transformed;
}
if (boxes != nullptr) {
// Transform the boxes.
for (int b = 0; b < boxes->size(); ++b) {
int x1, y1, x2, y2;
const TBOX& box = (*boxes)[b];
projectiveXformSampledPt(box_coeffs, box.left(), height - box.top(), &x1,
&y1);
projectiveXformSampledPt(box_coeffs, box.right(), height - box.bottom(),
&x2, &y2);
TBOX new_box1(x1, height - y2, x2, height - y1);
projectiveXformSampledPt(box_coeffs, box.left(), height - box.bottom(),
&x1, &y1);
projectiveXformSampledPt(box_coeffs, box.right(), height - box.top(), &x2,
&y2);
TBOX new_box2(x1, height - y1, x2, height - y2);
(*boxes)[b] = new_box1.bounding_union(new_box2);
}
}
free(im_coeffs);
free(box_coeffs);
}
// Computes the coefficients of a randomized projective transformation.
// The image transform requires backward transformation coefficient, and the
// box transform the forward coefficients.
// Returns the incolor arg to pixProjective.
int ProjectiveCoeffs(int width, int height, TRand* randomizer,
float** im_coeffs, float** box_coeffs) {
// Setup "from" points.
Pta* src_pts = ptaCreate(4);
ptaAddPt(src_pts, 0.0f, 0.0f);
ptaAddPt(src_pts, width, 0.0f);
ptaAddPt(src_pts, width, height);
ptaAddPt(src_pts, 0.0f, height);
// Extract factors from pseudo-random sequence.
float factors[FN_NUM_FACTORS];
float shear = 0.0f; // Shear is signed.
for (int i = 0; i < FN_NUM_FACTORS; ++i) {
// Everything is squared to make wild values rarer.
if (i == FN_SHEAR) {
// Shear is signed.
shear = randomizer->SignedRand(0.5 / 3.0);
shear = shear >= 0.0 ? shear * shear : -shear * shear;
// Keep the sheared points within the original rectangle.
if (shear < -factors[FN_X0]) shear = -factors[FN_X0];
if (shear > factors[FN_X1]) shear = factors[FN_X1];
factors[i] = shear;
} else if (i != FN_INCOLOR) {
factors[i] = fabs(randomizer->SignedRand(1.0));
if (i <= FN_Y3)
factors[i] *= 5.0 / 8.0;
else
factors[i] *= 0.5;
factors[i] *= factors[i];
}
}
// Setup "to" points.
Pta* dest_pts = ptaCreate(4);
ptaAddPt(dest_pts, factors[FN_X0] * width, factors[FN_Y0] * height);
ptaAddPt(dest_pts, (1.0f - factors[FN_X1]) * width, factors[FN_Y1] * height);
ptaAddPt(dest_pts, (1.0f - factors[FN_X1] + shear) * width,
(1 - factors[FN_Y2]) * height);
ptaAddPt(dest_pts, (factors[FN_X0] + shear) * width,
(1 - factors[FN_Y3]) * height);
getProjectiveXformCoeffs(dest_pts, src_pts, im_coeffs);
getProjectiveXformCoeffs(src_pts, dest_pts, box_coeffs);
ptaDestroy(&src_pts);
ptaDestroy(&dest_pts);
return factors[FN_INCOLOR] > 0.5f ? L_BRING_IN_WHITE : L_BRING_IN_BLACK;
}
} // namespace tesseract
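For orientation, here is a minimal usage sketch of the two entry points above. It is not code from this commit: the input and output file names are hypothetical, and it assumes only the DegradeImage/PrepareDistortedPix signatures shown here plus standard leptonica calls (pixRead, pixWrite, pixDestroy).
#include "allheaders.h"    // leptonica: pixRead, pixWrite, pixDestroy
#include "degradeimage.h"  // tesseract::DegradeImage, tesseract::PrepareDistortedPix
#include "helpers.h"       // tesseract::TRand
int main() {
  Pix* src = pixRead("line.tif");  // hypothetical 8-bit grey (or 0/255 binary) input
  if (src == nullptr) return 1;
  tesseract::TRand randomizer;     // pseudo-random source for all distortions
  // Simulate a print/copy/scan cycle at exposure +1. DegradeImage consumes src
  // and reports the random rotation it applied through the last argument.
  float rotation = 0.0f;
  Pix* degraded =
      tesseract::DegradeImage(src, /*exposure=*/1, &randomizer, &rotation);
  // Layer noise, blur and a perspective warp on top. With boxes == nullptr no
  // box coordinates are tracked, so box_reduction is effectively unused.
  Pix* distorted = tesseract::PrepareDistortedPix(
      degraded, /*perspective=*/true, /*invert=*/false,
      /*white_noise=*/true, /*smooth_noise=*/true, /*blur=*/true,
      /*box_reduction=*/1, &randomizer, /*boxes=*/nullptr);
  pixWrite("line_degraded.png", distorted, IFF_PNG);  // hypothetical output name
  pixDestroy(&degraded);
  pixDestroy(&distorted);
  return 0;
}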


@@ -1,61 +1,61 @@
/**********************************************************************
* File: degradeimage.h
* Description: Function to degrade an image (usually of text) as if it
* has been printed and then scanned.
* Authors: Ray Smith
* Created: Tue Nov 19 2013
*
* (C) Copyright 2013, Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_TRAINING_DEGRADEIMAGE_H_
#define TESSERACT_TRAINING_DEGRADEIMAGE_H_
#include "allheaders.h"
#include "genericvector.h"
#include "helpers.h" // For TRand.
#include "rect.h"
namespace tesseract {
// Degrade the pix as if by a print/copy/scan cycle with exposure > 0
// corresponding to darkening on the copier and <0 lighter and 0 not copied.
// If rotation is not nullptr, the clockwise rotation in radians is saved there.
// The input pix must be 8 bit grey. (Binary with values 0 and 255 is OK.)
// The input image is destroyed and a different image returned.
struct Pix* DegradeImage(struct Pix* input, int exposure, TRand* randomizer,
float* rotation);
// Creates and returns a Pix distorted by various means according to the bool
// flags. If boxes is not nullptr, the boxes are resized/positioned according to
// any spatial distortion and also by the integer reduction factor box_scale
// so they will match what the network will output.
// Returns nullptr on error. The returned Pix must be pixDestroyed.
Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert,
bool white_noise, bool smooth_noise, bool blur,
int box_reduction, TRand* randomizer,
GenericVector<TBOX>* boxes);
// Distorts anything that has a non-null pointer with the same pseudo-random
// perspective distortion. Width and height only need to be set if there
// is no pix. If there is a pix, then they will be taken from there.
void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer,
Pix** pix, GenericVector<TBOX>* boxes);
// Computes the coefficients of a randomized projective transformation.
// The image transform requires backward transformation coefficient, and the
// box transform the forward coefficients.
// Returns the incolor arg to pixProjective.
int ProjectiveCoeffs(int width, int height, TRand* randomizer,
float** im_coeffs, float** box_coeffs);
} // namespace tesseract
#endif // TESSERACT_TRAINING_DEGRADEIMAGE_H_


@@ -1,66 +1,66 @@
/**********************************************************************
* File: icuerrorcode.h
* Description: Wrapper class for UErrorCode, with conversion operators for
* direct use in ICU C and C++ APIs.
* Author: Fredrik Roubert
* Created: Thu July 4 2013
*
* Features:
* - The constructor initializes the internal UErrorCode to U_ZERO_ERROR,
* removing one common source of errors.
* - Same use in C APIs taking a UErrorCode* (pointer) and C++ taking
* UErrorCode& (reference), via conversion operators.
* - Automatic checking for success when it goes out of scope. On failure,
* the destructor will log an error message and exit.
*
* Most of ICU will handle errors gracefully and provide sensible fallbacks.
* Using IcuErrorCode, it is therefore possible to write very compact code
* that does sensible things on failure and provides logging for debugging.
*
* Example:
* IcuErrorCode icuerrorcode;
* return collator.compareUTF8(a, b, icuerrorcode) == UCOL_EQUAL;
*
* (C) Copyright 2013, Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_CCUTIL_ICUERRORCODE_H_
#define TESSERACT_CCUTIL_ICUERRORCODE_H_
#include "tprintf.h"
#include "unicode/errorcode.h" // From libicu
namespace tesseract {
class IcuErrorCode : public icu::ErrorCode {
public:
IcuErrorCode() {}
virtual ~IcuErrorCode() {
if (isFailure()) {
handleFailure();
}
}
protected:
virtual void handleFailure() const {
tprintf("ICU ERROR: %s", errorName());
exit(errorCode);
}
private:
// Disallow implicit copying of object.
IcuErrorCode(const IcuErrorCode&);
void operator=(const IcuErrorCode&);
};
} // namespace tesseract
#endif // TESSERACT_CCUTIL_ICUERRORCODE_H_
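The pattern described in the header comment can be made concrete with a short sketch. This is not repository code; it assumes ICU's icu::Collator::createInstance and Collator::compareUTF8 APIs and relies on the IcuErrorCode destructor to log and exit if any ICU call failed.
#include <memory>
#include "icuerrorcode.h"
#include "unicode/coll.h"  // icu::Collator, UCOL_EQUAL
// Returns true if two UTF-8 strings collate as equal in the default locale.
bool CollateEqualUtf8(const char* a, const char* b) {
  tesseract::IcuErrorCode status;  // starts out as U_ZERO_ERROR
  std::unique_ptr<icu::Collator> collator(
      icu::Collator::createInstance(status));
  if (collator == nullptr) return false;  // createInstance failed: the status
                                          // destructor will log and exit.
  // No explicit error check needed here either: on failure the IcuErrorCode
  // destructor prints the ICU error name via tprintf and exits.
  return collator->compareUTF8(a, b, status) == UCOL_EQUAL;
}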


@@ -1,353 +1,353 @@
/******************************************************************************
** Filename: MergeNF.c
** Purpose: Program for merging similar nano-feature protos
** Author: Dan Johnson
** History: Wed Nov 21 09:55:23 1990, DSJ, Created.
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
******************************************************************************/
#include "mergenf.h"
#include "host.h"
#include "efio.h"
#include "clusttool.h"
#include "cluster.h"
#include "oldlist.h"
#include "protos.h"
#include "ndminx.h"
#include "ocrfeatures.h"
#include "const.h"
#include "featdefs.h"
#include "intproto.h"
#include "params.h"
#include <stdio.h>
#include <string.h>
#include <math.h>
/*-------------------once in subfeat---------------------------------*/
double_VAR(training_angle_match_scale, 1.0, "Angle Match Scale ...");
double_VAR(training_similarity_midpoint, 0.0075, "Similarity Midpoint ...");
double_VAR(training_similarity_curl, 2.0, "Similarity Curl ...");
/*-----------------------------once in fasttrain----------------------------------*/
double_VAR(training_tangent_bbox_pad, 0.5, "Tangent bounding box pad ...");
double_VAR(training_orthogonal_bbox_pad, 2.5, "Orthogonal bounding box pad ...");
double_VAR(training_angle_pad, 45.0, "Angle pad ...");
/**
* Compare protos p1 and p2 and return an estimate of the
* worst evidence rating that will result for any part of p1
* that is compared to p2. In other words, if p1 were broken
* into pico-features and each pico-feature was matched to p2,
* what is the worst evidence rating that will be achieved for
* any pico-feature.
*
* @param p1, p2 protos to be compared
*
* Globals: none
*
* @return Worst possible result when matching p1 to p2.
* @note Exceptions: none
* @note History: Mon Nov 26 08:27:53 1990, DSJ, Created.
*/
FLOAT32 CompareProtos(PROTO p1, PROTO p2) {
FEATURE Feature;
FLOAT32 WorstEvidence = WORST_EVIDENCE;
FLOAT32 Evidence;
FLOAT32 Angle, Length;
/* if p1 and p2 are not close in length, don't let them match */
Length = fabs (p1->Length - p2->Length);
if (Length > MAX_LENGTH_MISMATCH)
return (0.0);
/* create a dummy pico-feature to be used for comparisons */
Feature = NewFeature (&PicoFeatDesc);
Feature->Params[PicoFeatDir] = p1->Angle;
/* convert angle to radians */
Angle = p1->Angle * 2.0 * PI;
/* find distance from center of p1 to 1/2 picofeat from end */
Length = p1->Length / 2.0 - GetPicoFeatureLength () / 2.0;
if (Length < 0) Length = 0;
/* set the dummy pico-feature at one end of p1 and match it to p2 */
Feature->Params[PicoFeatX] = p1->X + cos (Angle) * Length;
Feature->Params[PicoFeatY] = p1->Y + sin (Angle) * Length;
if (DummyFastMatch (Feature, p2)) {
Evidence = SubfeatureEvidence (Feature, p2);
if (Evidence < WorstEvidence)
WorstEvidence = Evidence;
} else {
FreeFeature(Feature);
return 0.0;
}
/* set the dummy pico-feature at the other end of p1 and match it to p2 */
Feature->Params[PicoFeatX] = p1->X - cos (Angle) * Length;
Feature->Params[PicoFeatY] = p1->Y - sin (Angle) * Length;
if (DummyFastMatch (Feature, p2)) {
Evidence = SubfeatureEvidence (Feature, p2);
if (Evidence < WorstEvidence)
WorstEvidence = Evidence;
} else {
FreeFeature(Feature);
return 0.0;
}
FreeFeature (Feature);
return (WorstEvidence);
} /* CompareProtos */
/**
* This routine computes a proto which is the weighted
* average of protos p1 and p2. The new proto is returned
* in MergedProto.
*
* @param p1, p2 protos to be merged
* @param w1, w2 weight of each proto
* @param MergedProto place to put resulting merged proto
*
* Globals: none
*
* @return none (results are returned in MergedProto)
* @note Exceptions: none
* @note History: Mon Nov 26 08:15:08 1990, DSJ, Created.
*/
void ComputeMergedProto (PROTO p1,
PROTO p2,
FLOAT32 w1,
FLOAT32 w2,
PROTO MergedProto) {
FLOAT32 TotalWeight;
TotalWeight = w1 + w2;
w1 /= TotalWeight;
w2 /= TotalWeight;
MergedProto->X = p1->X * w1 + p2->X * w2;
MergedProto->Y = p1->Y * w1 + p2->Y * w2;
MergedProto->Length = p1->Length * w1 + p2->Length * w2;
MergedProto->Angle = p1->Angle * w1 + p2->Angle * w2;
FillABC(MergedProto);
} /* ComputeMergedProto */
/**
* This routine searches through all of the prototypes in
* Class and returns the id of the proto which would provide
* the best approximation of Prototype. If no close
* approximation can be found, NO_PROTO is returned.
*
* @param Class class to search for matching old proto in
* @param NumMerged # of protos merged into each proto of Class
* @param Prototype new proto to find match for
*
* Globals: none
*
* @return Id of closest proto in Class or NO_PROTO.
* @note Exceptions: none
* @note History: Sat Nov 24 11:42:58 1990, DSJ, Created.
*/
int FindClosestExistingProto(CLASS_TYPE Class, int NumMerged[],
PROTOTYPE *Prototype) {
PROTO_STRUCT NewProto;
PROTO_STRUCT MergedProto;
int Pid;
PROTO Proto;
int BestProto;
FLOAT32 BestMatch;
FLOAT32 Match, OldMatch, NewMatch;
MakeNewFromOld (&NewProto, Prototype);
BestProto = NO_PROTO;
BestMatch = WORST_MATCH_ALLOWED;
for (Pid = 0; Pid < Class->NumProtos; Pid++) {
Proto = ProtoIn(Class, Pid);
ComputeMergedProto(Proto, &NewProto,
(FLOAT32) NumMerged[Pid], 1.0, &MergedProto);
OldMatch = CompareProtos(Proto, &MergedProto);
NewMatch = CompareProtos(&NewProto, &MergedProto);
Match = MIN(OldMatch, NewMatch);
if (Match > BestMatch) {
BestProto = Pid;
BestMatch = Match;
}
}
return BestProto;
} /* FindClosestExistingProto */
/**
* This fills in the fields of the New proto based on the
* fields of the Old proto.
*
* @param New new proto to be filled in
* @param Old old proto to be converted
*
* Globals: none
*
* Exceptions: none
* History: Mon Nov 26 09:45:39 1990, DSJ, Created.
*/
void MakeNewFromOld(PROTO New, PROTOTYPE *Old) {
New->X = CenterX(Old->Mean);
New->Y = CenterY(Old->Mean);
New->Length = LengthOf(Old->Mean);
New->Angle = OrientationOf(Old->Mean);
FillABC(New);
} /* MakeNewFromOld */
/*-------------------once in subfeat---------------------------------*/
/**
* @name SubfeatureEvidence
*
* Compare a feature to a prototype. Print the result.
*/
FLOAT32 SubfeatureEvidence(FEATURE Feature, PROTO Proto) {
float Distance;
float Dangle;
Dangle = Proto->Angle - Feature->Params[PicoFeatDir];
if (Dangle < -0.5) Dangle += 1.0;
if (Dangle > 0.5) Dangle -= 1.0;
Dangle *= training_angle_match_scale;
Distance = Proto->A * Feature->Params[PicoFeatX] +
Proto->B * Feature->Params[PicoFeatY] +
Proto->C;
return (EvidenceOf (Distance * Distance + Dangle * Dangle));
}
/**
* @name EvidenceOf
*
* Return the new type of evidence number corresponding to this
* distance value. This number is no longer based on the chi squared
* approximation. The equation that represents the transform is:
* 1 / (1 + (sim / midpoint) ^ curl)
*/
double EvidenceOf (double Similarity) {
Similarity /= training_similarity_midpoint;
if (training_similarity_curl == 3)
Similarity = Similarity * Similarity * Similarity;
else if (training_similarity_curl == 2)
Similarity = Similarity * Similarity;
else
Similarity = pow (Similarity, training_similarity_curl);
return (1.0 / (1.0 + Similarity));
}
/**
* This routine returns TRUE if Feature would be matched
* by a fast match table built from Proto.
*
* @param Feature feature to be "fast matched" to proto
* @param Proto proto being "fast matched" against
*
* Globals:
* - training_tangent_bbox_pad bounding box pad tangent to proto
* - training_orthogonal_bbox_pad bounding box pad orthogonal to proto
*
* @return TRUE if feature could match Proto.
* @note Exceptions: none
* @note History: Wed Nov 14 17:19:58 1990, DSJ, Created.
*/
BOOL8 DummyFastMatch (
FEATURE Feature,
PROTO Proto)
{
FRECT BoundingBox;
FLOAT32 MaxAngleError;
FLOAT32 AngleError;
MaxAngleError = training_angle_pad / 360.0;
AngleError = fabs (Proto->Angle - Feature->Params[PicoFeatDir]);
if (AngleError > 0.5)
AngleError = 1.0 - AngleError;
if (AngleError > MaxAngleError)
return (FALSE);
ComputePaddedBoundingBox (Proto,
training_tangent_bbox_pad * GetPicoFeatureLength (),
training_orthogonal_bbox_pad * GetPicoFeatureLength (),
&BoundingBox);
return PointInside(&BoundingBox, Feature->Params[PicoFeatX],
Feature->Params[PicoFeatY]);
} /* DummyFastMatch */
/**
* This routine computes a bounding box that encloses the
* specified proto along with some padding. The
* amount of padding is specified as separate distances
* in the tangential and orthogonal directions.
*
* @param Proto proto to compute bounding box for
* @param TangentPad amount of pad to add in direction of segment
* @param OrthogonalPad amount of pad to add orthogonal to segment
* @param[out] BoundingBox place to put results
*
* Globals: none
*
* @return none (results are returned in BoundingBox)
* @note Exceptions: none
* @note History: Wed Nov 14 14:55:30 1990, DSJ, Created.
*/
void ComputePaddedBoundingBox (PROTO Proto, FLOAT32 TangentPad,
FLOAT32 OrthogonalPad, FRECT *BoundingBox) {
FLOAT32 Pad, Length, Angle;
FLOAT32 CosOfAngle, SinOfAngle;
Length = Proto->Length / 2.0 + TangentPad;
Angle = Proto->Angle * 2.0 * PI;
CosOfAngle = fabs(cos(Angle));
SinOfAngle = fabs(sin(Angle));
Pad = MAX (CosOfAngle * Length, SinOfAngle * OrthogonalPad);
BoundingBox->MinX = Proto->X - Pad;
BoundingBox->MaxX = Proto->X + Pad;
Pad = MAX(SinOfAngle * Length, CosOfAngle * OrthogonalPad);
BoundingBox->MinY = Proto->Y - Pad;
BoundingBox->MaxY = Proto->Y + Pad;
} /* ComputePaddedBoundingBox */
/**
* Return TRUE if point (X,Y) is inside of Rectangle.
*
* Globals: none
*
* @return TRUE if point (X,Y) is inside of Rectangle.
* @note Exceptions: none
* @note History: Wed Nov 14 17:26:35 1990, DSJ, Created.
*/
BOOL8 PointInside(FRECT *Rectangle, FLOAT32 X, FLOAT32 Y) {
if (X < Rectangle->MinX) return (FALSE);
if (X > Rectangle->MaxX) return (FALSE);
if (Y < Rectangle->MinY) return (FALSE);
if (Y > Rectangle->MaxY) return (FALSE);
return (TRUE);
} /* PointInside */
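To make the EvidenceOf transform above easier to picture, here is a tiny standalone sketch (not repository code) that plugs a few squared-distance values into 1 / (1 + (sim / midpoint) ^ curl), using the defaults from the double_VAR declarations above (midpoint 0.0075, curl 2).
#include <cmath>
#include <cstdio>
// Mirrors EvidenceOf(): evidence is 1 for a perfect match and decays toward 0
// as the squared distance grows past the midpoint.
static double Evidence(double similarity, double midpoint = 0.0075,
                       double curl = 2.0) {
  return 1.0 / (1.0 + std::pow(similarity / midpoint, curl));
}
int main() {
  std::printf("%.4f %.4f %.4f\n",
              Evidence(0.0), Evidence(0.0075), Evidence(0.03));
  // Prints roughly: 1.0000 0.5000 0.0588
  return 0;
}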
/******************************************************************************
** Filename: MergeNF.c
** Purpose: Program for merging similar nano-feature protos
** Author: Dan Johnson
** History: Wed Nov 21 09:55:23 1990, DSJ, Created.
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
******************************************************************************/
#include "mergenf.h"
#include "host.h"
#include "efio.h"
#include "clusttool.h"
#include "cluster.h"
#include "oldlist.h"
#include "protos.h"
#include "ndminx.h"
#include "ocrfeatures.h"
#include "const.h"
#include "featdefs.h"
#include "intproto.h"
#include "params.h"
#include <stdio.h>
#include <string.h>
#include <math.h>
/*-------------------once in subfeat---------------------------------*/
double_VAR(training_angle_match_scale, 1.0, "Angle Match Scale ...");
double_VAR(training_similarity_midpoint, 0.0075, "Similarity Midpoint ...");
double_VAR(training_similarity_curl, 2.0, "Similarity Curl ...");
/*-----------------------------once in fasttrain----------------------------------*/
double_VAR(training_tangent_bbox_pad, 0.5, "Tangent bounding box pad ...");
double_VAR(training_orthogonal_bbox_pad, 2.5, "Orthogonal bounding box pad ...");
double_VAR(training_angle_pad, 45.0, "Angle pad ...");
/**
* Compare protos p1 and p2 and return an estimate of the
* worst evidence rating that will result for any part of p1
* that is compared to p2. In other words, if p1 were broken
* into pico-features and each pico-feature was matched to p2,
* what is the worst evidence rating that will be achieved for
* any pico-feature.
*
* @param p1, p2 protos to be compared
*
* Globals: none
*
* @return Worst possible result when matching p1 to p2.
* @note Exceptions: none
* @note History: Mon Nov 26 08:27:53 1990, DSJ, Created.
*/
FLOAT32 CompareProtos(PROTO p1, PROTO p2) {
FEATURE Feature;
FLOAT32 WorstEvidence = WORST_EVIDENCE;
FLOAT32 Evidence;
FLOAT32 Angle, Length;
/* if p1 and p2 are not close in length, don't let them match */
Length = fabs (p1->Length - p2->Length);
if (Length > MAX_LENGTH_MISMATCH)
return (0.0);
/* create a dummy pico-feature to be used for comparisons */
Feature = NewFeature (&PicoFeatDesc);
Feature->Params[PicoFeatDir] = p1->Angle;
/* convert angle to radians */
Angle = p1->Angle * 2.0 * PI;
/* find distance from center of p1 to 1/2 picofeat from end */
Length = p1->Length / 2.0 - GetPicoFeatureLength () / 2.0;
if (Length < 0) Length = 0;
/* set the dummy pico-feature at one end of p1 and match it to p2 */
Feature->Params[PicoFeatX] = p1->X + cos (Angle) * Length;
Feature->Params[PicoFeatY] = p1->Y + sin (Angle) * Length;
if (DummyFastMatch (Feature, p2)) {
Evidence = SubfeatureEvidence (Feature, p2);
if (Evidence < WorstEvidence)
WorstEvidence = Evidence;
} else {
FreeFeature(Feature);
return 0.0;
}
/* set the dummy pico-feature at the other end of p1 and match it to p2 */
Feature->Params[PicoFeatX] = p1->X - cos (Angle) * Length;
Feature->Params[PicoFeatY] = p1->Y - sin (Angle) * Length;
if (DummyFastMatch (Feature, p2)) {
Evidence = SubfeatureEvidence (Feature, p2);
if (Evidence < WorstEvidence)
WorstEvidence = Evidence;
} else {
FreeFeature(Feature);
return 0.0;
}
FreeFeature (Feature);
return (WorstEvidence);
} /* CompareProtos */
/**
* This routine computes a proto which is the weighted
* average of protos p1 and p2. The new proto is returned
* in MergedProto.
*
* @param p1, p2 protos to be merged
* @param w1, w2 weight of each proto
* @param MergedProto place to put resulting merged proto
*
* Globals: none
*
* @return none (results are returned in MergedProto)
* @note Exceptions: none
* @note History: Mon Nov 26 08:15:08 1990, DSJ, Created.
*/
void ComputeMergedProto (PROTO p1,
PROTO p2,
FLOAT32 w1,
FLOAT32 w2,
PROTO MergedProto) {
FLOAT32 TotalWeight;
TotalWeight = w1 + w2;
w1 /= TotalWeight;
w2 /= TotalWeight;
MergedProto->X = p1->X * w1 + p2->X * w2;
MergedProto->Y = p1->Y * w1 + p2->Y * w2;
MergedProto->Length = p1->Length * w1 + p2->Length * w2;
MergedProto->Angle = p1->Angle * w1 + p2->Angle * w2;
FillABC(MergedProto);
} /* ComputeMergedProto */
/**
* This routine searches through all of the prototypes in
* Class and returns the id of the proto which would provide
* the best approximation of Prototype. If no close
* approximation can be found, NO_PROTO is returned.
*
* @param Class class to search for matching old proto in
* @param NumMerged # of protos merged into each proto of Class
* @param Prototype new proto to find match for
*
* Globals: none
*
* @return Id of closest proto in Class or NO_PROTO.
* @note Exceptions: none
* @note History: Sat Nov 24 11:42:58 1990, DSJ, Created.
*/
int FindClosestExistingProto(CLASS_TYPE Class, int NumMerged[],
PROTOTYPE *Prototype) {
PROTO_STRUCT NewProto;
PROTO_STRUCT MergedProto;
int Pid;
PROTO Proto;
int BestProto;
FLOAT32 BestMatch;
FLOAT32 Match, OldMatch, NewMatch;
MakeNewFromOld (&NewProto, Prototype);
BestProto = NO_PROTO;
BestMatch = WORST_MATCH_ALLOWED;
for (Pid = 0; Pid < Class->NumProtos; Pid++) {
Proto = ProtoIn(Class, Pid);
ComputeMergedProto(Proto, &NewProto,
(FLOAT32) NumMerged[Pid], 1.0, &MergedProto);
OldMatch = CompareProtos(Proto, &MergedProto);
NewMatch = CompareProtos(&NewProto, &MergedProto);
Match = MIN(OldMatch, NewMatch);
if (Match > BestMatch) {
BestProto = Pid;
BestMatch = Match;
}
}
return BestProto;
} /* FindClosestExistingProto */
/**
* This fills in the fields of the New proto based on the
* fields of the Old proto.
*
* @param New new proto to be filled in
* @param Old old proto to be converted
*
* Globals: none
*
* Exceptions: none
* History: Mon Nov 26 09:45:39 1990, DSJ, Created.
*/
void MakeNewFromOld(PROTO New, PROTOTYPE *Old) {
New->X = CenterX(Old->Mean);
New->Y = CenterY(Old->Mean);
New->Length = LengthOf(Old->Mean);
New->Angle = OrientationOf(Old->Mean);
FillABC(New);
} /* MakeNewFromOld */
/*-------------------once in subfeat---------------------------------*/
/**
* @name SubfeatureEvidence
*
* Compare a feature to a prototype. Print the result.
*/
FLOAT32 SubfeatureEvidence(FEATURE Feature, PROTO Proto) {
float Distance;
float Dangle;
Dangle = Proto->Angle - Feature->Params[PicoFeatDir];
if (Dangle < -0.5) Dangle += 1.0;
if (Dangle > 0.5) Dangle -= 1.0;
Dangle *= training_angle_match_scale;
Distance = Proto->A * Feature->Params[PicoFeatX] +
Proto->B * Feature->Params[PicoFeatY] +
Proto->C;
return (EvidenceOf (Distance * Distance + Dangle * Dangle));
}
/**
* @name EvidenceOf
*
* Return the new type of evidence number corresponding to this
* distance value. This number is no longer based on the chi squared
* approximation. The equation that represents the transform is:
* 1 / (1 + (sim / midpoint) ^ curl)
*/
double EvidenceOf (double Similarity) {
Similarity /= training_similarity_midpoint;
if (training_similarity_curl == 3)
Similarity = Similarity * Similarity * Similarity;
else if (training_similarity_curl == 2)
Similarity = Similarity * Similarity;
else
Similarity = pow (Similarity, training_similarity_curl);
return (1.0 / (1.0 + Similarity));
}
/**
* This routine returns TRUE if Feature would be matched
* by a fast match table built from Proto.
*
* @param Feature feature to be "fast matched" to proto
* @param Proto proto being "fast matched" against
*
* Globals:
* - training_tangent_bbox_pad bounding box pad tangent to proto
* - training_orthogonal_bbox_pad bounding box pad orthogonal to proto
*
* @return TRUE if feature could match Proto.
* @note Exceptions: none
* @note History: Wed Nov 14 17:19:58 1990, DSJ, Created.
*/
BOOL8 DummyFastMatch (
FEATURE Feature,
PROTO Proto)
{
FRECT BoundingBox;
FLOAT32 MaxAngleError;
FLOAT32 AngleError;
MaxAngleError = training_angle_pad / 360.0;
AngleError = fabs (Proto->Angle - Feature->Params[PicoFeatDir]);
if (AngleError > 0.5)
AngleError = 1.0 - AngleError;
if (AngleError > MaxAngleError)
return (FALSE);
ComputePaddedBoundingBox (Proto,
training_tangent_bbox_pad * GetPicoFeatureLength (),
training_orthogonal_bbox_pad * GetPicoFeatureLength (),
&BoundingBox);
return PointInside(&BoundingBox, Feature->Params[PicoFeatX],
Feature->Params[PicoFeatY]);
} /* DummyFastMatch */
/**
* This routine computes a bounding box that encloses the
* specified proto along with some padding. The
* amount of padding is specified as separate distances
* in the tangential and orthogonal directions.
*
* @param Proto proto to compute bounding box for
* @param TangentPad amount of pad to add in direction of segment
* @param OrthogonalPad amount of pad to add orthogonal to segment
* @param[out] BoundingBox place to put results
*
* Globals: none
*
* @return none (results are returned in BoundingBox)
* @note Exceptions: none
* @note History: Wed Nov 14 14:55:30 1990, DSJ, Created.
*/
void ComputePaddedBoundingBox (PROTO Proto, FLOAT32 TangentPad,
FLOAT32 OrthogonalPad, FRECT *BoundingBox) {
FLOAT32 Pad, Length, Angle;
FLOAT32 CosOfAngle, SinOfAngle;
Length = Proto->Length / 2.0 + TangentPad;
Angle = Proto->Angle * 2.0 * PI;
CosOfAngle = fabs(cos(Angle));
SinOfAngle = fabs(sin(Angle));
Pad = MAX (CosOfAngle * Length, SinOfAngle * OrthogonalPad);
BoundingBox->MinX = Proto->X - Pad;
BoundingBox->MaxX = Proto->X + Pad;
Pad = MAX(SinOfAngle * Length, CosOfAngle * OrthogonalPad);
BoundingBox->MinY = Proto->Y - Pad;
BoundingBox->MaxY = Proto->Y + Pad;
} /* ComputePaddedBoundingBox */
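A small hypothetical example of the padding computed above (all numbers made up):
  PROTO_STRUCT P;
  P.X = 0.5;  P.Y = 0.25;  P.Length = 0.2;  P.Angle = 0.0;  /* horizontal segment */
  FRECT Box;
  ComputePaddedBoundingBox(&P, 0.05, 0.02, &Box);
  /* With Angle == 0, cos == 1 and sin == 0, so the box spans half the length plus
     the tangential pad on each side in x, and only the orthogonal pad in y:
     Box.MinX == 0.35, Box.MaxX == 0.65, Box.MinY == 0.23, Box.MaxY == 0.27. */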
/**
* Return TRUE if point (X,Y) is inside of Rectangle.
*
* Globals: none
*
* @return TRUE if point (X,Y) is inside of Rectangle.
* @note Exceptions: none
* @note History: Wed Nov 14 17:26:35 1990, DSJ, Created.
*/
BOOL8 PointInside(FRECT *Rectangle, FLOAT32 X, FLOAT32 Y) {
if (X < Rectangle->MinX) return (FALSE);
if (X > Rectangle->MaxX) return (FALSE);
if (Y < Rectangle->MinY) return (FALSE);
if (Y > Rectangle->MaxY) return (FALSE);
return (TRUE);
} /* PointInside */

View File

@@ -1,103 +1,103 @@
/******************************************************************************
** Filename: MergeNF.c
** Purpose: Program for merging similar nano-feature protos
** Author: Dan Johnson
** History: Wed Nov 21 09:55:23 1990, DSJ, Created.
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
******************************************************************************/
#ifndef TESSERACT_TRAINING_MERGENF_H_
#define TESSERACT_TRAINING_MERGENF_H_
/**----------------------------------------------------------------------------
Include Files and Type Defines
----------------------------------------------------------------------------**/
#include "protos.h"
#include "cluster.h"
#include "ocrfeatures.h"
#include "callcpp.h"
#include "picofeat.h"
#define WORST_MATCH_ALLOWED (0.9)
#define WORST_EVIDENCE (1.0)
#define MAX_LENGTH_MISMATCH (2.0 * GetPicoFeatureLength ())
#define PROTO_SUFFIX ".mf.p"
#define CONFIG_SUFFIX ".cl"
#define NO_PROTO (-1)
#define XPOSITION 0
#define YPOSITION 1
#define MFLENGTH 2
#define ORIENTATION 3
typedef struct
{
FLOAT32 MinX, MaxX, MinY, MaxY;
} FRECT;
/**----------------------------------------------------------------------------
Public Macros
----------------------------------------------------------------------------**/
#define CenterX(M) ( (M)[XPOSITION] )
#define CenterY(M) ( (M)[YPOSITION] )
#define LengthOf(M) ( (M)[MFLENGTH] )
#define OrientationOf(M) ( (M)[ORIENTATION] )
/**----------------------------------------------------------------------------
Public Function Prototypes
----------------------------------------------------------------------------**/
FLOAT32 CompareProtos (
PROTO p1,
PROTO p2);
void ComputeMergedProto (
PROTO p1,
PROTO p2,
FLOAT32 w1,
FLOAT32 w2,
PROTO MergedProto);
int FindClosestExistingProto (
CLASS_TYPE Class,
int NumMerged[],
PROTOTYPE *Prototype);
void MakeNewFromOld (
PROTO New,
PROTOTYPE *Old);
FLOAT32 SubfeatureEvidence (
FEATURE Feature,
PROTO Proto);
double EvidenceOf (
register double Similarity);
BOOL8 DummyFastMatch (
FEATURE Feature,
PROTO Proto);
void ComputePaddedBoundingBox (
PROTO Proto,
FLOAT32 TangentPad,
FLOAT32 OrthogonalPad,
FRECT *BoundingBox);
BOOL8 PointInside (
FRECT *Rectangle,
FLOAT32 X,
FLOAT32 Y);
#endif // TESSERACT_TRAINING_MERGENF_H_
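A hypothetical sketch (not part of the original sources) of how these declarations fit together when merging a clustered prototype into an existing class, mirroring the code above; the helper name and the copy-back step are assumptions:
  #include "mergenf.h"

  void MergeIntoClass(CLASS_TYPE Class, int NumMerged[], PROTOTYPE *Prototype) {
    int Pid = FindClosestExistingProto(Class, NumMerged, Prototype);
    if (Pid == NO_PROTO) return;  /* caller would install a brand-new proto instead */
    PROTO_STRUCT New, Merged;
    MakeNewFromOld(&New, Prototype);
    ComputeMergedProto(ProtoIn(Class, Pid), &New,
                       (FLOAT32) NumMerged[Pid], 1.0, &Merged);
    *ProtoIn(Class, Pid) = Merged;  /* overwrite the existing proto with the merge */
    NumMerged[Pid]++;
  }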

training/tesstrain.sh → src/training/tesstrain.sh Executable file → Normal file
View File

View File

View File

@@ -1,23 +1,23 @@
/**********************************************************************
* File: tlog.cpp
* Description: Variant of printf with logging level controllable by a
* commandline flag.
* Author: Ranjith Unnikrishnan
* Created: Wed Nov 20 2013
*
* (C) Copyright 2013, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "tlog.h"
INT_PARAM_FLAG(tlog_level, 0, "Minimum logging level for tlog() output");

View File

@@ -1,41 +1,41 @@
/**********************************************************************
* File: tlog.h
* Description: Variant of printf with logging level controllable by a
* commandline flag.
* Author: Ranjith Unnikrishnan
* Created: Wed Nov 20 2013
*
* (C) Copyright 2013, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_TRAINING_TLOG_H_
#define TESSERACT_TRAINING_TLOG_H_
#include "commandlineflags.h"
#include "errcode.h"
#include "tprintf.h"
DECLARE_INT_PARAM_FLAG(tlog_level);
// Variant guarded by the numeric logging level parameter FLAGS_tlog_level
// (default 0). Code using ParseCommandLineFlags() can control its value using
// the --tlog_level commandline argument. Otherwise it must be specified in a
// config file like other params.
#define tlog(level, ...) { \
if (FLAGS_tlog_level >= level) { \
tprintf_internal(__VA_ARGS__); \
} \
}
#define TLOG_IS_ON(level) (FLAGS_tlog_level >= level)
#endif // TESSERACT_TRAINING_TLOG_H_
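A minimal usage sketch (hypothetical tool; assumes ParseCommandLineFlags() from commandlineflags.h with its usual (usage, &argc, &argv, remove_flags) signature):
  #include "commandlineflags.h"
  #include "tlog.h"

  int main(int argc, char** argv) {
    tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
    tlog(1, "Loaded %d training samples\n", 42);  // printed only when --tlog_level >= 1
    if (TLOG_IS_ON(2)) {
      tlog(2, "Extra per-sample detail\n");
    }
    return 0;
  }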

View File

@@ -1,35 +1,35 @@
#ifndef TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
#define TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
#include "validator.h"
namespace tesseract {
// Subclass of Validator that validates and segments generic unicode into
// grapheme clusters, including Latin with diacritics.
class ValidateGrapheme : public Validator {
public:
ValidateGrapheme(ViramaScript script, bool report_errors)
: Validator(script, report_errors) {}
~ValidateGrapheme() {}
protected:
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
// parts_ and output_. Returns true if a valid Grapheme was consumed,
// otherwise does not increment codes_used_.
bool ConsumeGraphemeIfValid() override;
// Returns the CharClass corresponding to the given Unicode ch.
CharClass UnicodeToCharClass(char32 ch) const override;
private:
// Helper returns true if the sequence prev_ch,ch is invalid.
bool IsBadlyFormed(char32 prev_ch, char32 ch);
// Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel.
static bool IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch);
// Helper returns true if the sequence prev_ch,ch is invalid Thai.
static bool IsBadlyFormedThai(char32 prev_ch, char32 ch);
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_

View File

@@ -1,44 +1,44 @@
#ifndef TESSERACT_TRAINING_VALIDATE_INDIC_H_
#define TESSERACT_TRAINING_VALIDATE_INDIC_H_
#include "validator.h"
namespace tesseract {
// Subclass of Validator that validates and segments Indic scripts in the
// unicode range 0x900-0xdff (Devanagari-Sinhala).
class ValidateIndic : public Validator {
public:
ValidateIndic(ViramaScript script, bool report_errors)
: Validator(script, report_errors) {}
~ValidateIndic() {}
protected:
// Returns whether codes matches the pattern for an Indic Grapheme.
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
// parts_ and output_. Returns true if a valid Grapheme was consumed,
// otherwise does not increment codes_used_.
bool ConsumeGraphemeIfValid() override;
// Returns the CharClass corresponding to the given Unicode ch.
Validator::CharClass UnicodeToCharClass(char32 ch) const override;
private:
// Helper consumes/copies a virama and any associated post-virama joiners.
bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra);
// Helper consumes/copies a series of consonants separated by viramas while
// valid, but not any vowel or other modifiers.
bool ConsumeConsonantHeadIfValid();
// Helper consumes/copies a tail part of a consonant, comprising optional
// matra/piece, vowel modifier, vedic mark, terminating virama.
bool ConsumeConsonantTailIfValid();
// Helper consumes/copies a vowel and optional modifiers.
bool ConsumeVowelIfValid();
// Some special unicodes used only for Indic processing.
static const char32 kYayana = 0xdba; // Sinhala Ya
static const char32 kRayana = 0xdbb; // Sinhala Ra
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_INDIC_H_

View File

@@ -1,106 +1,106 @@
#include "validate_khmer.h"
#include "errcode.h"
#include "tprintf.h"
namespace tesseract {
// Returns whether codes matches the pattern for a Khmer Grapheme.
// Taken from unicode standard:
// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf.
// where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation
// to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf.
// Translated to the codes used by the CharClass enum:
// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}
// Where R is a new symbol (Robat) and N is repurposed as a consonant shifter.
// Also the Consonant class here includes independent vowels, as they are
// treated the same anyway.
// In the split grapheme mode, the only characters that get grouped are the
// HC and the {Z|z}M. The unicode chapter on Khmer only mentions the joiners in
// the BNF syntax, so who knows what they do.
bool ValidateKhmer::ConsumeGraphemeIfValid() {
int num_codes = codes_.size();
if (codes_used_ == num_codes) return false;
if (codes_[codes_used_].first == CharClass::kOther) {
UseMultiCode(1);
return true;
}
if (codes_[codes_used_].first != CharClass::kConsonant) {
if (report_errors_) {
tprintf("Invalid start of Khmer syllable:0x%x\n",
codes_[codes_used_].second);
}
return false;
}
if (UseMultiCode(1)) return true;
if (codes_[codes_used_].first == CharClass::kRobat ||
codes_[codes_used_].first == CharClass::kNukta) {
if (UseMultiCode(1)) return true;
}
while (codes_used_ + 1 < num_codes &&
codes_[codes_used_].first == CharClass::kVirama &&
codes_[codes_used_ + 1].first == CharClass::kConsonant) {
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true;
if (codes_[codes_used_].first == CharClass::kRobat) {
if (UseMultiCode(1)) return true;
}
}
int num_matra_parts = 0;
if (codes_[codes_used_].second == kZeroWidthJoiner ||
codes_[codes_used_].second == kZeroWidthNonJoiner) {
if (CodeOnlyToOutput()) {
if (report_errors_) {
tprintf("Unterminated joiner: 0x%x\n", output_.back());
}
return false;
}
++num_matra_parts;
}
// Not quite as shown by the BNF, the matra piece is allowed as a matra on its
// own or as an addition to other matras.
if (codes_[codes_used_].first == CharClass::kMatra ||
codes_[codes_used_].first == CharClass::kMatraPiece) {
++num_matra_parts;
if (UseMultiCode(num_matra_parts)) return true;
} else if (num_matra_parts) {
if (report_errors_) {
tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
output_.back(), codes_[codes_used_].second);
}
return false;
}
if (codes_[codes_used_].first == CharClass::kMatraPiece &&
codes_[codes_used_ - 1].first != CharClass::kMatraPiece) {
if (UseMultiCode(1)) return true;
}
if (codes_[codes_used_].first == CharClass::kVowelModifier) {
if (UseMultiCode(1)) return true;
}
if (codes_used_ + 1 < num_codes &&
codes_[codes_used_].first == CharClass::kVirama &&
codes_[codes_used_ + 1].first == CharClass::kConsonant) {
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true;
}
return true;
}
Validator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const {
if (IsVedicAccent(ch)) return CharClass::kVedicMark;
if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
// Offset from the start of the relevant unicode code block aka code page.
int off = ch - static_cast<char32>(script_);
// Anything in another code block is other.
if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther;
if (off <= 0x33) return CharClass::kConsonant;
if (off <= 0x45) return CharClass::kMatra;
if (off == 0x46) return CharClass::kMatraPiece;
if (off == 0x4c) return CharClass::kRobat;
if (off == 0x49 || off == 0x4a) return CharClass::kNukta;
if (off <= 0x51) return CharClass::kVowelModifier;
if (off == 0x52) return CharClass::kVirama;
return CharClass::kOther;
}
} // namespace tesseract
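An illustrative trace of the classes assigned by UnicodeToCharClass above (codepoints chosen for the example):
  // U+1780 KHMER LETTER KA      -> kConsonant (C)
  // U+17D2 KHMER SIGN COENG     -> kVirama    (H)
  // U+1780 KHMER LETTER KA      -> kConsonant (C)
  // U+17B6 KHMER VOWEL SIGN AA  -> kMatra     (M)
  // The class sequence C H C M matches the pattern in the comment above, so
  // ConsumeGraphemeIfValid() consumes all four codes as a single grapheme.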
#include "validate_khmer.h"
#include "errcode.h"
#include "tprintf.h"
namespace tesseract {
// Returns whether codes matches the pattern for a Khmer Grapheme.
// Taken from unicode standard:
// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf.
// where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation
// to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf.
// Translated to the codes used by the CharClass enum:
// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}
// Where R is a new symbol (Robat) and N is repurposed as a consonant shifter.
// Also the Consonant class here includes independent vowels, as they are
// treated the same anyway.
// In the split grapheme mode, the only characters that get grouped are the
// HC and the {Z|z}M The unicode chapter on Khmer only mentions the joiners in
// the BNF syntax, so who knows what they do.
bool ValidateKhmer::ConsumeGraphemeIfValid() {
int num_codes = codes_.size();
if (codes_used_ == num_codes) return false;
if (codes_[codes_used_].first == CharClass::kOther) {
UseMultiCode(1);
return true;
}
if (codes_[codes_used_].first != CharClass::kConsonant) {
if (report_errors_) {
tprintf("Invalid start of Khmer syllable:0x%x\n",
codes_[codes_used_].second);
}
return false;
}
if (UseMultiCode(1)) return true;
if (codes_[codes_used_].first == CharClass::kRobat ||
codes_[codes_used_].first == CharClass::kNukta) {
if (UseMultiCode(1)) return true;
}
while (codes_used_ + 1 < num_codes &&
codes_[codes_used_].first == CharClass::kVirama &&
codes_[codes_used_ + 1].first == CharClass::kConsonant) {
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true;
if (codes_[codes_used_].first == CharClass::kRobat) {
if (UseMultiCode(1)) return true;
}
}
int num_matra_parts = 0;
if (codes_[codes_used_].second == kZeroWidthJoiner ||
codes_[codes_used_].second == kZeroWidthNonJoiner) {
if (CodeOnlyToOutput()) {
if (report_errors_) {
tprintf("Unterminated joiner: 0x%x\n", output_.back());
}
return false;
}
++num_matra_parts;
}
// Not quite as shown by the BNF, the matra piece is allowed as a matra on its
// own or as an addition to other matras.
if (codes_[codes_used_].first == CharClass::kMatra ||
codes_[codes_used_].first == CharClass::kMatraPiece) {
++num_matra_parts;
if (UseMultiCode(num_matra_parts)) return true;
} else if (num_matra_parts) {
if (report_errors_) {
tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
output_.back(), codes_[codes_used_].second);
}
return false;
}
if (codes_[codes_used_].first == CharClass::kMatraPiece &&
codes_[codes_used_ - 1].first != CharClass::kMatraPiece) {
if (UseMultiCode(1)) return true;
}
if (codes_[codes_used_].first == CharClass::kVowelModifier) {
if (UseMultiCode(1)) return true;
}
if (codes_used_ + 1 < num_codes &&
codes_[codes_used_].first == CharClass::kVirama &&
codes_[codes_used_ + 1].first == CharClass::kConsonant) {
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true;
}
return true;
}
Validator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const {
if (IsVedicAccent(ch)) return CharClass::kVedicMark;
if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
// Offset from the start of the relevant unicode code block aka code page.
int off = ch - static_cast<char32>(script_);
// Anything in another code block is other.
if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther;
if (off <= 0x33) return CharClass::kConsonant;
if (off <= 0x45) return CharClass::kMatra;
if (off == 0x46) return CharClass::kMatraPiece;
if (off == 0x4c) return CharClass::kRobat;
if (off == 0x49 || off == 0x4a) return CharClass::kNukta;
if (off <= 0x51) return CharClass::kVowelModifier;
if (off == 0x52) return CharClass::kVirama;
return CharClass::kOther;
}
} // namespace tesseract

View File

@@ -1,27 +1,27 @@
#ifndef TESSERACT_TRAINING_VALIDATE_KHMER_H_
#define TESSERACT_TRAINING_VALIDATE_KHMER_H_
#include "validator.h"
namespace tesseract {
// Subclass of Validator that validates and segments Khmer.
class ValidateKhmer : public Validator {
public:
ValidateKhmer(ViramaScript script, bool report_errors)
: Validator(script, report_errors) {}
~ValidateKhmer() {}
protected:
// Returns whether codes matches the pattern for a Khmer Grapheme.
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
// parts_ and output_. Returns true if a valid Grapheme was consumed,
// otherwise does not increment codes_used_.
bool ConsumeGraphemeIfValid() override;
// Returns the CharClass corresponding to the given Unicode ch.
CharClass UnicodeToCharClass(char32 ch) const override;
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_KHMER_H_

View File

@@ -1,160 +1,160 @@
#include "validate_myanmar.h"
#include "errcode.h"
#include "icuerrorcode.h"
#include "tprintf.h"
#include "unicode/uchar.h" // From libicu
#include "unicode/uscript.h" // From libicu
namespace tesseract {
// Returns whether codes matches the pattern for a Myanmar Grapheme.
// Taken directly from the unicode table 16-3.
// See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
bool ValidateMyanmar::ConsumeGraphemeIfValid() {
int num_codes = codes_.size();
if (codes_used_ == num_codes) return true;
// Other.
if (IsMyanmarOther(codes_[codes_used_].second)) {
UseMultiCode(1);
return true;
}
// Kinzi.
if (codes_used_ + 2 < num_codes && codes_[codes_used_].second == 0x1004 &&
codes_[codes_used_ + 1].second == kMyanmarAsat &&
codes_[codes_used_ + 2].second == kMyanmarVirama) {
ASSERT_HOST(!CodeOnlyToOutput());
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(3)) return true;
}
// Base consonant/vowel. NOTE that since everything in Myanmar appears to be
// optional, except the base, this is the only place where invalid input can
// be detected and false returned.
if (IsMyanmarLetter(codes_[codes_used_].second)) {
if (UseMultiCode(1)) return true;
} else {
if (report_errors_) {
tprintf("Invalid start of Myanmar syllable:0x%x\n",
codes_[codes_used_].second);
}
return false; // One of these is required.
}
if (ConsumeSubscriptIfPresent()) return true;
ConsumeOptionalSignsIfPresent();
// What we have consumed so far is a valid syllable.
return true;
}
// TODO(rays) Doesn't use intermediate coding like the other scripts, as there
// is little correspondence between the content of table 16-3 and the char
// classes of the Indic languages. (Experts may disagree and improve!)
// In unicode table 16-3 there is basically a long list of optional characters,
// which can be coded quite easily.
// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!
// The table also allows sequences that still result in dotted circles!!
// So with a lot of guesswork the rest have been added in a reasonable place.
Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const {
if (IsMyanmarLetter(ch)) return CharClass::kConsonant;
return CharClass::kOther;
}
// Helper consumes/copies a virama and any subscript consonant.
// Returns true if the end of input is reached.
bool ValidateMyanmar::ConsumeSubscriptIfPresent() {
// Subscript consonant. It appears there can be only one.
int num_codes = codes_.size();
if (codes_used_ + 1 < num_codes &&
codes_[codes_used_].second == kMyanmarVirama) {
if (IsMyanmarLetter(codes_[codes_used_ + 1].second)) {
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true;
}
}
return false;
}
// Helper consumes/copies a series of optional signs.
// Returns true if the end of input is reached.
bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
// The following characters are allowed, all optional, and in sequence.
// An exception is kMyanmarMedialYa, which can include kMyanmarAsat.
const std::vector<char32> kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c,
0x103d, 0x103e, 0x105e, 0x105f, 0x1060,
0x1081, 0x1031});
for (char32 ch : kMedials) {
if (codes_[codes_used_].second == ch) {
if (UseMultiCode(1)) return true;
if (ch == kMyanmarMedialYa &&
codes_[codes_used_].second == kMyanmarAsat) {
if (UseMultiCode(1)) return true;
}
}
}
// Vowel sign i, ii, ai.
char32 ch = codes_[codes_used_].second;
if (ch == 0x102d || ch == 0x102e || ch == 0x1032) {
if (UseMultiCode(1)) return true;
}
// Vowel sign u, uu, and extensions.
ch = codes_[codes_used_].second;
if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) ||
ch == 0x1062 || ch == 0x1067 || ch == 0x1068 ||
(0x1071 <= ch && ch <= 0x1074) || (0x1083 <= ch && ch <= 0x1086) ||
ch == 0x109c || ch == 0x109d) {
if (UseMultiCode(1)) return true;
}
// Tall aa, aa with optional asat.
if (codes_[codes_used_].second == 0x102b ||
codes_[codes_used_].second == 0x102c) {
if (UseMultiCode(1)) return true;
if (codes_[codes_used_].second == kMyanmarAsat) {
if (UseMultiCode(1)) return true;
}
}
// The following characters are allowed, all optional, and in sequence.
const std::vector<char32> kSigns({0x1036, 0x1037});
for (char32 ch : kSigns) {
if (codes_[codes_used_].second == ch) {
if (UseMultiCode(1)) return true;
}
}
// Tone mark extensions.
ch = codes_[codes_used_].second;
if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 ||
(0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) ||
ch == 0x108f || ch == 0x109a || ch == 0x109b ||
(0xaa7b <= ch && ch <= 0xaa7d)) {
if (UseMultiCode(1)) return true;
}
return false;
}
// Returns true if the unicode is a Myanmar "letter" including consonants
// and independent vowels. Although table 16-3 distinguishes between some
// base consonants and vowels, the extensions make no such distinction, so we
// put them all into a single bucket.
/* static */
bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
return (0x1000 <= ch && ch <= 0x102a) || ch == 0x103f ||
(0x1050 <= ch && ch <= 0x1055) || (0x105a <= ch && ch <= 0x105d) ||
ch == 0x1061 || ch == 0x1065 || ch == 0x1066 ||
(0x106e <= ch && ch <= 0x1070) || (0x1075 <= ch && ch <= 0x1080) ||
ch == 0x108e || (0xa9e0 <= ch && ch <= 0xa9ef) ||
(0xa9fa <= ch && ch <= 0xa9ff) || (0xaa60 <= ch && ch <= 0xaa73) ||
ch == 0xaa7a || ch == 0xaa7e || ch == 0xaa7f;
}
// Returns true if ch is a Myanmar digit or other symbol that does not take
// part in being a syllable.
/* static */
bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
IcuErrorCode err;
UScriptCode script_code = uscript_getScript(ch, err);
if (script_code != USCRIPT_MYANMAR && ch != Validator::kZeroWidthJoiner &&
ch != Validator::kZeroWidthNonJoiner)
return true;
return (0x1040 <= ch && ch <= 0x1049) || (0x1090 <= ch && ch <= 0x1099) ||
(0x109c <= ch && ch <= 0x109d) || (0xa9f0 <= ch && ch <= 0xa9f9) ||
(0xaa74 <= ch && ch <= 0xaa79);
}
} // namespace tesseract
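An illustrative trace through ConsumeGraphemeIfValid above (codepoints chosen for the example):
  // U+1000 MYANMAR LETTER KA                     -> base letter (IsMyanmarLetter)
  // U+1039 virama followed by U+1001 LETTER KHA  -> subscript pair, taken by
  //                                                 ConsumeSubscriptIfPresent()
  // U+102D MYANMAR VOWEL SIGN I                  -> taken by ConsumeOptionalSignsIfPresent()
  // so KA + VIRAMA + KHA + VOWEL SIGN I is accepted as one valid syllable.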
#include "validate_myanmar.h"
#include "errcode.h"
#include "icuerrorcode.h"
#include "tprintf.h"
#include "unicode/uchar.h" // From libicu
#include "unicode/uscript.h" // From libicu
namespace tesseract {
// Returns whether codes matches the pattern for a Myanmar Grapheme.
// Taken directly from the unicode table 16-3.
// See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
bool ValidateMyanmar::ConsumeGraphemeIfValid() {
int num_codes = codes_.size();
if (codes_used_ == num_codes) return true;
// Other.
if (IsMyanmarOther(codes_[codes_used_].second)) {
UseMultiCode(1);
return true;
}
// Kinzi.
if (codes_used_ + 2 < num_codes && codes_[codes_used_].second == 0x1004 &&
codes_[codes_used_ + 1].second == kMyanmarAsat &&
codes_[codes_used_ + 2].second == kMyanmarVirama) {
ASSERT_HOST(!CodeOnlyToOutput());
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(3)) return true;
}
// Base consonant/vowel. NOTE that since everything in Myanmar appears to be
// optional, except the base, this is the only place where invalid input can
// be detected and false returned.
if (IsMyanmarLetter(codes_[codes_used_].second)) {
if (UseMultiCode(1)) return true;
} else {
if (report_errors_) {
tprintf("Invalid start of Myanmar syllable:0x%x\n",
codes_[codes_used_].second);
}
return false; // One of these is required.
}
if (ConsumeSubscriptIfPresent()) return true;
ConsumeOptionalSignsIfPresent();
// What we have consumed so far is a valid syllable.
return true;
}
// TODO(rays) Doesn't use intermediate coding like the other scripts, as there
// is little correspondence between the content of table 16-3 and the char
// classes of the Indic languages. (Experts may disagree and improve!)
// In unicode table 16-3 there is basically a long list of optional characters,
// which can be coded quite easily.
// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!
// The table also allows sequences that still result in dotted circles!!
// So with a lot of guesswork the rest have been added in a reasonable place.
Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const {
if (IsMyanmarLetter(ch)) return CharClass::kConsonant;
return CharClass::kOther;
}
// Helper consumes/copies a virama and any subscript consonant.
// Returns true if the end of input is reached.
bool ValidateMyanmar::ConsumeSubscriptIfPresent() {
// Subscript consonant. It appears there can be only one.
int num_codes = codes_.size();
if (codes_used_ + 1 < num_codes &&
codes_[codes_used_].second == kMyanmarVirama) {
if (IsMyanmarLetter(codes_[codes_used_ + 1].second)) {
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true;
}
}
return false;
}
// Helper consumes/copies a series of optional signs.
// Returns true if the end of input is reached.
bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
// The following characters are allowed, all optional, and in sequence.
// An exception is kMyanmarMedialYa, which can include kMyanmarAsat.
const std::vector<char32> kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c,
0x103d, 0x103e, 0x105e, 0x105f, 0x1060,
0x1081, 0x1031});
for (char32 ch : kMedials) {
if (codes_[codes_used_].second == ch) {
if (UseMultiCode(1)) return true;
if (ch == kMyanmarMedialYa &&
codes_[codes_used_].second == kMyanmarAsat) {
if (UseMultiCode(1)) return true;
}
}
}
// Vowel sign i, ii, ai.
char32 ch = codes_[codes_used_].second;
if (ch == 0x102d || ch == 0x102e || ch == 0x1032) {
if (UseMultiCode(1)) return true;
}
// Vowel sign u, uu, and extensions.
ch = codes_[codes_used_].second;
if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) ||
ch == 0x1062 || ch == 0x1067 || ch == 0x1068 ||
(0x1071 <= ch && ch <= 0x1074) || (0x1083 <= ch && ch <= 0x1086) ||
ch == 0x109c || ch == 0x109d) {
if (UseMultiCode(1)) return true;
}
// Tall aa, aa with optional asat.
if (codes_[codes_used_].second == 0x102b ||
codes_[codes_used_].second == 0x102c) {
if (UseMultiCode(1)) return true;
if (codes_[codes_used_].second == kMyanmarAsat) {
if (UseMultiCode(1)) return true;
}
}
// The following characters are allowed, all optional, and in sequence.
const std::vector<char32> kSigns({0x1036, 0x1037});
for (char32 ch : kSigns) {
if (codes_[codes_used_].second == ch) {
if (UseMultiCode(1)) return true;
}
}
// Tone mark extensions.
ch = codes_[codes_used_].second;
if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 ||
(0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) ||
ch == 0x108f || ch == 0x109a || ch == 0x109b ||
(0xaa7b <= ch && ch <= 0xaa7d)) {
if (UseMultiCode(1)) return true;
}
return false;
}
// Returns true if the unicode is a Myanmar "letter" including consonants
// and independent vowels. Although table 16-3 distinguishes between some
// base consonants and vowels, the extensions make no such distinction, so we
// put them all into a single bucket.
/* static */
bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
return (0x1000 <= ch && ch <= 0x102a) || ch == 0x103f ||
(0x1050 <= ch && ch <= 0x1055) || (0x105a <= ch && ch <= 0x105d) ||
ch == 0x1061 || ch == 0x1065 || ch == 0x1066 ||
(0x106e <= ch && ch <= 0x1070) || (0x1075 <= ch && ch <= 0x1080) ||
ch == 0x108e || (0xa9e0 <= ch && ch <= 0xa9ef) ||
(0xa9fa <= ch && ch <= 0xa9ff) || (0xaa60 <= ch && ch <= 0xaa73) ||
ch == 0xaa7a || ch == 0xaa7e || ch == 0xaa7f;
}
// Returns true if ch is a Myanmar digit or other symbol that does not take
// part in being a syllable.
/* static */
bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
IcuErrorCode err;
UScriptCode script_code = uscript_getScript(ch, err);
if (script_code != USCRIPT_MYANMAR && ch != Validator::kZeroWidthJoiner &&
ch != Validator::kZeroWidthNonJoiner)
return true;
return (0x1040 <= ch && ch <= 0x1049) || (0x1090 <= ch && ch <= 0x1099) ||
(0x109c <= ch && ch <= 0x109d) || (0xa9f0 <= ch && ch <= 0xa9f9) ||
(0xaa74 <= ch && ch <= 0xaa79);
}
} // namespace tesseract

View File

@@ -1,47 +1,47 @@
#ifndef TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
#define TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
#include "validator.h"
namespace tesseract {
// Subclass of Validator that validates and segments Myanmar.
class ValidateMyanmar : public Validator {
public:
ValidateMyanmar(ViramaScript script, bool report_errors)
: Validator(script, report_errors) {}
~ValidateMyanmar() {}
protected:
// Returns whether codes matches the pattern for a Myanmar Grapheme.
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
// parts_ and output_. Returns true if a valid Grapheme was consumed,
// otherwise does not increment codes_used_.
bool ConsumeGraphemeIfValid() override;
// Returns the CharClass corresponding to the given Unicode ch.
Validator::CharClass UnicodeToCharClass(char32 ch) const override;
private:
// Helper consumes/copies a virama and any subscript consonant.
// Returns true if the end of input is reached.
bool ConsumeSubscriptIfPresent();
// Helper consumes/copies a series of optional signs.
// Returns true if the end of input is reached.
bool ConsumeOptionalSignsIfPresent();
// Returns true if the unicode is a Myanmar "letter" including consonants
// and independent vowels. Although table 16-3 distinguishes between some
// base consonants and vowels, the extensions make no such distinction, so we
// put them all into a single bucket.
static bool IsMyanmarLetter(char32 ch);
// Returns true if ch is a Myanmar digit or other symbol that does not take
// part in being a syllable.
static bool IsMyanmarOther(char32 ch);
// Some special unicodes used only for Myanmar processing.
static const char32 kMyanmarAsat = 0x103a;
static const char32 kMyanmarMedialYa = 0x103b;
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATE_MYANMAR_H_

View File

@@ -1,243 +1,243 @@
/**********************************************************************
* File: validator.h
* Description: Base class for various text validators. Intended mainly for
* scripts that use a virama character.
* Author: Ray Smith
* Created: Tue May 23 2017
*
* (C) Copyright 2017, Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_TRAINING_VALIDATOR_H_
#define TESSERACT_TRAINING_VALIDATOR_H_
#include <memory>
#include <vector>
#include "unichar.h"
namespace tesseract {
// Different kinds of grapheme normalization - not just for Indic!
// A grapheme is a syllable unit in Indic and can be several unicodes.
// In other scripts, a grapheme is a base character and accent/diacritic
// combination, as not all accented characters have a single composed form.
enum class GraphemeNormMode {
// Validation result is a single string, even if input is multi-word.
kSingleString,
// Standard unicode graphemes are validated and output as grapheme units.
kCombined,
// Graphemes are validated and sub-divided. For virama-using scripts, units
// that correspond to repeatable glyphs are generated. (Mostly single unicodes
// but viramas and joiners are paired with the most sensible neighbor.)
// For non-virama scripts, this means that base/accent pairs are separated,
// ie the output is individual unicodes.
kGlyphSplit,
// The output is always single unicodes, regardless of the script.
kIndividualUnicodes,
};
// An enum representing the scripts that use a virama character. It is
// guaranteed that the value of any element, (except kNonVirama) can be cast
// to a unicode (char32) value that represents the start of the unicode range
// of the corresponding script.
enum class ViramaScript : char32 {
kNonVirama = 0,
kDevanagari = 0x900,
kBengali = 0x980,
kGurmukhi = 0xa00,
kGujarati = 0xa80,
kOriya = 0xb00,
kTamil = 0xb80,
kTelugu = 0xc00,
kKannada = 0xc80,
kMalayalam = 0xd00,
kSinhala = 0xd80,
kMyanmar = 0x1000,
kKhmer = 0x1780,
};
// Base class offers a validation API and protected methods to allow subclasses
// to easily build the validated/segmented output.
class Validator {
public:
// Validates and cleans the src vector of unicodes to the *dest, according to
// g_mode. In the case of kSingleString, a single vector containing the whole
// result is added to *dest. With kCombined, multiple vectors are added to
// *dest with one grapheme in each. With kGlyphSplit, multiple vectors are
// added to *dest with a smaller unit representing a glyph in each.
// In case of validation error, returns false and as much as possible of the
// input, without discarding invalid text.
static bool ValidateCleanAndSegment(GraphemeNormMode g_mode,
bool report_errors,
const std::vector<char32>& src,
std::vector<std::vector<char32>>* dest);
// Returns true if the unicode ch is a non-printing zero-width mark of no
// significance to OCR training or evaluation.
static bool IsZeroWidthMark(char32 ch) {
return ch == kZeroWidthSpace || ch == kLeftToRightMark ||
ch == kRightToLeftMark || ch == kInvalid;
}
virtual ~Validator() {}
// Some specific but universally useful unicodes.
static const char32 kZeroWidthSpace;
static const char32 kZeroWidthNonJoiner;
static const char32 kZeroWidthJoiner;
static const char32 kLeftToRightMark;
static const char32 kRightToLeftMark;
static const char32 kInvalid;
protected:
// These are more or less the character class identifiers in the ISCII
// standard, section 8. They have been augmented with the Unicode meta
// characters Zero Width Joiner and Zero Width Non Joiner, and the
// Unicode Vedic Marks.
// The best sources of information on Unicode and Indic scripts are:
// http://varamozhi.sourceforge.net/iscii91.pdf
// http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
// http://unicode.org/faq/indic.html
// http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx
enum class CharClass {
// NOTE: The values of the enum members are meaningless and arbitrary, ie
// they are not used for sorting, or any other risky application.
// The reason they are what they are is they are a single character
// abbreviation that can be used in a regexp/BNF definition of a grammar,
// IN A COMMENT, and still not relied upon in the code.
kConsonant = 'C',
kVowel = 'V',
kVirama = 'H', // (aka Halant)
kMatra = 'M', // (aka Dependent Vowel)
kMatraPiece = 'P', // unicode provides pieces of Matras.
kVowelModifier = 'D', // (candrabindu, anusvara, visarga, other marks)
kZeroWidthNonJoiner = 'z', // Unicode Zero Width Non-Joiner U+200C
kZeroWidthJoiner = 'Z', // Unicode Zero Width Joiner U+200D
kVedicMark = 'v', // Modifiers that can modify any Indic syllable.
kNukta = 'N', // Occurs only immediately after consonants.
kRobat = 'R', // Khmer only.
kOther = 'O', // (digits, measures, non-Indic, etc)
// Additional classes used only by ValidateGrapheme.
kWhitespace = ' ',
kCombiner = 'c', // Combiners other than virama.
};
typedef std::pair<CharClass, char32> IndicPair;
Validator(ViramaScript script, bool report_errors)
: script_(script),
codes_used_(0),
output_used_(0),
report_errors_(report_errors) {}
// Factory method that understands how to map script to the right subclass.
static std::unique_ptr<Validator> ScriptValidator(ViramaScript script,
bool report_errors);
// Internal version of the public static ValidateCleanAndSegment.
// Validates and cleans the src vector of unicodes to the *dest, according to
// its type and the given g_mode.
// In case of validation error, returns false and returns as much as possible
// of the input, without discarding invalid text.
bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode,
const std::vector<char32>& src,
std::vector<std::vector<char32>>* dest);
// Moves the results from parts_ or output_ to dest according to g_mode.
void MoveResultsToDest(GraphemeNormMode g_mode,
std::vector<std::vector<char32>>* dest);
// Computes and returns the ViramaScript corresponding to the most frequent
// virama-using script in the input, or kNonVirama if none are present.
static ViramaScript MostFrequentViramaScript(
const std::vector<char32>& utf32);
// Returns true if the given UTF-32 unicode is a "virama" character.
static bool IsVirama(char32 unicode);
// Returns true if the given UTF-32 unicode is a vedic accent.
static bool IsVedicAccent(char32 unicode);
// Returns true if the script is one that uses subscripts for conjuncts.
bool IsSubscriptScript() const;
// Helper function appends the next element of codes_ only to output_,
// without touching parts_
// Returns true at the end of codes_.
bool CodeOnlyToOutput() {
output_.push_back(codes_[codes_used_].second);
return ++codes_used_ == codes_.size();
}
// Helper function adds a length-element vector to parts_ from the last length
// elements of output_. If there are more than length unused elements in
// output_, adds unicodes as single-element vectors to parts_ to catch
// output_used_ up to output_.size() - length before adding the length-element
// vector.
void MultiCodePart(int length) {
while (output_used_ + length < output_.size()) {
parts_.emplace_back(
std::initializer_list<char32>{output_[output_used_++]});
}
parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]});
while (++output_used_ < output_.size()) {
parts_.back().push_back(output_[output_used_]);
}
}
// Helper function appends the next element of codes_ to output_, and then
// calls MultiCodePart to add the appropriate components to parts_.
// Returns true at the end of codes_.
bool UseMultiCode(int length) {
output_.push_back(codes_[codes_used_].second);
MultiCodePart(length);
return ++codes_used_ == codes_.size();
}
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
// parts_ and output_. Returns true if a valid Grapheme was consumed,
// otherwise does not increment codes_used_.
virtual bool ConsumeGraphemeIfValid() = 0;
// Sets codes_ to the class codes for the given unicode text.
void ComputeClassCodes(const std::vector<char32>& text);
// Returns the CharClass corresponding to the given Unicode ch.
virtual CharClass UnicodeToCharClass(char32 ch) const = 0;
// Resets to the initial state.
void Clear();
// Number of unicodes in each Indic codepage.
static const int kIndicCodePageSize = 128;
// Lowest unicode value of any Indic script. (Devanagari).
static const char32 kMinIndicUnicode = 0x900;
// Highest unicode value of any consistent (ISCII-based) Indic script.
static const char32 kMaxSinhalaUnicode = 0xdff;
// Highest unicode value of any virama-using script. (Khmer).
static const char32 kMaxViramaScriptUnicode = 0x17ff;
// Some special unicodes.
static const char32 kSinhalaVirama = 0xdca;
static const char32 kMyanmarVirama = 0x1039;
static const char32 kKhmerVirama = 0x17d2;
// Script we are operating on.
ViramaScript script_;
// Input unicodes with assigned CharClass is the data to be validated.
std::vector<IndicPair> codes_;
// Glyph-like components of the input.
std::vector<std::vector<char32>> parts_;
// Copied validated unicodes from codes_ that are OK to output.
std::vector<char32> output_;
// The number of elements of codes_ that have been processed so far.
int codes_used_;
// The number of elements of output_ that have already been added to parts_.
int output_used_;
// Log error messages for reasons why text is invalid.
bool report_errors_;
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATOR_H_
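A minimal usage sketch of the public API above (hypothetical function name and sample codepoints):
  #include <vector>
  #include "validator.h"

  namespace tesseract {

  // Segments a short Devanagari sample: KA (U+0915) + vowel sign I (U+093F),
  // then the conjunct KA + virama (U+094D) + KA.
  bool DemoSegment(std::vector<std::vector<char32>>* graphemes) {
    const std::vector<char32> text = {0x915, 0x93f, 0x915, 0x94d, 0x915};
    // With kCombined, each inner vector of *graphemes should hold one grapheme
    // cluster: {KA, I} and {KA, VIRAMA, KA}.
    return Validator::ValidateCleanAndSegment(GraphemeNormMode::kCombined,
                                              /*report_errors=*/true, text,
                                              graphemes);
  }

  }  // namespace tesseract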
/**********************************************************************
* File: validator.h
* Description: Base class for various text validators. Intended mainly for
* scripts that use a virama character.
* Author: Ray Smith
* Created: Tue May 23 2017
*
* (C) Copyright 2017, Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_TRAINING_VALIDATOR_H_
#define TESSERACT_TRAINING_VALIDATOR_H_
#include <memory>
#include <vector>
#include "unichar.h"
namespace tesseract {
// Different kinds of grapheme normalization - not just for Indic!
// A grapheme is a syllable unit in Indic and can be several unicodes.
// In other scripts, a grapheme is a base character and accent/diacritic
// combination, as not all accented characters have a single composed form.
enum class GraphemeNormMode {
// Validation result is a single string, even if input is multi-word.
kSingleString,
// Standard unicode graphemes are validated and output as grapheme units.
kCombined,
// Graphemes are validated and sub-divided. For virama-using scripts, units
// that correspond to repeatable glyphs are generated. (Mostly single unicodes
// but viramas and joiners are paired with the most sensible neighbor.)
// For non-virama scripts, this means that base/accent pairs are separated,
// ie the output is individual unicodes.
kGlyphSplit,
// The output is always single unicodes, regardless of the script.
kIndividualUnicodes,
};
// An enum representing the scripts that use a virama character. It is
// guaranteed that the value of any element, (except kNonVirama) can be cast
// to a unicode (char32) value that represents the start of the unicode range
// of the corresponding script.
enum class ViramaScript : char32 {
kNonVirama = 0,
kDevanagari = 0x900,
kBengali = 0x980,
kGurmukhi = 0xa00,
kGujarati = 0xa80,
kOriya = 0xb00,
kTamil = 0xb80,
kTelugu = 0xc00,
kKannada = 0xc80,
kMalayalam = 0xd00,
kSinhala = 0xd80,
kMyanmar = 0x1000,
kKhmer = 0x1780,
};
// Base class offers a validation API and protected methods to allow subclasses
// to easily build the validated/segmented output.
class Validator {
public:
// Validates and cleans the src vector of unicodes to the *dest, according to
// g_mode. In the case of kSingleString, a single vector containing the whole
// result is added to *dest. With kCombined, multiple vectors are added to
// *dest with one grapheme in each. With kGlyphSplit, multiple vectors are
// added to *dest with a smaller unit representing a glyph in each.
// In case of validation error, returns false and as much as possible of the
// input, without discarding invalid text.
static bool ValidateCleanAndSegment(GraphemeNormMode g_mode,
bool report_errors,
const std::vector<char32>& src,
std::vector<std::vector<char32>>* dest);
// Returns true if the unicode ch is a non-printing zero-width mark of no
// significance to OCR training or evaluation.
static bool IsZeroWidthMark(char32 ch) {
return ch == kZeroWidthSpace || ch == kLeftToRightMark ||
ch == kRightToLeftMark || ch == kInvalid;
}
virtual ~Validator() {}
// Some specific but universally useful unicodes.
static const char32 kZeroWidthSpace;
static const char32 kZeroWidthNonJoiner;
static const char32 kZeroWidthJoiner;
static const char32 kLeftToRightMark;
static const char32 kRightToLeftMark;
static const char32 kInvalid;
protected:
// These are more or less the character class identifiers in the ISCII
// standard, section 8. They have been augmented with the Unicode meta
// characters Zero Width Joiner and Zero Width Non Joiner, and the
// Unicode Vedic Marks.
// The best sources of information on Unicode and Indic scripts are:
// http://varamozhi.sourceforge.net/iscii91.pdf
// http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
// http://unicode.org/faq/indic.html
// http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx
enum class CharClass {
// NOTE: The values of the enum members are arbitrary and carry no meaning,
// i.e. they are not used for sorting or any other risky application.
// They were chosen as single-character abbreviations so that they can be
// used in a regexp/BNF definition of a grammar IN A COMMENT, while never
// being relied upon in the code.
kConsonant = 'C',
kVowel = 'V',
kVirama = 'H', // (aka Halant)
kMatra = 'M', // (aka Dependent Vowel)
kMatraPiece = 'P', // Unicode provides pieces of Matras.
kVowelModifier = 'D', // (candrabindu, anusvara, visarga, other marks)
kZeroWidthNonJoiner = 'z', // Unicode Zero Width Non-Joiner U+200C
kZeroWidthJoiner = 'Z', // Unicode Zero Width Joiner U+200D
kVedicMark = 'v', // Marks that can modify any Indic syllable.
kNukta = 'N', // Occurs only immediately after consonants.
kRobat = 'R', // Khmer only.
kOther = 'O', // (digits, measures, non-Indic, etc)
// Additional classes used only by ValidateGrapheme.
kWhitespace = ' ',
kCombiner = 'c', // Combiners other than virama.
};
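// Illustrative sketch only (the real rules live in the script-specific
// subclasses): with these codes, a simple Devanagari syllable might be
// described in a grammar comment as
//   (C N? H)* C N? M? D? v*
// i.e. zero or more consonant(+nukta)+virama pairs, a final consonant with
// optional nukta, then optional matra, vowel modifier, and vedic marks.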
typedef std::pair<CharClass, char32> IndicPair;
Validator(ViramaScript script, bool report_errors)
: script_(script),
codes_used_(0),
output_used_(0),
report_errors_(report_errors) {}
// Factory method that understands how to map script to the right subclass.
static std::unique_ptr<Validator> ScriptValidator(ViramaScript script,
bool report_errors);
// Internal version of the public static ValidateCleanAndSegment.
// Validates and cleans the src vector of unicodes to the *dest, according to
// its type and the given g_mode.
// In case of a validation error, returns false and outputs as much of the
// input as possible, without discarding the invalid text.
bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode,
const std::vector<char32>& src,
std::vector<std::vector<char32>>* dest);
// Moves the results from parts_ or output_ to dest according to g_mode.
void MoveResultsToDest(GraphemeNormMode g_mode,
std::vector<std::vector<char32>>* dest);
// Computes and returns the ViramaScript corresponding to the most frequent
// virama-using script in the input, or kNonVirama if none are present.
static ViramaScript MostFrequentViramaScript(
const std::vector<char32>& utf32);
// Returns true if the given UTF-32 unicode is a "virama" character.
static bool IsVirama(char32 unicode);
// Returns true if the given UTF-32 unicode is a vedic accent.
static bool IsVedicAccent(char32 unicode);
// Returns true if the script is one that uses subscripts for conjuncts.
bool IsSubscriptScript() const;
// Helper function appends the next element of codes_ only to output_,
// without touching parts_.
// Returns true at the end of codes_.
bool CodeOnlyToOutput() {
output_.push_back(codes_[codes_used_].second);
return ++codes_used_ == codes_.size();
}
// Helper function adds a length-element vector to parts_ from the last length
// elements of output_. If there are more than length unused elements in
// output_, adds unicodes as single-element vectors to parts_ to catch
// output_used_ up to output_.size() - length before adding the length-element
// vector.
void MultiCodePart(int length) {
while (output_used_ + length < output_.size()) {
parts_.emplace_back(
std::initializer_list<char32>{output_[output_used_++]});
}
parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]});
while (++output_used_ < output_.size()) {
parts_.back().push_back(output_[output_used_]);
}
}
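// Worked example for MultiCodePart (a sketch): with output_ = {A, B, C, D},
// output_used_ = 0 and length = 2, the first loop adds {A} and {B} to parts_
// as single-element vectors, then {C, D} is added as the final length-element
// part, leaving output_used_ == output_.size().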
// Helper function appends the next element of codes_ to output_, and then
// calls MultiCodePart to add the appropriate components to parts_.
// Returns true at the end of codes_.
bool UseMultiCode(int length) {
output_.push_back(codes_[codes_used_].second);
MultiCodePart(length);
return ++codes_used_ == codes_.size();
}
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
// parts_ and output_. Returns true if a valid Grapheme was consumed,
// otherwise does not increment codes_used_.
virtual bool ConsumeGraphemeIfValid() = 0;
// Sets codes_ to the class codes for the given unicode text.
void ComputeClassCodes(const std::vector<char32>& text);
// Returns the CharClass corresponding to the given Unicode ch.
virtual CharClass UnicodeToCharClass(char32 ch) const = 0;
// Resets to the initial state.
void Clear();
// Number of unicodes in each Indic codepage.
static const int kIndicCodePageSize = 128;
// Lowest unicode value of any Indic script. (Devanagari).
static const char32 kMinIndicUnicode = 0x900;
// Highest unicode value of any consistent (ISCII-based) Indic script.
static const char32 kMaxSinhalaUnicode = 0xdff;
// Highest unicode value of any virama-using script. (Khmer).
static const char32 kMaxViramaScriptUnicode = 0x17ff;
// Some special unicodes.
static const char32 kSinhalaVirama = 0xdca;
static const char32 kMyanmarVirama = 0x1039;
static const char32 kKhmerVirama = 0x17d2;
// Script we are operating on.
ViramaScript script_;
// The input unicodes with assigned CharClass form the data to be validated.
std::vector<IndicPair> codes_;
// Glyph-like components of the input.
std::vector<std::vector<char32>> parts_;
// Copied validated unicodes from codes_ that are OK to output.
std::vector<char32> output_;
// The number of elements of codes_ that have been processed so far.
int codes_used_;
// The number of elements of output_ that have already been added to parts_.
int output_used_;
// Log error messages for reasons why text is invalid.
bool report_errors_;
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATOR_H_