mirror of https://github.com/tesseract-ocr/tesseract.git (synced 2025-06-06 17:32:41 +08:00)

commit 104fe7931c (parent ca5c15e6a8): Move training to src.
CMakeLists.txt
@@ -308,7 +308,7 @@ if (BUILD_TESTS AND EXISTS ${PROJECT_SOURCE_DIR}/googletest/CMakeLists.txt)
endif()

if (BUILD_TRAINING_TOOLS)
-    add_subdirectory(training)
+    add_subdirectory(src/training)
endif()

get_target_property(tesseract_NAME libtesseract NAME)
configure.ac
@@ -502,7 +502,7 @@ AC_CONFIG_FILES([java/com/google/scrollview/Makefile])
AC_CONFIG_FILES([java/com/google/scrollview/events/Makefile])
AC_CONFIG_FILES([java/com/google/scrollview/ui/Makefile])
AC_CONFIG_FILES([doc/Makefile])
-AM_COND_IF([ENABLE_TRAINING], [AC_CONFIG_FILES(training/Makefile)])
+AM_COND_IF([ENABLE_TRAINING], [AC_CONFIG_FILES(src/training/Makefile)])
AC_OUTPUT

# Final message
cppan.yml (86 changed lines)
@@ -172,7 +172,7 @@ projects:
    tessopt:
        type: lib
        static_only: true
-        files: training/tessopt.*
+        files: src/training/tessopt.*
        include_directories: training
        dependencies: libtesseract

@@ -180,104 +180,104 @@ projects:
        type: lib
        static_only: true
        files:
-            - training/commandlineflags.cpp
-            - training/commandlineflags.h
-            - training/commontraining.cpp
-            - training/commontraining.h
+            - src/training/commandlineflags.cpp
+            - src/training/commandlineflags.h
+            - src/training/commontraining.cpp
+            - src/training/commontraining.h
        include_directories: training
        dependencies:
            - tessopt

    ambiguous_words:
-        files: training/ambiguous_words.cpp
+        files: src/training/ambiguous_words.cpp
        dependencies:
            - libtesseract

    classifier_tester:
-        files: training/classifier_tester.cpp
+        files: src/training/classifier_tester.cpp
        dependencies: common_training

    combine_lang_model:
-        files: training/combine_lang_model.cpp
+        files: src/training/combine_lang_model.cpp
        dependencies: unicharset_training

    combine_tessdata:
-        files: training/combine_tessdata.cpp
+        files: src/training/combine_tessdata.cpp
        dependencies: libtesseract

    cntraining:
-        files: training/cntraining.cpp
+        files: src/training/cntraining.cpp
        dependencies: common_training

    dawg2wordlist:
-        files: training/dawg2wordlist.cpp
+        files: src/training/dawg2wordlist.cpp
        dependencies: libtesseract

    mftraining:
        files:
-            - training/mftraining.cpp
-            - training/mergenf.*
+            - src/training/mftraining.cpp
+            - src/training/mergenf.*
        dependencies: common_training

    shapeclustering:
-        files: training/shapeclustering.cpp
+        files: src/training/shapeclustering.cpp
        dependencies: common_training

    unicharset_extractor:
-        files: training/unicharset_extractor.cpp
+        files: src/training/unicharset_extractor.cpp
        dependencies: unicharset_training

    wordlist2dawg:
-        files: training/wordlist2dawg.cpp
+        files: src/training/wordlist2dawg.cpp
        dependencies: libtesseract

    unicharset_training:
        type: lib
        static_only: true
        files:
-            - training/fileio.*
-            - training/icuerrorcode.h
-            - training/lang_model_helpers.*
-            - training/lstmtester.*
-            - training/normstrngs.*
-            - training/unicharset_training_utils.*
-            - training/validat.*
+            - src/training/fileio.*
+            - src/training/icuerrorcode.h
+            - src/training/lang_model_helpers.*
+            - src/training/lstmtester.*
+            - src/training/normstrngs.*
+            - src/training/unicharset_training_utils.*
+            - src/training/validat.*
        include_directories: training
        dependencies:
            - common_training
            - pvt.cppan.demo.unicode.icu.i18n

    lstmeval:
-        files: training/lstmeval.cpp
+        files: src/training/lstmeval.cpp
        dependencies: unicharset_training

    lstmtraining:
-        files: training/lstmtraining.cpp
+        files: src/training/lstmtraining.cpp
        dependencies: unicharset_training

    set_unicharset_properties:
-        files: training/set_unicharset_properties.cpp
+        files: src/training/set_unicharset_properties.cpp
        dependencies: unicharset_training

    text2image:
        files:
-            - training/text2image.cpp
-            - training/boxchar.cpp
-            - training/boxchar.h
-            - training/degradeimage.cpp
-            - training/degradeimage.h
-            - training/ligature_table.cpp
-            - training/ligature_table.h
-            - training/normstrngs.cpp
-            - training/normstrngs.h
-            - training/pango_font_info.cpp
-            - training/pango_font_info.h
-            - training/stringrenderer.cpp
-            - training/stringrenderer.h
-            - training/tlog.cpp
-            - training/tlog.h
-            - training/util.h
-            - training/icuerrorcode.h
+            - src/training/text2image.cpp
+            - src/training/boxchar.cpp
+            - src/training/boxchar.h
+            - src/training/degradeimage.cpp
+            - src/training/degradeimage.h
+            - src/training/ligature_table.cpp
+            - src/training/ligature_table.h
+            - src/training/normstrngs.cpp
+            - src/training/normstrngs.h
+            - src/training/pango_font_info.cpp
+            - src/training/pango_font_info.h
+            - src/training/stringrenderer.cpp
+            - src/training/stringrenderer.h
+            - src/training/tlog.cpp
+            - src/training/tlog.h
+            - src/training/util.h
+            - src/training/icuerrorcode.h

        dependencies:
            - unicharset_training
training/degradeimage.cpp → src/training/degradeimage.cpp (file moved, content unchanged)
@@ -1,310 +1,310 @@
/**********************************************************************
 * File:        degradeimage.cpp
 * Description: Function to degrade an image (usually of text) as if it
 *              has been printed and then scanned.
 * Authors:     Ray Smith
 * Created:     Tue Nov 19 2013
 *
 * (C) Copyright 2013, Google Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **********************************************************************/

#include "degradeimage.h"

#include <stdlib.h>
#include "allheaders.h"   // from leptonica
#include "genericvector.h"
#include "helpers.h"  // For TRand.
#include "rect.h"

namespace tesseract {

// A randomized perspective distortion can be applied to synthetic input.
// The perspective distortion comes from leptonica, which uses 2 sets of 4
// corners to determine the distortion. There are random values for each of
// the x numbers x0..x3 and y0..y3, except for x2 and x3 which are instead
// defined in terms of a single shear value. This reduces the degrees of
// freedom enough to make the distortion more realistic than it would otherwise
// be if all 8 coordinates could move independently.
// One additional factor is used for the color of the pixels that don't exist
// in the source image.
// Name for each of the randomizing factors.
enum FactorNames {
  FN_INCOLOR,
  FN_Y0,
  FN_Y1,
  FN_Y2,
  FN_Y3,
  FN_X0,
  FN_X1,
  FN_SHEAR,
  // x2 = x1 - shear
  // x3 = x0 + shear
  FN_NUM_FACTORS
};

// Rotation is +/- kRotationRange radians.
const float kRotationRange = 0.02f;
// Number of grey levels to shift by for each exposure step.
const int kExposureFactor = 16;
// Salt and pepper noise is +/- kSaltnPepper.
const int kSaltnPepper = 5;
// Min sum of width + height on which to operate the ramp.
const int kMinRampSize = 1000;

// Degrade the pix as if by a print/copy/scan cycle with exposure > 0
// corresponding to darkening on the copier and <0 lighter and 0 not copied.
// Exposures in [-2,2] are most useful, with -3 and 3 being extreme.
// If rotation is nullptr, rotation is skipped. If *rotation is non-zero, the
// pix is rotated by *rotation else it is randomly rotated and *rotation is
// modified.
//
// HOW IT WORKS:
// Most of the process is really dictated by the fact that the minimum
// available convolution is 3X3, which is too big really to simulate a
// good quality print/scan process. (2X2 would be better.)
// 1 pixel wide inputs are heavily smeared by the 3X3 convolution, making the
// images generally biased to being too light, so most of the work is to make
// them darker. 3 levels of thickening/darkening are achieved with 2 dilations,
// (using a greyscale erosion) one heavy (by being before convolution) and one
// light (after convolution).
// With no dilation, after covolution, the images are so light that a heavy
// constant offset is required to make the 0 image look reasonable. A simple
// constant offset multiple of exposure to undo this value is enough to achieve
// all the required lightening. This gives the advantage that exposure level 1
// with a single dilation gives a good impression of the broken-yet-too-dark
// problem that is often seen in scans.
// A small random rotation gives some varying greyscale values on the edges,
// and some random salt and pepper noise on top helps to realistically jaggy-up
// the edges.
// Finally a greyscale ramp provides a continuum of effects between exposure
// levels.
Pix* DegradeImage(Pix* input, int exposure, TRand* randomizer,
                  float* rotation) {
  Pix* pix = pixConvertTo8(input, false);
  pixDestroy(&input);
  input = pix;
  int width = pixGetWidth(input);
  int height = pixGetHeight(input);
  if (exposure >= 2) {
    // An erosion simulates the spreading darkening of a dark copy.
    // This is backwards to binary morphology,
    // see http://www.leptonica.com/grayscale-morphology.html
    pix = input;
    input = pixErodeGray(pix, 3, 3);
    pixDestroy(&pix);
  }
  // A convolution is essential to any mode as no scanner produces an
  // image as sharp as the electronic image.
  pix = pixBlockconv(input, 1, 1);
  pixDestroy(&input);
  // A small random rotation helps to make the edges jaggy in a realistic way.
  if (rotation != nullptr) {
    float radians_clockwise = 0.0f;
    if (*rotation) {
      radians_clockwise = *rotation;
    } else if (randomizer != nullptr) {
      radians_clockwise = randomizer->SignedRand(kRotationRange);
    }

    input = pixRotate(pix, radians_clockwise,
                      L_ROTATE_AREA_MAP, L_BRING_IN_WHITE,
                      0, 0);
    // Rotate the boxes to match.
    *rotation = radians_clockwise;
    pixDestroy(&pix);
  } else {
    input = pix;
  }

  if (exposure >= 3 || exposure == 1) {
    // Erosion after the convolution is not as heavy as before, so it is
    // good for level 1 and in addition as a level 3.
    // This is backwards to binary morphology,
    // see http://www.leptonica.com/grayscale-morphology.html
    pix = input;
    input = pixErodeGray(pix, 3, 3);
    pixDestroy(&pix);
  }
  // The convolution really needed to be 2x2 to be realistic enough, but
  // we only have 3x3, so we have to bias the image darker or lose thin
  // strokes.
  int erosion_offset = 0;
  // For light and 0 exposure, there is no dilation, so compensate for the
  // convolution with a big darkening bias which is undone for lighter
  // exposures.
  if (exposure <= 0)
    erosion_offset = -3 * kExposureFactor;
  // Add in a general offset of the greyscales for the exposure level so
  // a threshold of 128 gives a reasonable binary result.
  erosion_offset -= exposure * kExposureFactor;
  // Add a gradual fade over the page and a small amount of salt and pepper
  // noise to simulate noise in the sensor/paper fibres and varying
  // illumination.
  l_uint32* data = pixGetData(input);
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      int pixel = GET_DATA_BYTE(data, x);
      if (randomizer != nullptr)
        pixel += randomizer->IntRand() % (kSaltnPepper*2 + 1) - kSaltnPepper;
      if (height + width > kMinRampSize)
        pixel -= (2*x + y) * 32 / (height + width);
      pixel += erosion_offset;
      if (pixel < 0)
        pixel = 0;
      if (pixel > 255)
        pixel = 255;
      SET_DATA_BYTE(data, x, pixel);
    }
    data += input->wpl;
  }
  return input;
}

// Creates and returns a Pix distorted by various means according to the bool
// flags. If boxes is not nullptr, the boxes are resized/positioned according to
// any spatial distortion and also by the integer reduction factor box_scale
// so they will match what the network will output.
// Returns nullptr on error. The returned Pix must be pixDestroyed.
Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert,
                         bool white_noise, bool smooth_noise, bool blur,
                         int box_reduction, TRand* randomizer,
                         GenericVector<TBOX>* boxes) {
  Pix* distorted = pixCopy(nullptr, const_cast<Pix*>(pix));
  // Things to do to synthetic training data.
  if (invert && randomizer->SignedRand(1.0) < 0)
    pixInvert(distorted, distorted);
  if ((white_noise || smooth_noise) && randomizer->SignedRand(1.0) > 0.0) {
    // TODO(rays) Cook noise in a more thread-safe manner than rand().
    // Attempt to make the sequences reproducible.
    srand(randomizer->IntRand());
    Pix* pixn = pixAddGaussianNoise(distorted, 8.0);
    pixDestroy(&distorted);
    if (smooth_noise) {
      distorted = pixBlockconv(pixn, 1, 1);
      pixDestroy(&pixn);
    } else {
      distorted = pixn;
    }
  }
  if (blur && randomizer->SignedRand(1.0) > 0.0) {
    Pix* blurred = pixBlockconv(distorted, 1, 1);
    pixDestroy(&distorted);
    distorted = blurred;
  }
  if (perspective)
    GeneratePerspectiveDistortion(0, 0, randomizer, &distorted, boxes);
  if (boxes != nullptr) {
    for (int b = 0; b < boxes->size(); ++b) {
      (*boxes)[b].scale(1.0f / box_reduction);
      if ((*boxes)[b].width() <= 0)
        (*boxes)[b].set_right((*boxes)[b].left() + 1);
    }
  }
  return distorted;
}

// Distorts anything that has a non-null pointer with the same pseudo-random
// perspective distortion. Width and height only need to be set if there
// is no pix. If there is a pix, then they will be taken from there.
void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer,
                                   Pix** pix, GenericVector<TBOX>* boxes) {
  if (pix != nullptr && *pix != nullptr) {
    width = pixGetWidth(*pix);
    height = pixGetHeight(*pix);
  }
  float* im_coeffs = nullptr;
  float* box_coeffs = nullptr;
  l_int32 incolor =
      ProjectiveCoeffs(width, height, randomizer, &im_coeffs, &box_coeffs);
  if (pix != nullptr && *pix != nullptr) {
    // Transform the image.
    Pix* transformed = pixProjective(*pix, im_coeffs, incolor);
    if (transformed == nullptr) {
      tprintf("Projective transformation failed!!\n");
      return;
    }
    pixDestroy(pix);
    *pix = transformed;
  }
  if (boxes != nullptr) {
    // Transform the boxes.
    for (int b = 0; b < boxes->size(); ++b) {
      int x1, y1, x2, y2;
      const TBOX& box = (*boxes)[b];
      projectiveXformSampledPt(box_coeffs, box.left(), height - box.top(), &x1,
                               &y1);
      projectiveXformSampledPt(box_coeffs, box.right(), height - box.bottom(),
                               &x2, &y2);
      TBOX new_box1(x1, height - y2, x2, height - y1);
      projectiveXformSampledPt(box_coeffs, box.left(), height - box.bottom(),
                               &x1, &y1);
      projectiveXformSampledPt(box_coeffs, box.right(), height - box.top(), &x2,
                               &y2);
      TBOX new_box2(x1, height - y1, x2, height - y2);
      (*boxes)[b] = new_box1.bounding_union(new_box2);
    }
  }
  free(im_coeffs);
  free(box_coeffs);
}

// Computes the coefficients of a randomized projective transformation.
// The image transform requires backward transformation coefficient, and the
// box transform the forward coefficients.
// Returns the incolor arg to pixProjective.
int ProjectiveCoeffs(int width, int height, TRand* randomizer,
                     float** im_coeffs, float** box_coeffs) {
  // Setup "from" points.
  Pta* src_pts = ptaCreate(4);
  ptaAddPt(src_pts, 0.0f, 0.0f);
  ptaAddPt(src_pts, width, 0.0f);
  ptaAddPt(src_pts, width, height);
  ptaAddPt(src_pts, 0.0f, height);
  // Extract factors from pseudo-random sequence.
  float factors[FN_NUM_FACTORS];
  float shear = 0.0f;  // Shear is signed.
  for (int i = 0; i < FN_NUM_FACTORS; ++i) {
    // Everything is squared to make wild values rarer.
    if (i == FN_SHEAR) {
      // Shear is signed.
      shear = randomizer->SignedRand(0.5 / 3.0);
      shear = shear >= 0.0 ? shear * shear : -shear * shear;
      // Keep the sheared points within the original rectangle.
      if (shear < -factors[FN_X0]) shear = -factors[FN_X0];
      if (shear > factors[FN_X1]) shear = factors[FN_X1];
      factors[i] = shear;
    } else if (i != FN_INCOLOR) {
      factors[i] = fabs(randomizer->SignedRand(1.0));
      if (i <= FN_Y3)
        factors[i] *= 5.0 / 8.0;
      else
        factors[i] *= 0.5;
      factors[i] *= factors[i];
    }
  }
  // Setup "to" points.
  Pta* dest_pts = ptaCreate(4);
  ptaAddPt(dest_pts, factors[FN_X0] * width, factors[FN_Y0] * height);
  ptaAddPt(dest_pts, (1.0f - factors[FN_X1]) * width, factors[FN_Y1] * height);
  ptaAddPt(dest_pts, (1.0f - factors[FN_X1] + shear) * width,
           (1 - factors[FN_Y2]) * height);
  ptaAddPt(dest_pts, (factors[FN_X0] + shear) * width,
           (1 - factors[FN_Y3]) * height);
  getProjectiveXformCoeffs(dest_pts, src_pts, im_coeffs);
  getProjectiveXformCoeffs(src_pts, dest_pts, box_coeffs);
  ptaDestroy(&src_pts);
  ptaDestroy(&dest_pts);
  return factors[FN_INCOLOR] > 0.5f ? L_BRING_IN_WHITE : L_BRING_IN_BLACK;
}

}  // namespace tesseract
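For context on how the moved DegradeImage() is consumed, here is a minimal, hypothetical caller sketch (not part of this commit); it assumes the caller owns a Leptonica Pix and a tesseract::TRand seeded elsewhere, roughly as text2image does, and the wrapper name is illustrative only:

// Hypothetical sketch: degrade a rendered page image before training.
// Assumes "page" is an 8-bit (or binary 0/255) Pix owned by the caller.
#include "degradeimage.h"
#include "helpers.h"  // tesseract::TRand

Pix* DegradeForTraining(Pix* page, int exposure, tesseract::TRand* rand) {
  float rotation = 0.0f;  // 0 => pick a random rotation and report it here.
  // DegradeImage takes ownership of "page" and returns a new degraded Pix.
  Pix* degraded = tesseract::DegradeImage(page, exposure, rand, &rotation);
  return degraded;  // Caller must pixDestroy() the result.
}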
training/degradeimage.h → src/training/degradeimage.h (file moved, content unchanged)
@@ -1,61 +1,61 @@
/**********************************************************************
 * File:        degradeimage.h
 * Description: Function to degrade an image (usually of text) as if it
 *              has been printed and then scanned.
 * Authors:     Ray Smith
 * Created:     Tue Nov 19 2013
 *
 * (C) Copyright 2013, Google Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **********************************************************************/
#ifndef TESSERACT_TRAINING_DEGRADEIMAGE_H_
#define TESSERACT_TRAINING_DEGRADEIMAGE_H_

#include "allheaders.h"
#include "genericvector.h"
#include "helpers.h"  // For TRand.
#include "rect.h"

namespace tesseract {

// Degrade the pix as if by a print/copy/scan cycle with exposure > 0
// corresponding to darkening on the copier and <0 lighter and 0 not copied.
// If rotation is not nullptr, the clockwise rotation in radians is saved there.
// The input pix must be 8 bit grey. (Binary with values 0 and 255 is OK.)
// The input image is destroyed and a different image returned.
struct Pix* DegradeImage(struct Pix* input, int exposure, TRand* randomizer,
                         float* rotation);

// Creates and returns a Pix distorted by various means according to the bool
// flags. If boxes is not nullptr, the boxes are resized/positioned according to
// any spatial distortion and also by the integer reduction factor box_scale
// so they will match what the network will output.
// Returns nullptr on error. The returned Pix must be pixDestroyed.
Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert,
                         bool white_noise, bool smooth_noise, bool blur,
                         int box_reduction, TRand* randomizer,
                         GenericVector<TBOX>* boxes);
// Distorts anything that has a non-null pointer with the same pseudo-random
// perspective distortion. Width and height only need to be set if there
// is no pix. If there is a pix, then they will be taken from there.
void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer,
                                   Pix** pix, GenericVector<TBOX>* boxes);
// Computes the coefficients of a randomized projective transformation.
// The image transform requires backward transformation coefficient, and the
// box transform the forward coefficients.
// Returns the incolor arg to pixProjective.
int ProjectiveCoeffs(int width, int height, TRand* randomizer,
                     float** im_coeffs, float** box_coeffs);

}  // namespace tesseract

#endif  // TESSERACT_TRAINING_DEGRADEIMAGE_H_
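A minimal sketch of the distortion pipeline declared above (illustrative, not part of this commit); it assumes "src" is a caller-owned Pix, "boxes" holds character boxes in the same coordinate space, and the wrapper name and downsampling factor are assumptions for the example:

// Hypothetical sketch: apply perspective + smooth noise and scale the boxes
// by 1/4 so they line up with a network that downsamples its input by 4.
#include "degradeimage.h"

Pix* DistortForLstmTraining(const Pix* src, tesseract::TRand* rand,
                            GenericVector<TBOX>* boxes) {
  return tesseract::PrepareDistortedPix(src, /*perspective=*/true,
                                        /*invert=*/false,
                                        /*white_noise=*/false,
                                        /*smooth_noise=*/true,
                                        /*blur=*/false,
                                        /*box_reduction=*/4, rand, boxes);
  // The returned Pix must be pixDestroyed by the caller.
}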
training/icuerrorcode.h → src/training/icuerrorcode.h (file moved, content unchanged)
@@ -1,66 +1,66 @@
/**********************************************************************
 * File:        icuerrorcode.h
 * Description: Wrapper class for UErrorCode, with conversion operators for
 *              direct use in ICU C and C++ APIs.
 * Author:      Fredrik Roubert
 * Created:     Thu July 4 2013
 *
 * Features:
 * - The constructor initializes the internal UErrorCode to U_ZERO_ERROR,
 *   removing one common source of errors.
 * - Same use in C APIs taking a UErrorCode* (pointer) and C++ taking
 *   UErrorCode& (reference), via conversion operators.
 * - Automatic checking for success when it goes out of scope. On failure,
 *   the destructor will log an error message and exit.
 *
 * Most of ICU will handle errors gracefully and provide sensible fallbacks.
 * Using IcuErrorCode, it is therefore possible to write very compact code
 * that does sensible things on failure and provides logging for debugging.
 *
 * Example:
 * IcuErrorCode icuerrorcode;
 * return collator.compareUTF8(a, b, icuerrorcode) == UCOL_EQUAL;
 *
 * (C) Copyright 2013, Google Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **********************************************************************/
#ifndef TESSERACT_CCUTIL_ICUERRORCODE_H_
#define TESSERACT_CCUTIL_ICUERRORCODE_H_

#include "tprintf.h"
#include "unicode/errorcode.h"  // From libicu

namespace tesseract {

class IcuErrorCode : public icu::ErrorCode {
 public:
  IcuErrorCode() {}
  virtual ~IcuErrorCode() {
    if (isFailure()) {
      handleFailure();
    }
  }

 protected:
  virtual void handleFailure() const {
    tprintf("ICU ERROR: %s", errorName());
    exit(errorCode);
  }

 private:
  // Disallow implicit copying of object.
  IcuErrorCode(const IcuErrorCode&);
  void operator=(const IcuErrorCode&);
};

}  // namespace tesseract
#endif  // TESSERACT_CCUTIL_ICUERRORCODE_H_
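Expanding the header's own collator example into a self-contained sketch (illustrative only; the function name and includes are assumptions): IcuErrorCode converts implicitly to UErrorCode& / UErrorCode*, and if it still holds a failure when it goes out of scope the destructor logs the ICU error name and exits.

#include <unicode/coll.h>  // icu::Collator, UCOL_EQUAL
#include "icuerrorcode.h"

// Returns true if a and b compare equal under the given collator.
bool Utf8Equal(const icu::Collator& collator, const char* a, const char* b) {
  tesseract::IcuErrorCode status;  // Starts as U_ZERO_ERROR.
  return collator.compareUTF8(a, b, status) == UCOL_EQUAL;
}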
training/language-specific.sh → src/training/language-specific.sh (0 changed lines; Executable file → Normal file)
training/mergenf.cpp → src/training/mergenf.cpp (file moved, content unchanged)
@@ -1,353 +1,353 @@
/******************************************************************************
 ** Filename: MergeNF.c
 ** Purpose:  Program for merging similar nano-feature protos
 ** Author:   Dan Johnson
 ** History:  Wed Nov 21 09:55:23 1990, DSJ, Created.
 **
 ** (c) Copyright Hewlett-Packard Company, 1988.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 ******************************************************************************/
#include "mergenf.h"
#include "host.h"
#include "efio.h"
#include "clusttool.h"
#include "cluster.h"
#include "oldlist.h"
#include "protos.h"
#include "ndminx.h"
#include "ocrfeatures.h"
#include "const.h"
#include "featdefs.h"
#include "intproto.h"
#include "params.h"

#include <stdio.h>
#include <string.h>
#include <math.h>

/*-------------------once in subfeat---------------------------------*/
double_VAR(training_angle_match_scale, 1.0, "Angle Match Scale ...");

double_VAR(training_similarity_midpoint, 0.0075, "Similarity Midpoint ...");

double_VAR(training_similarity_curl, 2.0, "Similarity Curl ...");

/*-----------------------------once in fasttrain----------------------------------*/
double_VAR(training_tangent_bbox_pad, 0.5, "Tangent bounding box pad ...");

double_VAR(training_orthogonal_bbox_pad, 2.5, "Orthogonal bounding box pad ...");

double_VAR(training_angle_pad, 45.0, "Angle pad ...");

/**
 * Compare protos p1 and p2 and return an estimate of the
 * worst evidence rating that will result for any part of p1
 * that is compared to p2. In other words, if p1 were broken
 * into pico-features and each pico-feature was matched to p2,
 * what is the worst evidence rating that will be achieved for
 * any pico-feature.
 *
 * @param p1, p2 protos to be compared
 *
 * Globals: none
 *
 * @return Worst possible result when matching p1 to p2.
 * @note Exceptions: none
 * @note History: Mon Nov 26 08:27:53 1990, DSJ, Created.
 */
FLOAT32 CompareProtos(PROTO p1, PROTO p2) {
  FEATURE Feature;
  FLOAT32 WorstEvidence = WORST_EVIDENCE;
  FLOAT32 Evidence;
  FLOAT32 Angle, Length;

  /* if p1 and p2 are not close in length, don't let them match */
  Length = fabs (p1->Length - p2->Length);
  if (Length > MAX_LENGTH_MISMATCH)
    return (0.0);

  /* create a dummy pico-feature to be used for comparisons */
  Feature = NewFeature (&PicoFeatDesc);
  Feature->Params[PicoFeatDir] = p1->Angle;

  /* convert angle to radians */
  Angle = p1->Angle * 2.0 * PI;

  /* find distance from center of p1 to 1/2 picofeat from end */
  Length = p1->Length / 2.0 - GetPicoFeatureLength () / 2.0;
  if (Length < 0) Length = 0;

  /* set the dummy pico-feature at one end of p1 and match it to p2 */
  Feature->Params[PicoFeatX] = p1->X + cos (Angle) * Length;
  Feature->Params[PicoFeatY] = p1->Y + sin (Angle) * Length;
  if (DummyFastMatch (Feature, p2)) {
    Evidence = SubfeatureEvidence (Feature, p2);
    if (Evidence < WorstEvidence)
      WorstEvidence = Evidence;
  } else {
    FreeFeature(Feature);
    return 0.0;
  }

  /* set the dummy pico-feature at the other end of p1 and match it to p2 */
  Feature->Params[PicoFeatX] = p1->X - cos (Angle) * Length;
  Feature->Params[PicoFeatY] = p1->Y - sin (Angle) * Length;
  if (DummyFastMatch (Feature, p2)) {
    Evidence = SubfeatureEvidence (Feature, p2);
    if (Evidence < WorstEvidence)
      WorstEvidence = Evidence;
  } else {
    FreeFeature(Feature);
    return 0.0;
  }

  FreeFeature (Feature);
  return (WorstEvidence);

} /* CompareProtos */

/**
 * This routine computes a proto which is the weighted
 * average of protos p1 and p2. The new proto is returned
 * in MergedProto.
 *
 * @param p1, p2 protos to be merged
 * @param w1, w2 weight of each proto
 * @param MergedProto place to put resulting merged proto
 *
 * Globals: none
 *
 * @return none (results are returned in MergedProto)
 * @note Exceptions: none
 * @note History: Mon Nov 26 08:15:08 1990, DSJ, Created.
 */
void ComputeMergedProto (PROTO p1,
                         PROTO p2,
                         FLOAT32 w1,
                         FLOAT32 w2,
                         PROTO MergedProto) {
  FLOAT32 TotalWeight;

  TotalWeight = w1 + w2;
  w1 /= TotalWeight;
  w2 /= TotalWeight;

  MergedProto->X = p1->X * w1 + p2->X * w2;
  MergedProto->Y = p1->Y * w1 + p2->Y * w2;
  MergedProto->Length = p1->Length * w1 + p2->Length * w2;
  MergedProto->Angle = p1->Angle * w1 + p2->Angle * w2;
  FillABC(MergedProto);
} /* ComputeMergedProto */

/**
 * This routine searches through all of the prototypes in
 * Class and returns the id of the proto which would provide
 * the best approximation of Prototype. If no close
 * approximation can be found, NO_PROTO is returned.
 *
 * @param Class class to search for matching old proto in
 * @param NumMerged # of protos merged into each proto of Class
 * @param Prototype new proto to find match for
 *
 * Globals: none
 *
 * @return Id of closest proto in Class or NO_PROTO.
 * @note Exceptions: none
 * @note History: Sat Nov 24 11:42:58 1990, DSJ, Created.
 */
int FindClosestExistingProto(CLASS_TYPE Class, int NumMerged[],
                             PROTOTYPE *Prototype) {
  PROTO_STRUCT NewProto;
  PROTO_STRUCT MergedProto;
  int Pid;
  PROTO Proto;
  int BestProto;
  FLOAT32 BestMatch;
  FLOAT32 Match, OldMatch, NewMatch;

  MakeNewFromOld (&NewProto, Prototype);

  BestProto = NO_PROTO;
  BestMatch = WORST_MATCH_ALLOWED;
  for (Pid = 0; Pid < Class->NumProtos; Pid++) {
    Proto = ProtoIn(Class, Pid);
    ComputeMergedProto(Proto, &NewProto,
                       (FLOAT32) NumMerged[Pid], 1.0, &MergedProto);
    OldMatch = CompareProtos(Proto, &MergedProto);
    NewMatch = CompareProtos(&NewProto, &MergedProto);
    Match = MIN(OldMatch, NewMatch);
    if (Match > BestMatch) {
      BestProto = Pid;
      BestMatch = Match;
    }
  }
  return BestProto;
} /* FindClosestExistingProto */

/**
 * This fills in the fields of the New proto based on the
 * fields of the Old proto.
 *
 * @param New new proto to be filled in
 * @param Old old proto to be converted
 *
 * Globals: none
 *
 * Exceptions: none
 * History: Mon Nov 26 09:45:39 1990, DSJ, Created.
 */
void MakeNewFromOld(PROTO New, PROTOTYPE *Old) {
  New->X = CenterX(Old->Mean);
  New->Y = CenterY(Old->Mean);
  New->Length = LengthOf(Old->Mean);
  New->Angle = OrientationOf(Old->Mean);
  FillABC(New);
} /* MakeNewFromOld */

/*-------------------once in subfeat---------------------------------*/

/**
 * @name SubfeatureEvidence
 *
 * Compare a feature to a prototype. Print the result.
 */
FLOAT32 SubfeatureEvidence(FEATURE Feature, PROTO Proto) {
  float Distance;
  float Dangle;

  Dangle = Proto->Angle - Feature->Params[PicoFeatDir];
  if (Dangle < -0.5) Dangle += 1.0;
  if (Dangle > 0.5) Dangle -= 1.0;
  Dangle *= training_angle_match_scale;

  Distance = Proto->A * Feature->Params[PicoFeatX] +
             Proto->B * Feature->Params[PicoFeatY] +
             Proto->C;

  return (EvidenceOf (Distance * Distance + Dangle * Dangle));
}

/**
 * @name EvidenceOf
 *
 * Return the new type of evidence number corresponding to this
 * distance value. This number is no longer based on the chi squared
 * approximation. The equation that represents the transform is:
 * 1 / (1 + (sim / midpoint) ^ curl)
 */
double EvidenceOf (double Similarity) {

  Similarity /= training_similarity_midpoint;

  if (training_similarity_curl == 3)
    Similarity = Similarity * Similarity * Similarity;
  else if (training_similarity_curl == 2)
    Similarity = Similarity * Similarity;
  else
    Similarity = pow (Similarity, training_similarity_curl);

  return (1.0 / (1.0 + Similarity));
}

/**
 * This routine returns TRUE if Feature would be matched
 * by a fast match table built from Proto.
 *
 * @param Feature feature to be "fast matched" to proto
 * @param Proto proto being "fast matched" against
 *
 * Globals:
 * - training_tangent_bbox_pad bounding box pad tangent to proto
 * - training_orthogonal_bbox_pad bounding box pad orthogonal to proto
 *
 * @return TRUE if feature could match Proto.
 * @note Exceptions: none
 * @note History: Wed Nov 14 17:19:58 1990, DSJ, Created.
 */
BOOL8 DummyFastMatch (
    FEATURE Feature,
    PROTO Proto)
{
  FRECT BoundingBox;
  FLOAT32 MaxAngleError;
  FLOAT32 AngleError;

  MaxAngleError = training_angle_pad / 360.0;
  AngleError = fabs (Proto->Angle - Feature->Params[PicoFeatDir]);
  if (AngleError > 0.5)
    AngleError = 1.0 - AngleError;

  if (AngleError > MaxAngleError)
    return (FALSE);

  ComputePaddedBoundingBox (Proto,
                            training_tangent_bbox_pad * GetPicoFeatureLength (),
                            training_orthogonal_bbox_pad * GetPicoFeatureLength (),
                            &BoundingBox);

  return PointInside(&BoundingBox, Feature->Params[PicoFeatX],
                     Feature->Params[PicoFeatY]);
} /* DummyFastMatch */

/**
 * This routine computes a bounding box that encloses the
 * specified proto along with some padding. The
 * amount of padding is specified as separate distances
 * in the tangential and orthogonal directions.
 *
 * @param Proto proto to compute bounding box for
 * @param TangentPad amount of pad to add in direction of segment
 * @param OrthogonalPad amount of pad to add orthogonal to segment
 * @param[out] BoundingBox place to put results
 *
 * Globals: none
 *
 * @return none (results are returned in BoundingBox)
 * @note Exceptions: none
 * @note History: Wed Nov 14 14:55:30 1990, DSJ, Created.
 */
void ComputePaddedBoundingBox (PROTO Proto, FLOAT32 TangentPad,
                               FLOAT32 OrthogonalPad, FRECT *BoundingBox) {
  FLOAT32 Pad, Length, Angle;
  FLOAT32 CosOfAngle, SinOfAngle;

  Length = Proto->Length / 2.0 + TangentPad;
  Angle = Proto->Angle * 2.0 * PI;
  CosOfAngle = fabs(cos(Angle));
  SinOfAngle = fabs(sin(Angle));

  Pad = MAX (CosOfAngle * Length, SinOfAngle * OrthogonalPad);
  BoundingBox->MinX = Proto->X - Pad;
  BoundingBox->MaxX = Proto->X + Pad;

  Pad = MAX(SinOfAngle * Length, CosOfAngle * OrthogonalPad);
  BoundingBox->MinY = Proto->Y - Pad;
  BoundingBox->MaxY = Proto->Y + Pad;

} /* ComputePaddedBoundingBox */

/**
 * Return TRUE if point (X,Y) is inside of Rectangle.
 *
 * Globals: none
 *
 * @return TRUE if point (X,Y) is inside of Rectangle.
 * @note Exceptions: none
 * @note History: Wed Nov 14 17:26:35 1990, DSJ, Created.
 */
BOOL8 PointInside(FRECT *Rectangle, FLOAT32 X, FLOAT32 Y) {
  if (X < Rectangle->MinX) return (FALSE);
  if (X > Rectangle->MaxX) return (FALSE);
  if (Y < Rectangle->MinY) return (FALSE);
  if (Y > Rectangle->MaxY) return (FALSE);
  return (TRUE);

} /* PointInside */
/******************************************************************************
|
||||
** Filename: MergeNF.c
|
||||
** Purpose: Program for merging similar nano-feature protos
|
||||
** Author: Dan Johnson
|
||||
** History: Wed Nov 21 09:55:23 1990, DSJ, Created.
|
||||
**
|
||||
** (c) Copyright Hewlett-Packard Company, 1988.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
******************************************************************************/
|
||||
#include "mergenf.h"
|
||||
#include "host.h"
|
||||
#include "efio.h"
|
||||
#include "clusttool.h"
|
||||
#include "cluster.h"
|
||||
#include "oldlist.h"
|
||||
#include "protos.h"
|
||||
#include "ndminx.h"
|
||||
#include "ocrfeatures.h"
|
||||
#include "const.h"
|
||||
#include "featdefs.h"
|
||||
#include "intproto.h"
|
||||
#include "params.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
|
||||
/*-------------------once in subfeat---------------------------------*/
|
||||
double_VAR(training_angle_match_scale, 1.0, "Angle Match Scale ...");
|
||||
|
||||
double_VAR(training_similarity_midpoint, 0.0075, "Similarity Midpoint ...");
|
||||
|
||||
double_VAR(training_similarity_curl, 2.0, "Similarity Curl ...");
|
||||
|
||||
/*-----------------------------once in fasttrain----------------------------------*/
|
||||
double_VAR(training_tangent_bbox_pad, 0.5, "Tangent bounding box pad ...");
|
||||
|
||||
double_VAR(training_orthogonal_bbox_pad, 2.5, "Orthogonal bounding box pad ...");
|
||||
|
||||
double_VAR(training_angle_pad, 45.0, "Angle pad ...");
|
||||
|
||||
/**
|
||||
* Compare protos p1 and p2 and return an estimate of the
|
||||
* worst evidence rating that will result for any part of p1
|
||||
* that is compared to p2. In other words, if p1 were broken
|
||||
* into pico-features and each pico-feature was matched to p2,
|
||||
* what is the worst evidence rating that will be achieved for
|
||||
* any pico-feature.
|
||||
*
|
||||
* @param p1, p2 protos to be compared
|
||||
*
|
||||
* Globals: none
|
||||
*
|
||||
* @return Worst possible result when matching p1 to p2.
|
||||
* @note Exceptions: none
|
||||
* @note History: Mon Nov 26 08:27:53 1990, DSJ, Created.
|
||||
*/
|
||||
FLOAT32 CompareProtos(PROTO p1, PROTO p2) {
|
||||
FEATURE Feature;
|
||||
FLOAT32 WorstEvidence = WORST_EVIDENCE;
|
||||
FLOAT32 Evidence;
|
||||
FLOAT32 Angle, Length;
|
||||
|
||||
/* if p1 and p2 are not close in length, don't let them match */
|
||||
Length = fabs (p1->Length - p2->Length);
|
||||
if (Length > MAX_LENGTH_MISMATCH)
|
||||
return (0.0);
|
||||
|
||||
/* create a dummy pico-feature to be used for comparisons */
|
||||
Feature = NewFeature (&PicoFeatDesc);
|
||||
Feature->Params[PicoFeatDir] = p1->Angle;
|
||||
|
||||
/* convert angle to radians */
|
||||
Angle = p1->Angle * 2.0 * PI;
|
||||
|
||||
/* find distance from center of p1 to 1/2 picofeat from end */
|
||||
Length = p1->Length / 2.0 - GetPicoFeatureLength () / 2.0;
|
||||
if (Length < 0) Length = 0;
|
||||
|
||||
/* set the dummy pico-feature at one end of p1 and match it to p2 */
|
||||
Feature->Params[PicoFeatX] = p1->X + cos (Angle) * Length;
|
||||
Feature->Params[PicoFeatY] = p1->Y + sin (Angle) * Length;
|
||||
if (DummyFastMatch (Feature, p2)) {
|
||||
Evidence = SubfeatureEvidence (Feature, p2);
|
||||
if (Evidence < WorstEvidence)
|
||||
WorstEvidence = Evidence;
|
||||
} else {
|
||||
FreeFeature(Feature);
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
/* set the dummy pico-feature at the other end of p1 and match it to p2 */
|
||||
Feature->Params[PicoFeatX] = p1->X - cos (Angle) * Length;
|
||||
Feature->Params[PicoFeatY] = p1->Y - sin (Angle) * Length;
|
||||
if (DummyFastMatch (Feature, p2)) {
|
||||
Evidence = SubfeatureEvidence (Feature, p2);
|
||||
if (Evidence < WorstEvidence)
|
||||
WorstEvidence = Evidence;
|
||||
} else {
|
||||
FreeFeature(Feature);
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
FreeFeature (Feature);
|
||||
return (WorstEvidence);
|
||||
|
||||
} /* CompareProtos */
|
||||
|
||||
/**
|
||||
* This routine computes a proto which is the weighted
|
||||
* average of protos p1 and p2. The new proto is returned
|
||||
* in MergedProto.
|
||||
*
|
||||
* @param p1, p2 protos to be merged
|
||||
* @param w1, w2 weight of each proto
|
||||
* @param MergedProto place to put resulting merged proto
|
||||
*
|
||||
* Globals: none
|
||||
*
|
||||
* @return none (results are returned in MergedProto)
|
||||
* @note Exceptions: none
|
||||
* @note History: Mon Nov 26 08:15:08 1990, DSJ, Created.
|
||||
*/
|
||||
void ComputeMergedProto (PROTO p1,
|
||||
PROTO p2,
|
||||
FLOAT32 w1,
|
||||
FLOAT32 w2,
|
||||
PROTO MergedProto) {
|
||||
FLOAT32 TotalWeight;
|
||||
|
||||
TotalWeight = w1 + w2;
|
||||
w1 /= TotalWeight;
|
||||
w2 /= TotalWeight;
|
||||
|
||||
MergedProto->X = p1->X * w1 + p2->X * w2;
|
||||
MergedProto->Y = p1->Y * w1 + p2->Y * w2;
|
||||
MergedProto->Length = p1->Length * w1 + p2->Length * w2;
|
||||
MergedProto->Angle = p1->Angle * w1 + p2->Angle * w2;
|
||||
FillABC(MergedProto);
|
||||
} /* ComputeMergedProto */
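/*
 * Illustrative sketch (not part of the original file): ComputeMergedProto is a
 * plain weighted average of the two protos' parameters, with FillABC()
 * refreshing the implicit line equation afterwards.  With w1 = 3 already-merged
 * samples and w2 = 1 new sample the result is pulled 3/4 of the way toward p1:
 */
#if 0  // example only, never built
struct ToyProto { float X, Y, Length, Angle; };

static ToyProto ToyMerge(const ToyProto &p1, const ToyProto &p2,
                         float w1, float w2) {
  float total = w1 + w2;
  w1 /= total;
  w2 /= total;
  // p1 = {0, 0, 10, 0.0}, p2 = {4, 0, 6, 0.1}, w1 = 3, w2 = 1
  //   -> X = 1.0, Y = 0.0, Length = 9.0, Angle = 0.025
  return {p1.X * w1 + p2.X * w2, p1.Y * w1 + p2.Y * w2,
          p1.Length * w1 + p2.Length * w2, p1.Angle * w1 + p2.Angle * w2};
}
#endif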
|
||||
|
||||
/**
|
||||
* This routine searches through all of the prototypes in
|
||||
* Class and returns the id of the proto which would provide
|
||||
* the best approximation of Prototype. If no close
|
||||
* approximation can be found, NO_PROTO is returned.
|
||||
*
|
||||
* @param Class class to search for matching old proto in
|
||||
* @param NumMerged # of protos merged into each proto of Class
|
||||
* @param Prototype new proto to find match for
|
||||
*
|
||||
* Globals: none
|
||||
*
|
||||
* @return Id of closest proto in Class or NO_PROTO.
|
||||
* @note Exceptions: none
|
||||
* @note History: Sat Nov 24 11:42:58 1990, DSJ, Created.
|
||||
*/
|
||||
int FindClosestExistingProto(CLASS_TYPE Class, int NumMerged[],
|
||||
PROTOTYPE *Prototype) {
|
||||
PROTO_STRUCT NewProto;
|
||||
PROTO_STRUCT MergedProto;
|
||||
int Pid;
|
||||
PROTO Proto;
|
||||
int BestProto;
|
||||
FLOAT32 BestMatch;
|
||||
FLOAT32 Match, OldMatch, NewMatch;
|
||||
|
||||
MakeNewFromOld (&NewProto, Prototype);
|
||||
|
||||
BestProto = NO_PROTO;
|
||||
BestMatch = WORST_MATCH_ALLOWED;
|
||||
for (Pid = 0; Pid < Class->NumProtos; Pid++) {
|
||||
Proto = ProtoIn(Class, Pid);
|
||||
ComputeMergedProto(Proto, &NewProto,
|
||||
(FLOAT32) NumMerged[Pid], 1.0, &MergedProto);
|
||||
OldMatch = CompareProtos(Proto, &MergedProto);
|
||||
NewMatch = CompareProtos(&NewProto, &MergedProto);
|
||||
Match = MIN(OldMatch, NewMatch);
|
||||
if (Match > BestMatch) {
|
||||
BestProto = Pid;
|
||||
BestMatch = Match;
|
||||
}
|
||||
}
|
||||
return BestProto;
|
||||
} /* FindClosestExistingProto */
|
||||
|
||||
/**
|
||||
* This fills in the fields of the New proto based on the
|
||||
* fields of the Old proto.
|
||||
*
|
||||
* @param New new proto to be filled in
|
||||
* @param Old old proto to be converted
|
||||
*
|
||||
* Globals: none
|
||||
*
|
||||
* Exceptions: none
|
||||
* History: Mon Nov 26 09:45:39 1990, DSJ, Created.
|
||||
*/
|
||||
void MakeNewFromOld(PROTO New, PROTOTYPE *Old) {
|
||||
New->X = CenterX(Old->Mean);
|
||||
New->Y = CenterY(Old->Mean);
|
||||
New->Length = LengthOf(Old->Mean);
|
||||
New->Angle = OrientationOf(Old->Mean);
|
||||
FillABC(New);
|
||||
} /* MakeNewFromOld */
|
||||
|
||||
/*-------------------once in subfeat---------------------------------*/
|
||||
|
||||
/**
|
||||
* @name SubfeatureEvidence
|
||||
*
|
||||
* Compare a feature to a prototype. Print the result.
|
||||
*/
|
||||
FLOAT32 SubfeatureEvidence(FEATURE Feature, PROTO Proto) {
|
||||
float Distance;
|
||||
float Dangle;
|
||||
|
||||
Dangle = Proto->Angle - Feature->Params[PicoFeatDir];
|
||||
if (Dangle < -0.5) Dangle += 1.0;
|
||||
if (Dangle > 0.5) Dangle -= 1.0;
|
||||
Dangle *= training_angle_match_scale;
|
||||
|
||||
Distance = Proto->A * Feature->Params[PicoFeatX] +
|
||||
Proto->B * Feature->Params[PicoFeatY] +
|
||||
Proto->C;
|
||||
|
||||
return (EvidenceOf (Distance * Distance + Dangle * Dangle));
|
||||
}
|
||||
|
||||
/**
|
||||
* @name EvidenceOf
|
||||
*
|
||||
* Return the new type of evidence number corresponding to this
|
||||
* distance value. This number is no longer based on the chi squared
|
||||
* approximation. The equation that represents the transform is:
|
||||
* 1 / (1 + (sim / midpoint) ^ curl)
|
||||
*/
|
||||
double EvidenceOf (double Similarity) {
|
||||
|
||||
Similarity /= training_similarity_midpoint;
|
||||
|
||||
if (training_similarity_curl == 3)
|
||||
Similarity = Similarity * Similarity * Similarity;
|
||||
else if (training_similarity_curl == 2)
|
||||
Similarity = Similarity * Similarity;
|
||||
else
|
||||
Similarity = pow (Similarity, training_similarity_curl);
|
||||
|
||||
return (1.0 / (1.0 + Similarity));
|
||||
}
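/*
 * Illustrative sketch (not part of the original file): with the defaults above
 * (midpoint 0.0075, curl 2) the transform 1 / (1 + (d / midpoint)^curl) maps a
 * combined squared distance d of 0 to evidence 1.0, d == midpoint to 0.5 and
 * d == 2 * midpoint to 0.2, i.e. a smooth fall-off from perfect to poor match.
 */
#if 0  // example only, never built
static double ToyEvidenceOf(double d) {
  const double midpoint = 0.0075;  // default training_similarity_midpoint
  d /= midpoint;
  return 1.0 / (1.0 + d * d);      // the curl == 2 branch above
}
// ToyEvidenceOf(0.0) == 1.0, ToyEvidenceOf(0.0075) == 0.5, ToyEvidenceOf(0.015) == 0.2
#endif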
|
||||
|
||||
/**
|
||||
* This routine returns TRUE if Feature would be matched
|
||||
* by a fast match table built from Proto.
|
||||
*
|
||||
* @param Feature feature to be "fast matched" to proto
|
||||
* @param Proto proto being "fast matched" against
|
||||
*
|
||||
* Globals:
|
||||
* - training_tangent_bbox_pad bounding box pad tangent to proto
|
||||
* - training_orthogonal_bbox_pad bounding box pad orthogonal to proto
|
||||
*
|
||||
* @return TRUE if feature could match Proto.
|
||||
* @note Exceptions: none
|
||||
* @note History: Wed Nov 14 17:19:58 1990, DSJ, Created.
|
||||
*/
|
||||
BOOL8 DummyFastMatch (
|
||||
FEATURE Feature,
|
||||
PROTO Proto)
|
||||
{
|
||||
FRECT BoundingBox;
|
||||
FLOAT32 MaxAngleError;
|
||||
FLOAT32 AngleError;
|
||||
|
||||
MaxAngleError = training_angle_pad / 360.0;
|
||||
AngleError = fabs (Proto->Angle - Feature->Params[PicoFeatDir]);
|
||||
if (AngleError > 0.5)
|
||||
AngleError = 1.0 - AngleError;
|
||||
|
||||
if (AngleError > MaxAngleError)
|
||||
return (FALSE);
|
||||
|
||||
ComputePaddedBoundingBox (Proto,
|
||||
training_tangent_bbox_pad * GetPicoFeatureLength (),
|
||||
training_orthogonal_bbox_pad * GetPicoFeatureLength (),
|
||||
&BoundingBox);
|
||||
|
||||
return PointInside(&BoundingBox, Feature->Params[PicoFeatX],
|
||||
Feature->Params[PicoFeatY]);
|
||||
} /* DummyFastMatch */
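/*
 * Illustrative sketch (not part of the original file): angles here are stored
 * as fractions of a full turn, so the raw difference is folded back into
 * [0, 0.5] before it is compared with the pad.  With the default
 * training_angle_pad of 45 degrees the threshold is 45 / 360 = 0.125 turns.
 */
#if 0  // example only, never built
static bool ToyAngleMatch(float proto_angle, float feature_angle) {
  float err = fabs(proto_angle - feature_angle);  // e.g. 0.95 - 0.05 = 0.90
  if (err > 0.5f) err = 1.0f - err;               // folds back to 0.10 of a turn
  return err <= 45.0f / 360.0f;                   // 0.10 <= 0.125 -> accepted
}
#endif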
|
||||
|
||||
/**
|
||||
* This routine computes a bounding box that encloses the
|
||||
* specified proto along with some padding. The
|
||||
* amount of padding is specified as separate distances
|
||||
* in the tangential and orthogonal directions.
|
||||
*
|
||||
* @param Proto proto to compute bounding box for
|
||||
* @param TangentPad amount of pad to add in direction of segment
|
||||
* @param OrthogonalPad amount of pad to add orthogonal to segment
|
||||
* @param[out] BoundingBox place to put results
|
||||
*
|
||||
* Globals: none
|
||||
*
|
||||
* @return none (results are returned in BoundingBox)
|
||||
* @note Exceptions: none
|
||||
* @note History: Wed Nov 14 14:55:30 1990, DSJ, Created.
|
||||
*/
|
||||
void ComputePaddedBoundingBox (PROTO Proto, FLOAT32 TangentPad,
|
||||
FLOAT32 OrthogonalPad, FRECT *BoundingBox) {
|
||||
FLOAT32 Pad, Length, Angle;
|
||||
FLOAT32 CosOfAngle, SinOfAngle;
|
||||
|
||||
Length = Proto->Length / 2.0 + TangentPad;
|
||||
Angle = Proto->Angle * 2.0 * PI;
|
||||
CosOfAngle = fabs(cos(Angle));
|
||||
SinOfAngle = fabs(sin(Angle));
|
||||
|
||||
Pad = MAX (CosOfAngle * Length, SinOfAngle * OrthogonalPad);
|
||||
BoundingBox->MinX = Proto->X - Pad;
|
||||
BoundingBox->MaxX = Proto->X + Pad;
|
||||
|
||||
Pad = MAX(SinOfAngle * Length, CosOfAngle * OrthogonalPad);
|
||||
BoundingBox->MinY = Proto->Y - Pad;
|
||||
BoundingBox->MaxY = Proto->Y + Pad;
|
||||
|
||||
} /* ComputePaddedBoundingBox */
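/*
 * Worked example (not part of the original file): for a horizontal proto
 * (Angle == 0) centred at (X, Y) with length L, CosOfAngle == 1 and
 * SinOfAngle == 0, so the padded box becomes
 *   MinX = X - (L/2 + TangentPad),  MaxX = X + (L/2 + TangentPad),
 *   MinY = Y - OrthogonalPad,       MaxY = Y + OrthogonalPad,
 * and DummyFastMatch accepts a feature only if PointInside() places its
 * (x, y) position inside this box.
 */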
|
||||
|
||||
/**
|
||||
* Return TRUE if point (X,Y) is inside of Rectangle.
|
||||
*
|
||||
* Globals: none
|
||||
*
|
||||
* @return TRUE if point (X,Y) is inside of Rectangle.
|
||||
* @note Exceptions: none
|
||||
* @note History: Wed Nov 14 17:26:35 1990, DSJ, Created.
|
||||
*/
|
||||
BOOL8 PointInside(FRECT *Rectangle, FLOAT32 X, FLOAT32 Y) {
|
||||
if (X < Rectangle->MinX) return (FALSE);
|
||||
if (X > Rectangle->MaxX) return (FALSE);
|
||||
if (Y < Rectangle->MinY) return (FALSE);
|
||||
if (Y > Rectangle->MaxY) return (FALSE);
|
||||
return (TRUE);
|
||||
|
||||
} /* PointInside */
|
@ -1,103 +1,103 @@
|
||||
/******************************************************************************
|
||||
** Filename: MergeNF.c
|
||||
** Purpose: Program for merging similar nano-feature protos
|
||||
** Author: Dan Johnson
|
||||
** History: Wed Nov 21 09:55:23 1990, DSJ, Created.
|
||||
**
|
||||
** (c) Copyright Hewlett-Packard Company, 1988.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
******************************************************************************/
|
||||
|
||||
#ifndef TESSERACT_TRAINING_MERGENF_H_
|
||||
#define TESSERACT_TRAINING_MERGENF_H_
|
||||
|
||||
/**----------------------------------------------------------------------------
|
||||
Include Files and Type Defines
|
||||
----------------------------------------------------------------------------**/
|
||||
#include "protos.h"
|
||||
#include "cluster.h"
|
||||
#include "ocrfeatures.h"
|
||||
#include "callcpp.h"
|
||||
#include "picofeat.h"
|
||||
|
||||
|
||||
#define WORST_MATCH_ALLOWED (0.9)
|
||||
#define WORST_EVIDENCE (1.0)
|
||||
#define MAX_LENGTH_MISMATCH (2.0 * GetPicoFeatureLength ())
|
||||
|
||||
|
||||
#define PROTO_SUFFIX ".mf.p"
|
||||
#define CONFIG_SUFFIX ".cl"
|
||||
#define NO_PROTO (-1)
|
||||
#define XPOSITION 0
|
||||
#define YPOSITION 1
|
||||
#define MFLENGTH 2
|
||||
#define ORIENTATION 3
|
||||
|
||||
typedef struct
|
||||
{
|
||||
FLOAT32 MinX, MaxX, MinY, MaxY;
|
||||
} FRECT;
|
||||
|
||||
/**----------------------------------------------------------------------------
|
||||
Public Macros
|
||||
----------------------------------------------------------------------------**/
|
||||
#define CenterX(M) ( (M)[XPOSITION] )
|
||||
#define CenterY(M) ( (M)[YPOSITION] )
|
||||
#define LengthOf(M) ( (M)[MFLENGTH] )
|
||||
#define OrientationOf(M) ( (M)[ORIENTATION] )
|
||||
|
||||
/**----------------------------------------------------------------------------
|
||||
Public Function Prototypes
|
||||
----------------------------------------------------------------------------**/
|
||||
FLOAT32 CompareProtos (
|
||||
PROTO p1,
|
||||
PROTO p2);
|
||||
|
||||
void ComputeMergedProto (
|
||||
PROTO p1,
|
||||
PROTO p2,
|
||||
FLOAT32 w1,
|
||||
FLOAT32 w2,
|
||||
PROTO MergedProto);
|
||||
|
||||
int FindClosestExistingProto (
|
||||
CLASS_TYPE Class,
|
||||
int NumMerged[],
|
||||
PROTOTYPE *Prototype);
|
||||
|
||||
void MakeNewFromOld (
|
||||
PROTO New,
|
||||
PROTOTYPE *Old);
|
||||
|
||||
FLOAT32 SubfeatureEvidence (
|
||||
FEATURE Feature,
|
||||
PROTO Proto);
|
||||
|
||||
double EvidenceOf (
|
||||
register double Similarity);
|
||||
|
||||
BOOL8 DummyFastMatch (
|
||||
FEATURE Feature,
|
||||
PROTO Proto);
|
||||
|
||||
void ComputePaddedBoundingBox (
|
||||
PROTO Proto,
|
||||
FLOAT32 TangentPad,
|
||||
FLOAT32 OrthogonalPad,
|
||||
FRECT *BoundingBox);
|
||||
|
||||
BOOL8 PointInside (
|
||||
FRECT *Rectangle,
|
||||
FLOAT32 X,
|
||||
FLOAT32 Y);
|
||||
|
||||
#endif // TESSERACT_TRAINING_MERGENF_H_
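A minimal usage sketch of the accessor macros declared above, assuming a prototype mean laid out as the four floats {x, y, length, orientation} in the order implied by the XPOSITION..ORIENTATION indices; the values are invented for illustration:

#include <cstdio>

#include "mergenf.h"

int main() {
  FLOAT32 mean[4] = {12.5f, 3.0f, 8.0f, 0.25f};  // hypothetical cluster mean
  std::printf("centre=(%g,%g) length=%g orientation=%g\n",
              CenterX(mean), CenterY(mean), LengthOf(mean), OrientationOf(mean));
  return 0;
}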
|
||||
0
training/tesstrain.sh → src/training/tesstrain.sh
Executable file → Normal file
0
training/tesstrain_utils.sh → src/training/tesstrain_utils.sh
Executable file → Normal file
@ -1,23 +1,23 @@
|
||||
/**********************************************************************
|
||||
* File: tlog.cpp
|
||||
* Description: Variant of printf with logging level controllable by a
|
||||
* commandline flag.
|
||||
* Author: Ranjith Unnikrishnan
|
||||
* Created: Wed Nov 20 2013
|
||||
*
|
||||
* (C) Copyright 2013, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include "tlog.h"
|
||||
|
||||
INT_PARAM_FLAG(tlog_level, 0, "Minimum logging level for tlog() output");
|
||||
@ -1,41 +1,41 @@
|
||||
/**********************************************************************
|
||||
* File: tlog.h
|
||||
* Description: Variant of printf with logging level controllable by a
|
||||
* commandline flag.
|
||||
* Author: Ranjith Unnikrishnan
|
||||
* Created: Wed Nov 20 2013
|
||||
*
|
||||
* (C) Copyright 2013, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
#ifndef TESSERACT_TRAINING_TLOG_H_
|
||||
#define TESSERACT_TRAINING_TLOG_H_
|
||||
|
||||
#include "commandlineflags.h"
|
||||
#include "errcode.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
DECLARE_INT_PARAM_FLAG(tlog_level);
|
||||
|
||||
// Variant guarded by the numeric logging level parameter FLAGS_tlog_level
|
||||
// (default 0). Code using ParseCommandLineFlags() can control its value using
|
||||
// the --tlog_level commandline argument. Otherwise it must be specified in a
|
||||
// config file like other params.
|
||||
#define tlog(level, ...) { \
|
||||
if (FLAGS_tlog_level >= level) { \
|
||||
tprintf_internal(__VA_ARGS__); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define TLOG_IS_ON(level) (FLAGS_tlog_level >= level)
|
||||
|
||||
#endif // TESSERACT_TRAINING_TLOG_H_
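A minimal usage sketch for the macros above, assuming a training tool that has already called ParseCommandLineFlags() as the comment describes; the function and messages are invented for illustration:

#include "tlog.h"

void ReportIteration(int iteration, double error) {
  // Printed only when the tool is run with --tlog_level 1 or higher.
  tlog(1, "Iteration %d: error = %g\n", iteration, error);
  // Guard more expensive diagnostics explicitly.
  if (TLOG_IS_ON(2)) {
    tlog(2, "Dumping per-sample errors...\n");
  }
}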
|
||||
@ -1,35 +1,35 @@
|
||||
#ifndef TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
|
||||
#define TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
|
||||
|
||||
#include "validator.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Subclass of Validator that validates and segments generic unicode into
|
||||
// grapheme clusters, including Latin with diacritics.
|
||||
class ValidateGrapheme : public Validator {
|
||||
public:
|
||||
ValidateGrapheme(ViramaScript script, bool report_errors)
|
||||
: Validator(script, report_errors) {}
|
||||
~ValidateGrapheme() {}
|
||||
|
||||
protected:
|
||||
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
|
||||
// parts_ and output_. Returns true if a valid Grapheme was consumed,
|
||||
// otherwise does not increment codes_used_.
|
||||
bool ConsumeGraphemeIfValid() override;
|
||||
// Returns the CharClass corresponding to the given Unicode ch.
|
||||
CharClass UnicodeToCharClass(char32 ch) const override;
|
||||
|
||||
private:
|
||||
// Helper returns true if the sequence prev_ch,ch is invalid.
|
||||
bool IsBadlyFormed(char32 prev_ch, char32 ch);
|
||||
// Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel.
|
||||
static bool IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch);
|
||||
// Helper returns true if the sequence prev_ch,ch is invalid Thai.
|
||||
static bool IsBadlyFormedThai(char32 prev_ch, char32 ch);
|
||||
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
|
||||
@ -1,44 +1,44 @@
|
||||
#ifndef TESSERACT_TRAINING_VALIDATE_INDIC_H_
|
||||
#define TESSERACT_TRAINING_VALIDATE_INDIC_H_
|
||||
|
||||
#include "validator.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Subclass of Validator that validates and segments Indic scripts in the
|
||||
// unicode range 0x900-0xdff (Devanagari-Sinhala).
|
||||
class ValidateIndic : public Validator {
|
||||
public:
|
||||
ValidateIndic(ViramaScript script, bool report_errors)
|
||||
: Validator(script, report_errors) {}
|
||||
~ValidateIndic() {}
|
||||
|
||||
protected:
|
||||
// Returns whether codes matches the pattern for an Indic Grapheme.
|
||||
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
|
||||
// parts_ and output_. Returns true if a valid Grapheme was consumed,
|
||||
// otherwise does not increment codes_used_.
|
||||
bool ConsumeGraphemeIfValid() override;
|
||||
// Returns the CharClass corresponding to the given Unicode ch.
|
||||
Validator::CharClass UnicodeToCharClass(char32 ch) const override;
|
||||
|
||||
private:
|
||||
// Helper consumes/copies a virama and any associated post-virama joiners.
|
||||
bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra);
|
||||
// Helper consumes/copies a series of consonants separated by viramas while
|
||||
// valid, but not any vowel or other modifiers.
|
||||
bool ConsumeConsonantHeadIfValid();
|
||||
// Helper consumes/copies a tail part of a consonant, comprising optional
|
||||
// matra/piece, vowel modifier, vedic mark, terminating virama.
|
||||
bool ConsumeConsonantTailIfValid();
|
||||
// Helper consumes/copies a vowel and optional modifiers.
|
||||
bool ConsumeVowelIfValid();
|
||||
|
||||
// Some special unicodes used only for Indic processing.
|
||||
static const char32 kYayana = 0xdba; // Sinhala Ya
|
||||
static const char32 kRayana = 0xdbb; // Sinhala Ra
|
||||
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_TRAINING_VALIDATE_INDIC_H_
|
||||
@ -1,106 +1,106 @@
|
||||
#include "validate_khmer.h"
|
||||
#include "errcode.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Returns whether codes matches the pattern for a Khmer Grapheme.
|
||||
// Taken from unicode standard:
|
||||
// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf.
|
||||
// where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation
|
||||
// to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf.
|
||||
// Translated to the codes used by the CharClass enum:
|
||||
// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}
|
||||
// Where R is a new symbol (Robat) and N is repurposed as a consonant shifter.
|
||||
// Also the Consonant class here includes independent vowels, as they are
|
||||
// treated the same anyway.
|
||||
// In the split grapheme mode, the only characters that get grouped are the
|
||||
// HC and the {Z|z}M. The unicode chapter on Khmer only mentions the joiners in
|
||||
// the BNF syntax, so who knows what they do.
|
||||
bool ValidateKhmer::ConsumeGraphemeIfValid() {
|
||||
int num_codes = codes_.size();
|
||||
if (codes_used_ == num_codes) return false;
|
||||
if (codes_[codes_used_].first == CharClass::kOther) {
|
||||
UseMultiCode(1);
|
||||
return true;
|
||||
}
|
||||
if (codes_[codes_used_].first != CharClass::kConsonant) {
|
||||
if (report_errors_) {
|
||||
tprintf("Invalid start of Khmer syllable:0x%x\n",
|
||||
codes_[codes_used_].second);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
if (UseMultiCode(1)) return true;
|
||||
if (codes_[codes_used_].first == CharClass::kRobat ||
|
||||
codes_[codes_used_].first == CharClass::kNukta) {
|
||||
if (UseMultiCode(1)) return true;
|
||||
}
|
||||
while (codes_used_ + 1 < num_codes &&
|
||||
codes_[codes_used_].first == CharClass::kVirama &&
|
||||
codes_[codes_used_ + 1].first == CharClass::kConsonant) {
|
||||
ASSERT_HOST(!CodeOnlyToOutput());
|
||||
if (UseMultiCode(2)) return true;
|
||||
if (codes_[codes_used_].first == CharClass::kRobat) {
|
||||
if (UseMultiCode(1)) return true;
|
||||
}
|
||||
}
|
||||
int num_matra_parts = 0;
|
||||
if (codes_[codes_used_].second == kZeroWidthJoiner ||
|
||||
codes_[codes_used_].second == kZeroWidthNonJoiner) {
|
||||
if (CodeOnlyToOutput()) {
|
||||
if (report_errors_) {
|
||||
tprintf("Unterminated joiner: 0x%x\n", output_.back());
|
||||
}
|
||||
return false;
|
||||
}
|
||||
++num_matra_parts;
|
||||
}
|
||||
// Not quite as shown by the BNF, the matra piece is allowed as a matra on its
|
||||
// own or as an addition to other matras.
|
||||
if (codes_[codes_used_].first == CharClass::kMatra ||
|
||||
codes_[codes_used_].first == CharClass::kMatraPiece) {
|
||||
++num_matra_parts;
|
||||
if (UseMultiCode(num_matra_parts)) return true;
|
||||
} else if (num_matra_parts) {
|
||||
if (report_errors_) {
|
||||
tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
|
||||
output_.back(), codes_[codes_used_].second);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
if (codes_[codes_used_].first == CharClass::kMatraPiece &&
|
||||
codes_[codes_used_ - 1].first != CharClass::kMatraPiece) {
|
||||
if (UseMultiCode(1)) return true;
|
||||
}
|
||||
if (codes_[codes_used_].first == CharClass::kVowelModifier) {
|
||||
if (UseMultiCode(1)) return true;
|
||||
}
|
||||
if (codes_used_ + 1 < num_codes &&
|
||||
codes_[codes_used_].first == CharClass::kVirama &&
|
||||
codes_[codes_used_ + 1].first == CharClass::kConsonant) {
|
||||
ASSERT_HOST(!CodeOnlyToOutput());
|
||||
if (UseMultiCode(2)) return true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
Validator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const {
|
||||
if (IsVedicAccent(ch)) return CharClass::kVedicMark;
|
||||
if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
|
||||
if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
|
||||
// Offset from the start of the relevant unicode code block aka code page.
|
||||
int off = ch - static_cast<char32>(script_);
|
||||
// Anything in another code block is other.
|
||||
if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther;
|
||||
if (off <= 0x33) return CharClass::kConsonant;
|
||||
if (off <= 0x45) return CharClass::kMatra;
|
||||
if (off == 0x46) return CharClass::kMatraPiece;
|
||||
if (off == 0x4c) return CharClass::kRobat;
|
||||
if (off == 0x49 || off == 0x4a) return CharClass::kNukta;
|
||||
if (off <= 0x51) return CharClass::kVowelModifier;
|
||||
if (off == 0x52) return CharClass::kVirama;
|
||||
return CharClass::kOther;
|
||||
}
|
||||
|
||||
} // namespace tesseract
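A standalone sketch of the offset arithmetic in UnicodeToCharClass() above, assuming the Khmer ViramaScript value equals the Unicode Khmer block base U+1780 (which is what the code-page subtraction implies); the sample character U+17D2 KHMER SIGN COENG lands on offset 0x52, the virama slot:

#include <cstdio>

int main() {
  const int kKhmerBase = 0x1780;   // assumed code-page start for Khmer
  const int kCoeng = 0x17D2;       // KHMER SIGN COENG
  int off = kCoeng - kKhmerBase;   // 0x52
  std::printf("offset 0x%x -> %s\n", off, off == 0x52 ? "kVirama" : "kOther");
  return 0;
}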
|
||||
#include "validate_khmer.h"
|
||||
#include "errcode.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Returns whether codes matches the pattern for a Khmer Grapheme.
|
||||
// Taken from unicode standard:
|
||||
// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf.
|
||||
// where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation
|
||||
// to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf.
|
||||
// Translated to the codes used by the CharClass enum:
|
||||
// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}
|
||||
// Where R is a new symbol (Robat) and N is repurposed as a consonant shifter.
|
||||
// Also the Consonant class here includes independent vowels, as they are
|
||||
// treated the same anyway.
|
||||
// In the split grapheme mode, the only characters that get grouped are the
|
||||
// HC and the {Z|z}M The unicode chapter on Khmer only mentions the joiners in
|
||||
// the BNF syntax, so who knows what they do.
|
||||
bool ValidateKhmer::ConsumeGraphemeIfValid() {
|
||||
int num_codes = codes_.size();
|
||||
if (codes_used_ == num_codes) return false;
|
||||
if (codes_[codes_used_].first == CharClass::kOther) {
|
||||
UseMultiCode(1);
|
||||
return true;
|
||||
}
|
||||
if (codes_[codes_used_].first != CharClass::kConsonant) {
|
||||
if (report_errors_) {
|
||||
tprintf("Invalid start of Khmer syllable:0x%x\n",
|
||||
codes_[codes_used_].second);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
if (UseMultiCode(1)) return true;
|
||||
if (codes_[codes_used_].first == CharClass::kRobat ||
|
||||
codes_[codes_used_].first == CharClass::kNukta) {
|
||||
if (UseMultiCode(1)) return true;
|
||||
}
|
||||
while (codes_used_ + 1 < num_codes &&
|
||||
codes_[codes_used_].first == CharClass::kVirama &&
|
||||
codes_[codes_used_ + 1].first == CharClass::kConsonant) {
|
||||
ASSERT_HOST(!CodeOnlyToOutput());
|
||||
if (UseMultiCode(2)) return true;
|
||||
if (codes_[codes_used_].first == CharClass::kRobat) {
|
||||
if (UseMultiCode(1)) return true;
|
||||
}
|
||||
}
|
||||
int num_matra_parts = 0;
|
||||
if (codes_[codes_used_].second == kZeroWidthJoiner ||
|
||||
codes_[codes_used_].second == kZeroWidthNonJoiner) {
|
||||
if (CodeOnlyToOutput()) {
|
||||
if (report_errors_) {
|
||||
tprintf("Unterminated joiner: 0x%x\n", output_.back());
|
||||
}
|
||||
return false;
|
||||
}
|
||||
++num_matra_parts;
|
||||
}
|
||||
// Not quite as shown by the BNF, the matra piece is allowed as a matra on its
|
||||
// own or as an addition to other matras.
|
||||
if (codes_[codes_used_].first == CharClass::kMatra ||
|
||||
codes_[codes_used_].first == CharClass::kMatraPiece) {
|
||||
++num_matra_parts;
|
||||
if (UseMultiCode(num_matra_parts)) return true;
|
||||
} else if (num_matra_parts) {
|
||||
if (report_errors_) {
|
||||
tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
|
||||
output_.back(), codes_[codes_used_].second);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
if (codes_[codes_used_].first == CharClass::kMatraPiece &&
|
||||
codes_[codes_used_ - 1].first != CharClass::kMatraPiece) {
|
||||
if (UseMultiCode(1)) return true;
|
||||
}
|
||||
if (codes_[codes_used_].first == CharClass::kVowelModifier) {
|
||||
if (UseMultiCode(1)) return true;
|
||||
}
|
||||
if (codes_used_ + 1 < num_codes &&
|
||||
codes_[codes_used_].first == CharClass::kVirama &&
|
||||
codes_[codes_used_ + 1].first == CharClass::kConsonant) {
|
||||
ASSERT_HOST(!CodeOnlyToOutput());
|
||||
if (UseMultiCode(2)) return true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
Validator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const {
|
||||
if (IsVedicAccent(ch)) return CharClass::kVedicMark;
|
||||
if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
|
||||
if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
|
||||
// Offset from the start of the relevant unicode code block aka code page.
|
||||
int off = ch - static_cast<char32>(script_);
|
||||
// Anything in another code block is other.
|
||||
if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther;
|
||||
if (off <= 0x33) return CharClass::kConsonant;
|
||||
if (off <= 0x45) return CharClass::kMatra;
|
||||
if (off == 0x46) return CharClass::kMatraPiece;
|
||||
if (off == 0x4c) return CharClass::kRobat;
|
||||
if (off == 0x49 || off == 0x4a) return CharClass::kNukta;
|
||||
if (off <= 0x51) return CharClass::kVowelModifier;
|
||||
if (off == 0x52) return CharClass::kVirama;
|
||||
return CharClass::kOther;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
@ -1,27 +1,27 @@
|
||||
#ifndef TESSERACT_TRAINING_VALIDATE_KHMER_H_
|
||||
#define TESSERACT_TRAINING_VALIDATE_KHMER_H_
|
||||
|
||||
#include "validator.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Subclass of Validator that validates and segments Khmer.
|
||||
class ValidateKhmer : public Validator {
|
||||
public:
|
||||
ValidateKhmer(ViramaScript script, bool report_errors)
|
||||
: Validator(script, report_errors) {}
|
||||
~ValidateKhmer() {}
|
||||
|
||||
protected:
|
||||
// Returns whether codes matches the pattern for a Khmer Grapheme.
|
||||
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
|
||||
// parts_ and output_. Returns true if a valid Grapheme was consumed,
|
||||
// otherwise does not increment codes_used_.
|
||||
bool ConsumeGraphemeIfValid() override;
|
||||
// Returns the CharClass corresponding to the given Unicode ch.
|
||||
CharClass UnicodeToCharClass(char32 ch) const override;
|
||||
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_TRAINING_VALIDATE_KHMER_H_
|
||||
@ -1,160 +1,160 @@
|
||||
#include "validate_myanmar.h"
|
||||
#include "errcode.h"
|
||||
#include "icuerrorcode.h"
|
||||
#include "tprintf.h"
|
||||
#include "unicode/uchar.h" // From libicu
|
||||
#include "unicode/uscript.h" // From libicu
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Returns whether codes matches the pattern for a Myanmar Grapheme.
|
||||
// Taken directly from the unicode table 16-3.
|
||||
// See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
|
||||
bool ValidateMyanmar::ConsumeGraphemeIfValid() {
|
||||
int num_codes = codes_.size();
|
||||
if (codes_used_ == num_codes) return true;
|
||||
// Other.
|
||||
if (IsMyanmarOther(codes_[codes_used_].second)) {
|
||||
UseMultiCode(1);
|
||||
return true;
|
||||
}
|
||||
// Kinzi.
|
||||
if (codes_used_ + 2 < num_codes && codes_[codes_used_].second == 0x1004 &&
|
||||
codes_[codes_used_ + 1].second == kMyanmarAsat &&
|
||||
codes_[codes_used_ + 2].second == kMyanmarVirama) {
|
||||
ASSERT_HOST(!CodeOnlyToOutput());
|
||||
ASSERT_HOST(!CodeOnlyToOutput());
|
||||
if (UseMultiCode(3)) return true;
|
||||
}
|
||||
// Base consonant/vowel. NOTE that since everything in Myanmar appears to be
|
||||
// optional, except the base, this is the only place where invalid input can
|
||||
// be detected and false returned.
|
||||
if (IsMyanmarLetter(codes_[codes_used_].second)) {
|
||||
if (UseMultiCode(1)) return true;
|
||||
} else {
|
||||
if (report_errors_) {
|
||||
tprintf("Invalid start of Myanmar syllable:0x%x\n",
|
||||
codes_[codes_used_].second);
|
||||
}
|
||||
return false; // One of these is required.
|
||||
}
|
||||
if (ConsumeSubscriptIfPresent()) return true;
|
||||
ConsumeOptionalSignsIfPresent();
|
||||
// What we have consumed so far is a valid syllable.
|
||||
return true;
|
||||
}
|
||||
|
||||
// TODO(rays) Doesn't use intermediate coding like the other scripts, as there
|
||||
// is little correspondence between the content of table 16-3 and the char
|
||||
// classes of the Indic languages. (Experts may disagree and improve!)
|
||||
// In unicode table 16-3 there is basically a long list of optional characters,
|
||||
// which can be coded quite easily.
|
||||
// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!
|
||||
// The table also allows sequences that still result in dotted circles!!
|
||||
// So with a lot of guesswork the rest have been added in a reasonable place.
|
||||
Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const {
|
||||
if (IsMyanmarLetter(ch)) return CharClass::kConsonant;
|
||||
return CharClass::kOther;
|
||||
}
|
||||
|
||||
// Helper consumes/copies a virama and any subscript consonant.
|
||||
// Returns true if the end of input is reached.
|
||||
bool ValidateMyanmar::ConsumeSubscriptIfPresent() {
|
||||
// Subscript consonant. It appears there can be only one.
|
||||
int num_codes = codes_.size();
|
||||
if (codes_used_ + 1 < num_codes &&
|
||||
codes_[codes_used_].second == kMyanmarVirama) {
|
||||
if (IsMyanmarLetter(codes_[codes_used_ + 1].second)) {
|
||||
ASSERT_HOST(!CodeOnlyToOutput());
|
||||
if (UseMultiCode(2)) return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Helper consumes/copies a series of optional signs.
|
||||
// Returns true if the end of input is reached.
|
||||
bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
|
||||
// The following characters are allowed, all optional, and in sequence.
|
||||
// An exception is kMyanmarMedialYa, which can include kMyanmarAsat.
|
||||
const std::vector<char32> kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c,
|
||||
0x103d, 0x103e, 0x105e, 0x105f, 0x1060,
|
||||
0x1081, 0x1031});
|
||||
for (char32 ch : kMedials) {
|
||||
if (codes_[codes_used_].second == ch) {
|
||||
if (UseMultiCode(1)) return true;
|
||||
if (ch == kMyanmarMedialYa &&
|
||||
codes_[codes_used_].second == kMyanmarAsat) {
|
||||
if (UseMultiCode(1)) return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Vowel sign i, ii, ai.
|
||||
char32 ch = codes_[codes_used_].second;
|
||||
if (ch == 0x102d || ch == 0x102e || ch == 0x1032) {
|
||||
if (UseMultiCode(1)) return true;
|
||||
}
|
||||
// Vowel sign u, uu, and extensions.
|
||||
ch = codes_[codes_used_].second;
|
||||
if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) ||
|
||||
ch == 0x1062 || ch == 0x1067 || ch == 0x1068 ||
|
||||
(0x1071 <= ch && ch <= 0x1074) || (0x1083 <= ch && ch <= 0x1086) ||
|
||||
ch == 0x109c || ch == 0x109d) {
|
||||
if (UseMultiCode(1)) return true;
|
||||
}
|
||||
// Tall aa, aa with optional asat.
|
||||
if (codes_[codes_used_].second == 0x102b ||
|
||||
codes_[codes_used_].second == 0x102c) {
|
||||
if (UseMultiCode(1)) return true;
|
||||
if (codes_[codes_used_].second == kMyanmarAsat) {
|
||||
if (UseMultiCode(1)) return true;
|
||||
}
|
||||
}
|
||||
// The following characters are allowed, all optional, and in sequence.
|
||||
const std::vector<char32> kSigns({0x1036, 0x1037});
|
||||
for (char32 ch : kSigns) {
|
||||
if (codes_[codes_used_].second == ch) {
|
||||
if (UseMultiCode(1)) return true;
|
||||
}
|
||||
}
|
||||
// Tone mark extensions.
|
||||
ch = codes_[codes_used_].second;
|
||||
if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 ||
|
||||
(0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) ||
|
||||
ch == 0x108f || ch == 0x109a || ch == 0x109b ||
|
||||
(0xaa7b <= ch && ch <= 0xaa7d)) {
|
||||
if (UseMultiCode(1)) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns true if the unicode is a Myanmar "letter" including consonants
|
||||
// and independent vowels. Although table 16-3 distinguishes between some
|
||||
// base consonants and vowels, the extensions make no such distinction, so we
|
||||
// put them all into a single bucket.
|
||||
/* static */
|
||||
bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
|
||||
return (0x1000 <= ch && ch <= 0x102a) || ch == 0x103f ||
|
||||
(0x1050 <= ch && ch <= 0x1055) || (0x105a <= ch && ch <= 0x105d) ||
|
||||
ch == 0x1061 || ch == 0x1065 || ch == 0x1066 ||
|
||||
(0x106e <= ch && ch <= 0x1070) || (0x1075 <= ch && ch <= 0x1080) ||
|
||||
ch == 0x108e || (0xa9e0 <= ch && ch <= 0xa9ef) ||
|
||||
(0xa9fa <= ch && ch <= 0xa9ff) || (0xaa60 <= ch && ch <= 0xaa73) ||
|
||||
ch == 0xaa7a || ch == 0xaa7e || ch == 0xaa7f;
|
||||
}
|
||||
|
||||
// Returns true if ch is a Myanmar digit or other symbol that does not take
|
||||
// part in being a syllable.
|
||||
/* static */
|
||||
bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
|
||||
IcuErrorCode err;
|
||||
UScriptCode script_code = uscript_getScript(ch, err);
|
||||
if (script_code != USCRIPT_MYANMAR && ch != Validator::kZeroWidthJoiner &&
|
||||
ch != Validator::kZeroWidthNonJoiner)
|
||||
return true;
|
||||
return (0x1040 <= ch && ch <= 0x1049) || (0x1090 <= ch && ch <= 0x1099) ||
|
||||
(0x109c <= ch && ch <= 0x109d) || (0xa9f0 <= ch && ch <= 0xa9f9) ||
|
||||
(0xaa74 <= ch && ch <= 0xaa79);
|
||||
}
|
||||
|
||||
} // namespace tesseract
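A standalone sketch of the kinzi test at the top of ConsumeGraphemeIfValid(), assuming the conventional Unicode values U+1039 for kMyanmarVirama and U+103A for kMyanmarAsat (the constants themselves are declared in headers not shown here); the code sequence is invented for illustration:

#include <cstdio>
#include <vector>

int main() {
  // NGA (U+1004) + asat + virama prefixed to a base consonant forms a kinzi.
  std::vector<int> codes = {0x1004, 0x103A, 0x1039, 0x1000};
  bool kinzi = codes.size() >= 3 && codes[0] == 0x1004 &&
               codes[1] == 0x103A && codes[2] == 0x1039;
  std::printf("kinzi prefix: %s\n", kinzi ? "yes" : "no");
  return 0;
}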
|
||||
#include "validate_myanmar.h"
|
||||
#include "errcode.h"
|
||||
#include "icuerrorcode.h"
|
||||
#include "tprintf.h"
|
||||
#include "unicode/uchar.h" // From libicu
|
||||
#include "unicode/uscript.h" // From libicu
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Returns whether codes matches the pattern for a Myanmar Grapheme.
|
||||
// Taken directly from the unicode table 16-3.
|
||||
// See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
|
||||
bool ValidateMyanmar::ConsumeGraphemeIfValid() {
|
||||
int num_codes = codes_.size();
|
||||
if (codes_used_ == num_codes) return true;
|
||||
// Other.
|
||||
  if (IsMyanmarOther(codes_[codes_used_].second)) {
    UseMultiCode(1);
    return true;
  }
  // Kinzi.
  if (codes_used_ + 2 < num_codes && codes_[codes_used_].second == 0x1004 &&
      codes_[codes_used_ + 1].second == kMyanmarAsat &&
      codes_[codes_used_ + 2].second == kMyanmarVirama) {
    ASSERT_HOST(!CodeOnlyToOutput());
    ASSERT_HOST(!CodeOnlyToOutput());
    if (UseMultiCode(3)) return true;
  }
  // Base consonant/vowel. NOTE that since everything in Myanmar appears to be
  // optional, except the base, this is the only place where invalid input can
  // be detected and false returned.
  if (IsMyanmarLetter(codes_[codes_used_].second)) {
    if (UseMultiCode(1)) return true;
  } else {
    if (report_errors_) {
      tprintf("Invalid start of Myanmar syllable:0x%x\n",
              codes_[codes_used_].second);
    }
    return false;  // One of these is required.
  }
  if (ConsumeSubscriptIfPresent()) return true;
  ConsumeOptionalSignsIfPresent();
  // What we have consumed so far is a valid syllable.
  return true;
}

// TODO(rays) Doesn't use intermediate coding like the other scripts, as there
// is little correspondence between the content of table 16-3 and the char
// classes of the Indic languages. (Experts may disagree and improve!)
// In unicode table 16-3 there is basically a long list of optional characters,
// which can be coded quite easily.
// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!
// The table also allows sequences that still result in dotted circles!!
// So with a lot of guesswork the rest have been added in a reasonable place.
Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const {
  if (IsMyanmarLetter(ch)) return CharClass::kConsonant;
  return CharClass::kOther;
}

// Helper consumes/copies a virama and any subscript consonant.
// Returns true if the end of input is reached.
bool ValidateMyanmar::ConsumeSubscriptIfPresent() {
  // Subscript consonant. It appears there can be only one.
  int num_codes = codes_.size();
  if (codes_used_ + 1 < num_codes &&
      codes_[codes_used_].second == kMyanmarVirama) {
    if (IsMyanmarLetter(codes_[codes_used_ + 1].second)) {
      ASSERT_HOST(!CodeOnlyToOutput());
      if (UseMultiCode(2)) return true;
    }
  }
  return false;
}

// Helper consumes/copies a series of optional signs.
// Returns true if the end of input is reached.
bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
  // The following characters are allowed, all optional, and in sequence.
  // An exception is kMyanmarMedialYa, which can include kMyanmarAsat.
  const std::vector<char32> kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c,
                                      0x103d, 0x103e, 0x105e, 0x105f, 0x1060,
                                      0x1081, 0x1031});
  for (char32 ch : kMedials) {
    if (codes_[codes_used_].second == ch) {
      if (UseMultiCode(1)) return true;
      if (ch == kMyanmarMedialYa &&
          codes_[codes_used_].second == kMyanmarAsat) {
        if (UseMultiCode(1)) return true;
      }
    }
  }
  // Vowel sign i, ii, ai.
  char32 ch = codes_[codes_used_].second;
  if (ch == 0x102d || ch == 0x102e || ch == 0x1032) {
    if (UseMultiCode(1)) return true;
  }
  // Vowel sign u, uu, and extensions.
  ch = codes_[codes_used_].second;
  if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) ||
      ch == 0x1062 || ch == 0x1067 || ch == 0x1068 ||
      (0x1071 <= ch && ch <= 0x1074) || (0x1083 <= ch && ch <= 0x1086) ||
      ch == 0x109c || ch == 0x109d) {
    if (UseMultiCode(1)) return true;
  }
  // Tall aa, aa with optional asat.
  if (codes_[codes_used_].second == 0x102b ||
      codes_[codes_used_].second == 0x102c) {
    if (UseMultiCode(1)) return true;
    if (codes_[codes_used_].second == kMyanmarAsat) {
      if (UseMultiCode(1)) return true;
    }
  }
  // The following characters are allowed, all optional, and in sequence.
  const std::vector<char32> kSigns({0x1036, 0x1037});
  for (char32 ch : kSigns) {
    if (codes_[codes_used_].second == ch) {
      if (UseMultiCode(1)) return true;
    }
  }
  // Tone mark extensions.
  ch = codes_[codes_used_].second;
  if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 ||
      (0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) ||
      ch == 0x108f || ch == 0x109a || ch == 0x109b ||
      (0xaa7b <= ch && ch <= 0xaa7d)) {
    if (UseMultiCode(1)) return true;
  }
  return false;
}

// Returns true if the unicode is a Myanmar "letter" including consonants
// and independent vowels. Although table 16-3 distinguishes between some
// base consonants and vowels, the extensions make no such distinction, so we
// put them all into a single bucket.
/* static */
bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
  return (0x1000 <= ch && ch <= 0x102a) || ch == 0x103f ||
         (0x1050 <= ch && ch <= 0x1055) || (0x105a <= ch && ch <= 0x105d) ||
         ch == 0x1061 || ch == 0x1065 || ch == 0x1066 ||
         (0x106e <= ch && ch <= 0x1070) || (0x1075 <= ch && ch <= 0x1080) ||
         ch == 0x108e || (0xa9e0 <= ch && ch <= 0xa9ef) ||
         (0xa9fa <= ch && ch <= 0xa9ff) || (0xaa60 <= ch && ch <= 0xaa73) ||
         ch == 0xaa7a || ch == 0xaa7e || ch == 0xaa7f;
}

// Returns true if ch is a Myanmar digit or other symbol that does not take
// part in being a syllable.
/* static */
bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
  IcuErrorCode err;
  UScriptCode script_code = uscript_getScript(ch, err);
  if (script_code != USCRIPT_MYANMAR && ch != Validator::kZeroWidthJoiner &&
      ch != Validator::kZeroWidthNonJoiner)
    return true;
  return (0x1040 <= ch && ch <= 0x1049) || (0x1090 <= ch && ch <= 0x1099) ||
         (0x109c <= ch && ch <= 0x109d) || (0xa9f0 <= ch && ch <= 0xa9f9) ||
         (0xaa74 <= ch && ch <= 0xaa79);
}

}  // namespace tesseract
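The grapheme-consumption code above is not called directly by clients; the entry point is the public Validator::ValidateCleanAndSegment API declared in validator.h below. A minimal sketch of how a caller might drive it on a UTF-32 Myanmar word (the helper name SegmentMyanmarWord and the wrapper itself are illustrative assumptions, not part of this commit):

#include <vector>
#include "validator.h"

namespace tesseract {

// Hypothetical helper: splits a UTF-32 Myanmar word into glyph-sized pieces
// using the public validation entry point declared in validator.h.
bool SegmentMyanmarWord(const std::vector<char32>& word,
                        std::vector<std::vector<char32>>* graphemes) {
  // kGlyphSplit requests repeatable glyph units; report_errors=true makes the
  // Myanmar validator log why a syllable was rejected.
  return Validator::ValidateCleanAndSegment(GraphemeNormMode::kGlyphSplit,
                                            /*report_errors=*/true, word,
                                            graphemes);
}

}  // namespace tesseract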
@ -1,47 +1,47 @@
#ifndef TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
#define TESSERACT_TRAINING_VALIDATE_MYANMAR_H_

#include "validator.h"

namespace tesseract {

// Subclass of Validator that validates and segments Myanmar.
class ValidateMyanmar : public Validator {
 public:
  ValidateMyanmar(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateMyanmar() {}

 protected:
  // Returns whether codes matches the pattern for a Myanmar Grapheme.
  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
  // parts_ and output_. Returns true if a valid Grapheme was consumed,
  // otherwise does not increment codes_used_.
  bool ConsumeGraphemeIfValid() override;
  // Returns the CharClass corresponding to the given Unicode ch.
  Validator::CharClass UnicodeToCharClass(char32 ch) const override;

 private:
  // Helper consumes/copies a virama and any subscript consonant.
  // Returns true if the end of input is reached.
  bool ConsumeSubscriptIfPresent();
  // Helper consumes/copies a series of optional signs.
  // Returns true if the end of input is reached.
  bool ConsumeOptionalSignsIfPresent();
  // Returns true if the unicode is a Myanmar "letter" including consonants
  // and independent vowels. Although table 16-3 distinguishes between some
  // base consonants and vowels, the extensions make no such distinction, so we
  // put them all into a single bucket.
  static bool IsMyanmarLetter(char32 ch);
  // Returns true if ch is a Myanmar digit or other symbol that does not take
  // part in being a syllable.
  static bool IsMyanmarOther(char32 ch);

  // Some special unicodes used only for Myanmar processing.
  static const char32 kMyanmarAsat = 0x103a;
  static const char32 kMyanmarMedialYa = 0x103b;
};

}  // namespace tesseract

#endif  // TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
@ -1,243 +1,243 @@
/**********************************************************************
 * File:        validator.h
 * Description: Base class for various text validators. Intended mainly for
 *              scripts that use a virama character.
 * Author:      Ray Smith
 * Created:     Tue May 23 2017
 *
 * (C) Copyright 2017, Google Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **********************************************************************/

#ifndef TESSERACT_TRAINING_VALIDATOR_H_
#define TESSERACT_TRAINING_VALIDATOR_H_

#include <memory>
#include <vector>
#include "unichar.h"

namespace tesseract {

// Different kinds of grapheme normalization - not just for Indic!
// A grapheme is a syllable unit in Indic and can be several unicodes.
// In other scripts, a grapheme is a base character and accent/diacritic
// combination, as not all accented characters have a single composed form.
enum class GraphemeNormMode {
  // Validation result is a single string, even if input is multi-word.
  kSingleString,
  // Standard unicode graphemes are validated and output as grapheme units.
  kCombined,
  // Graphemes are validated and sub-divided. For virama-using scripts, units
  // that correspond to repeatable glyphs are generated. (Mostly single
  // unicodes, but viramas and joiners are paired with the most sensible
  // neighbor.) For non-virama scripts, this means that base/accent pairs are
  // separated, ie the output is individual unicodes.
  kGlyphSplit,
  // The output is always single unicodes, regardless of the script.
  kIndividualUnicodes,
};

// An enum representing the scripts that use a virama character. It is
// guaranteed that the value of any element, (except kNonVirama) can be cast
// to a unicode (char32) value that represents the start of the unicode range
// of the corresponding script.
enum class ViramaScript : char32 {
  kNonVirama = 0,
  kDevanagari = 0x900,
  kBengali = 0x980,
  kGurmukhi = 0xa00,
  kGujarati = 0xa80,
  kOriya = 0xb00,
  kTamil = 0xb80,
  kTelugu = 0xc00,
  kKannada = 0xc80,
  kMalayalam = 0xd00,
  kSinhala = 0xd80,
  kMyanmar = 0x1000,
  kKhmer = 0x1780,
};

// Base class offers a validation API and protected methods to allow subclasses
// to easily build the validated/segmented output.
class Validator {
 public:
  // Validates and cleans the src vector of unicodes to the *dest, according to
  // g_mode. In the case of kSingleString, a single vector containing the whole
  // result is added to *dest. With kCombined, multiple vectors are added to
  // *dest with one grapheme in each. With kGlyphSplit, multiple vectors are
  // added to *dest with a smaller unit representing a glyph in each.
  // In case of validation error, returns false and still outputs as much as
  // possible of the input, without discarding invalid text.
  static bool ValidateCleanAndSegment(GraphemeNormMode g_mode,
                                      bool report_errors,
                                      const std::vector<char32>& src,
                                      std::vector<std::vector<char32>>* dest);

  // Returns true if the unicode ch is a non-printing zero-width mark of no
  // significance to OCR training or evaluation.
  static bool IsZeroWidthMark(char32 ch) {
    return ch == kZeroWidthSpace || ch == kLeftToRightMark ||
           ch == kRightToLeftMark || ch == kInvalid;
  }
  virtual ~Validator() {}

  // Some specific but universally useful unicodes.
  static const char32 kZeroWidthSpace;
  static const char32 kZeroWidthNonJoiner;
  static const char32 kZeroWidthJoiner;
  static const char32 kLeftToRightMark;
  static const char32 kRightToLeftMark;
  static const char32 kInvalid;

 protected:
  // These are more or less the character class identifiers in the ISCII
  // standard, section 8. They have been augmented with the Unicode meta
  // characters Zero Width Joiner and Zero Width Non Joiner, and the
  // Unicode Vedic Marks.
  // The best sources of information on Unicode and Indic scripts are:
  // http://varamozhi.sourceforge.net/iscii91.pdf
  // http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
  // http://unicode.org/faq/indic.html
  // http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx
  enum class CharClass {
    // NOTE: The values of the enum members are meaningless and arbitrary, ie
    // they are not used for sorting, or any other risky application.
    // The reason they are what they are is they are a single character
    // abbreviation that can be used in a regexp/BNF definition of a grammar,
    // IN A COMMENT, and still not relied upon in the code.
    kConsonant = 'C',
    kVowel = 'V',
    kVirama = 'H',              // (aka Halant)
    kMatra = 'M',               // (aka Dependent Vowel)
    kMatraPiece = 'P',          // unicode provides pieces of Matras.
    kVowelModifier = 'D',       // (candrabindu, anusvara, visarga, other marks)
    kZeroWidthNonJoiner = 'z',  // Unicode Zero Width Non-Joiner U+200C
    kZeroWidthJoiner = 'Z',     // Unicode Zero Width Joiner U+200D
    kVedicMark = 'v',           // Modifiers that can modify any Indic syllable.
    kNukta = 'N',               // Occurs only immediately after consonants.
    kRobat = 'R',               // Khmer only.
    kOther = 'O',               // (digits, measures, non-Indic, etc)
    // Additional classes used only by ValidateGrapheme.
    kWhitespace = ' ',
    kCombiner = 'c',  // Combiners other than virama.
  };
  typedef std::pair<CharClass, char32> IndicPair;

  Validator(ViramaScript script, bool report_errors)
      : script_(script),
        codes_used_(0),
        output_used_(0),
        report_errors_(report_errors) {}

  // Factory method that understands how to map script to the right subclass.
  static std::unique_ptr<Validator> ScriptValidator(ViramaScript script,
                                                    bool report_errors);

  // Internal version of the public static ValidateCleanAndSegment.
  // Validates and cleans the src vector of unicodes to the *dest, according to
  // its type and the given g_mode.
  // In case of validation error, returns false and returns as much as possible
  // of the input, without discarding invalid text.
  bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode,
                                       const std::vector<char32>& src,
                                       std::vector<std::vector<char32>>* dest);
  // Moves the results from parts_ or output_ to dest according to g_mode.
  void MoveResultsToDest(GraphemeNormMode g_mode,
                         std::vector<std::vector<char32>>* dest);

  // Computes and returns the ViramaScript corresponding to the most frequent
  // virama-using script in the input, or kNonVirama if none are present.
  static ViramaScript MostFrequentViramaScript(
      const std::vector<char32>& utf32);
  // Returns true if the given UTF-32 unicode is a "virama" character.
  static bool IsVirama(char32 unicode);
  // Returns true if the given UTF-32 unicode is a vedic accent.
  static bool IsVedicAccent(char32 unicode);
  // Returns true if the script is one that uses subscripts for conjuncts.
  bool IsSubscriptScript() const;

  // Helper function appends the next element of codes_ only to output_,
  // without touching parts_.
  // Returns true at the end of codes_.
  bool CodeOnlyToOutput() {
    output_.push_back(codes_[codes_used_].second);
    return ++codes_used_ == codes_.size();
  }

  // Helper function adds a length-element vector to parts_ from the last
  // length elements of output_. If there are more than length unused elements
  // in output_, adds unicodes as single-element vectors to parts_ to catch
  // output_used_ up to output_.size() - length before adding the
  // length-element vector.
  void MultiCodePart(int length) {
    while (output_used_ + length < output_.size()) {
      parts_.emplace_back(
          std::initializer_list<char32>{output_[output_used_++]});
    }
    parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]});
    while (++output_used_ < output_.size()) {
      parts_.back().push_back(output_[output_used_]);
    }
  }

  // Helper function appends the next element of codes_ to output_, and then
  // calls MultiCodePart to add the appropriate components to parts_.
  // Returns true at the end of codes_.
  bool UseMultiCode(int length) {
    output_.push_back(codes_[codes_used_].second);
    MultiCodePart(length);
    return ++codes_used_ == codes_.size();
  }

  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
  // parts_ and output_. Returns true if a valid Grapheme was consumed,
  // otherwise does not increment codes_used_.
  virtual bool ConsumeGraphemeIfValid() = 0;
  // Sets codes_ to the class codes for the given unicode text.
  void ComputeClassCodes(const std::vector<char32>& text);
  // Returns the CharClass corresponding to the given Unicode ch.
  virtual CharClass UnicodeToCharClass(char32 ch) const = 0;
  // Resets to the initial state.
  void Clear();

  // Number of unicodes in each Indic codepage.
  static const int kIndicCodePageSize = 128;
  // Lowest unicode value of any Indic script. (Devanagari).
  static const char32 kMinIndicUnicode = 0x900;
  // Highest unicode value of any consistent (ISCII-based) Indic script.
  static const char32 kMaxSinhalaUnicode = 0xdff;
  // Highest unicode value of any virama-using script. (Khmer).
  static const char32 kMaxViramaScriptUnicode = 0x17ff;
  // Some special unicodes.
  static const char32 kSinhalaVirama = 0xdca;
  static const char32 kMyanmarVirama = 0x1039;
  static const char32 kKhmerVirama = 0x17d2;

  // Script we are operating on.
  ViramaScript script_;
  // Input unicodes with assigned CharClass is the data to be validated.
  std::vector<IndicPair> codes_;
  // Glyph-like components of the input.
  std::vector<std::vector<char32>> parts_;
  // Copied validated unicodes from codes_ that are OK to output.
  std::vector<char32> output_;
  // The number of elements of codes_ that have been processed so far.
  int codes_used_;
  // The number of elements of output_ that have already been added to parts_.
  int output_used_;
  // Log error messages for reasons why text is invalid.
  bool report_errors_;
};

}  // namespace tesseract

#endif  // TESSERACT_TRAINING_VALIDATOR_H_
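To make the protected hooks concrete, here is a minimal sketch of the smallest possible subclass (purely illustrative; ValidatePassThrough does not exist in the tree) showing how ConsumeGraphemeIfValid and UnicodeToCharClass are expected to combine with the UseMultiCode helper, in the same way ValidateMyanmar does for real syllables:

#include "validator.h"

namespace tesseract {

// Hypothetical subclass: treats every code as its own one-element grapheme.
class ValidatePassThrough : public Validator {
 public:
  ValidatePassThrough(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}

 protected:
  bool ConsumeGraphemeIfValid() override {
    // Copies codes_[codes_used_] to output_ and closes it off as a
    // one-element part; consuming a single code always succeeds here.
    UseMultiCode(1);
    return true;
  }
  CharClass UnicodeToCharClass(char32 ch) const override {
    return CharClass::kOther;  // No per-class behaviour in this sketch.
  }
};

}  // namespace tesseract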