mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-27 20:59:36 +08:00
New training tool text2image
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@965 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
0e230a9d96
commit
f244ab3fc6
101
training/boxchar.cpp
Normal file
101
training/boxchar.cpp
Normal file
@ -0,0 +1,101 @@
|
||||
/**********************************************************************
|
||||
* File: boxchar.cpp
|
||||
* Description: Simple class to associate a Tesseract classification unit with
|
||||
* its bounding box so that the boxes can be rotated as the image
|
||||
* is rotated for degradation. Also includes routines to output
|
||||
* the character-tagged boxes to a boxfile.
|
||||
* Author: Ray Smith
|
||||
* Created: Mon Nov 18 2013
|
||||
*
|
||||
* (C) Copyright 2013, Google Inc.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include "boxchar.h"
|
||||
|
||||
#include <stddef.h>
|
||||
|
||||
#include "fileio.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Constructs a BoxChar holding the first len bytes of utf8_str. The object
// starts with no bounding box and with page number 0, so that page_ always
// has a defined value when it is read back or written to a box file.
BoxChar::BoxChar(const char* utf8_str, int len)
    : ch_(utf8_str, len), box_(NULL), page_(0) {
}
|
||||
|
||||
// Destroys the box held by this object. box_ may still be NULL here if
// AddBox was never called.
BoxChar::~BoxChar() {
  boxDestroy(&box_);
}
|
||||
|
||||
void BoxChar::AddBox(int x, int y, int width, int height) {
|
||||
box_ = boxCreate(x, y, width, height);
|
||||
}
|
||||
|
||||
/* static */
|
||||
void BoxChar::TranslateBoxes(int xshift, int yshift,
|
||||
vector<BoxChar*>* boxes) {
|
||||
for (int i = 0; i < boxes->size(); ++i) {
|
||||
BOX* box = (*boxes)[i]->box_;
|
||||
if (box != NULL) {
|
||||
box->x += xshift;
|
||||
box->y += yshift;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Rotate the boxes in [start_box, end_box) by the given rotation.
|
||||
// The rotation is in radians clockwise about the given center.
|
||||
/* static */
|
||||
void BoxChar::RotateBoxes(float rotation,
|
||||
int xcenter,
|
||||
int ycenter,
|
||||
int start_box,
|
||||
int end_box,
|
||||
vector<BoxChar*>* boxes) {
|
||||
Boxa* orig = boxaCreate(0);
|
||||
for (int i = start_box; i < end_box; ++i) {
|
||||
BOX* box = (*boxes)[i]->box_;
|
||||
if (box) boxaAddBox(orig, box, L_CLONE);
|
||||
}
|
||||
Boxa* rotated = boxaRotate(orig, xcenter, ycenter, rotation);
|
||||
boxaDestroy(&orig);
|
||||
for (int i = start_box, box_ind = 0; i < end_box; ++i) {
|
||||
if ((*boxes)[i]->box_) {
|
||||
boxDestroy(&((*boxes)[i]->box_));
|
||||
(*boxes)[i]->box_ = boxaGetBox(rotated, box_ind++, L_CLONE);
|
||||
}
|
||||
}
|
||||
boxaDestroy(&rotated);
|
||||
}
|
||||
|
||||
/* static */
|
||||
void BoxChar::WriteTesseractBoxFile(const string& filename, int height,
|
||||
const vector<BoxChar*>& boxes) {
|
||||
string output;
|
||||
const int kMaxLineLength = 1024;
|
||||
char buffer[kMaxLineLength];
|
||||
for (int i = 0; i < boxes.size(); ++i) {
|
||||
if (boxes[i]->box_ != NULL) {
|
||||
int nbytes = snprintf(buffer, kMaxLineLength,
|
||||
"%s %d %d %d %d %d\n",
|
||||
boxes[i]->ch_.c_str(),
|
||||
boxes[i]->box_->x,
|
||||
height - boxes[i]->box_->y - boxes[i]->box_->h,
|
||||
boxes[i]->box_->x + boxes[i]->box_->w,
|
||||
height - boxes[i]->box_->y,
|
||||
boxes[i]->page_);
|
||||
output.append(buffer, nbytes);
|
||||
}
|
||||
}
|
||||
File::WriteStringToFileOrDie(output, filename);
|
||||
}
|
||||
} // namespace tesseract
|
84
training/boxchar.h
Normal file
84
training/boxchar.h
Normal file
@ -0,0 +1,84 @@
|
||||
/**********************************************************************
|
||||
* File: boxchar.h
|
||||
* Description: Simple class to associate a Tesseract classification unit with
|
||||
* its bounding box so that the boxes can be rotated as the image
|
||||
* is rotated for degradation. Also includes routines to output
|
||||
* the character-tagged boxes to a boxfile.
|
||||
* Author: Ray Smith
|
||||
* Created: Mon Nov 18 2013
|
||||
*
|
||||
* (C) Copyright 2013, Google Inc.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef TESSERACT_TRAINING_BOXCHAR_H_
|
||||
#define TESSERACT_TRAINING_BOXCHAR_H_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "allheaders.h" // from Leptonica
|
||||
|
||||
#ifdef USE_STD_NAMESPACE
|
||||
using std::string;
|
||||
using std::vector;
|
||||
#endif
|
||||
|
||||
struct Box;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class BoxChar {
|
||||
public:
|
||||
BoxChar(const char* utf8_str, int len);
|
||||
|
||||
~BoxChar();
|
||||
|
||||
// Accessors.
|
||||
const string& ch() const { return ch_; }
|
||||
const Box* box() const { return box_; }
|
||||
const int& page() const { return page_; }
|
||||
|
||||
|
||||
// Set the box_ member.
|
||||
void AddBox(int x, int y, int width, int height);
|
||||
|
||||
void set_page(int page) { page_ = page; }
|
||||
|
||||
string* mutable_ch() { return &ch_; }
|
||||
Box* mutable_box() { return box_; }
|
||||
|
||||
static void TranslateBoxes(int xshift, int yshift,
|
||||
vector<BoxChar*>* boxes);
|
||||
|
||||
// Rotate the vector of boxes between start and end by the given rotation.
|
||||
// The rotation is in radians clockwise about the given center.
|
||||
static void RotateBoxes(float rotation,
|
||||
int xcenter,
|
||||
int ycenter,
|
||||
int start_box,
|
||||
int end_box,
|
||||
vector<BoxChar*>* boxes);
|
||||
|
||||
// Create a tesseract box file from the vector of boxes. The image height
|
||||
// is needed to convert to tesseract coordinates.
|
||||
static void WriteTesseractBoxFile(const string& name, int height,
|
||||
const vector<BoxChar*>& boxes);
|
||||
|
||||
private:
|
||||
string ch_;
|
||||
Box* box_;
|
||||
int page_;
|
||||
};
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_TRAINING_BOXCHAR_H_
|
146
training/degradeimage.cpp
Normal file
146
training/degradeimage.cpp
Normal file
@ -0,0 +1,146 @@
|
||||
/**********************************************************************
|
||||
* File: degradeimage.cpp
|
||||
* Description: Function to degrade an image (usually of text) as if it
|
||||
* has been printed and then scanned.
|
||||
* Authors: Ray Smith
|
||||
* Created: Tue Nov 19 2013
|
||||
*
|
||||
* (C) Copyright 2013, Google Inc.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include "degradeimage.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include "allheaders.h" // from leptonica
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Random rotations are drawn from [-kRotationRange, +kRotationRange] radians.
const float kRotationRange = 0.02f;
// Grey-level shift applied for each step of exposure.
const int kExposureFactor = 16;
// Amplitude of the salt and pepper noise: each pixel is moved by at most
// +/- kSaltnPepper grey levels.
const int kSaltnPepper = 5;
// The grey ramp is only applied when width + height exceeds this value.
const int kMinRampSize = 1000;

// Seed state passed to rand_r() by DegradeImage.
static unsigned int random_seed = 0x18273645;
|
||||
|
||||
// Degrade the pix as if by a print/copy/scan cycle with exposure > 0
|
||||
// corresponding to darkening on the copier and <0 lighter and 0 not copied.
|
||||
// Exposures in [-2,2] are most useful, with -3 and 3 being extreme.
|
||||
// If rotation is NULL, rotation is skipped. If *rotation is non-zero, the pix
|
||||
// is rotated by *rotation else it is randomly rotated and *rotation is
|
||||
// modified.
|
||||
// HOW IT WORKS:
|
||||
// Most of the process is really dictated by the fact that the minimum
|
||||
// available convolution is 3X3, which is too big really to simulate a
|
||||
// good quality print/scan process. (2X2 would be better.)
|
||||
// 1 pixel wide inputs are heavily smeared by the 3X3 convolution, making the
|
||||
// images generally biased to being too light, so most of the work is to make
|
||||
// them darker. 3 levels of thickening/darkening are achieved with 2 dilations,
|
||||
// (using a greyscale erosion) one heavy (by being before convolution) and one
|
||||
// light (after convolution).
|
||||
// With no dilation, after convolution, the images are so light that a heavy
|
||||
// constant offset is required to make the 0 image look reasonable. A simple
|
||||
// constant offset multiple of exposure to undo this value is enough to achieve
|
||||
// all the required lightening. This gives the advantage that exposure level 1
|
||||
// with a single dilation gives a good impression of the broken-yet-too-dark
|
||||
// problem that is often seen in scans.
|
||||
// A small random rotation gives some varying greyscale values on the edges,
|
||||
// and some random salt and pepper noise on top helps to realistically jaggy-up
|
||||
// the edges.
|
||||
// Finally a greyscale ramp provides a continuum of effects between exposure
|
||||
// levels.
|
||||
Pix* DegradeImage(Pix* input, int exposure, float* rotation) {
  // Work on an 8 bit version of the image; the caller's pix is destroyed.
  Pix* current = pixConvertTo8(input, false);
  pixDestroy(&input);
  int width = pixGetWidth(current);
  int height = pixGetHeight(current);

  if (exposure >= 2) {
    // Heavy darkening: a grey erosion before the convolution spreads the dark
    // areas. This is backwards to binary morphology,
    // see http://www.leptonica.com/grayscale-morphology.html
    Pix* eroded = pixErodeGray(current, 3, 3);
    pixDestroy(&current);
    current = eroded;
  }

  // Every mode gets a convolution, as no scanner produces an image as sharp
  // as the electronic original.
  Pix* blurred = pixBlockconv(current, 1, 1);
  pixDestroy(&current);
  current = blurred;

  // A small rotation helps to make the edges jaggy in a realistic way.
  if (rotation != NULL) {
    float radians_clockwise = *rotation;
    if (radians_clockwise == 0.0f) {
      // No angle was supplied, so pick a random one within the range.
      radians_clockwise = (2.0*rand_r(&random_seed)/RAND_MAX - 1.0) *
          kRotationRange;
    }
    Pix* rotated = pixRotate(current, radians_clockwise,
                             L_ROTATE_AREA_MAP, L_BRING_IN_WHITE,
                             0, 0);
    // Report the angle that was used so the boxes can be rotated to match.
    *rotation = radians_clockwise;
    pixDestroy(&current);
    current = rotated;
  }

  if (exposure >= 3 || exposure == 1) {
    // An erosion after the convolution is lighter than one before it, so it
    // serves for level 1 and, in addition to the earlier one, for level 3.
    // This is backwards to binary morphology,
    // see http://www.leptonica.com/grayscale-morphology.html
    Pix* eroded = pixErodeGray(current, 3, 3);
    pixDestroy(&current);
    current = eroded;
  }

  // The convolution really needed to be 2x2 to be realistic enough, but only
  // 3x3 is available, so the image is biased darker to avoid losing thin
  // strokes. For light and 0 exposure there is no dilation, so start from a
  // big darkening bias, which is undone for lighter exposures.
  int erosion_offset = (exposure <= 0) ? -3 * kExposureFactor : 0;
  // Add in a general offset of the greyscales for the exposure level so
  // a threshold of 128 gives a reasonable binary result.
  erosion_offset -= exposure * kExposureFactor;

  // Add a gradual fade over the page and a small amount of salt and pepper
  // noise to simulate noise in the sensor/paper fibres and varying
  // illumination.
  const bool apply_ramp = height + width > kMinRampSize;
  l_uint32* line = pixGetData(current);
  for (int y = 0; y < height; ++y, line += current->wpl) {
    for (int x = 0; x < width; ++x) {
      int value = GET_DATA_BYTE(line, x);
      value += rand_r(&random_seed) % (kSaltnPepper*2 + 1) - kSaltnPepper;
      if (apply_ramp)
        value -= (2*x + y) * 32 / (height + width);
      value += erosion_offset;
      // Clamp to the valid 8 bit range.
      if (value < 0)
        value = 0;
      if (value > 255)
        value = 255;
      SET_DATA_BYTE(line, x, value);
    }
  }
  return current;
}
|
||||
|
||||
} // namespace tesseract
|
36
training/degradeimage.h
Normal file
36
training/degradeimage.h
Normal file
@ -0,0 +1,36 @@
|
||||
/**********************************************************************
|
||||
* File: degradeimage.h
|
||||
* Description: Function to degrade an image (usually of text) as if it
|
||||
* has been printed and then scanned.
|
||||
* Authors: Ray Smith
|
||||
* Created: Tue Nov 19 2013
|
||||
*
|
||||
* (C) Copyright 2013, Google Inc.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
#ifndef TESSERACT_TRAINING_DEGRADEIMAGE_H_
|
||||
#define TESSERACT_TRAINING_DEGRADEIMAGE_H_
|
||||
|
||||
struct Pix;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Degrades the pix as if it had been through a print/copy/scan cycle.
// exposure > 0 corresponds to darkening on the copier, exposure < 0 to
// lightening, and 0 to not copied.
// If rotation is not NULL, the clockwise rotation in radians is saved there.
// The input pix must be 8 bit grey. (Binary with values 0 and 255 is OK.)
// The input image is destroyed and a different image is returned.
struct Pix* DegradeImage(struct Pix* input, int exposure, float* rotation);
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_TRAINING_DEGRADEIMAGE_H_
|
194
training/ligature_table.cpp
Normal file
194
training/ligature_table.cpp
Normal file
@ -0,0 +1,194 @@
|
||||
/**********************************************************************
|
||||
* File: ligature_table.cpp
|
||||
* Description: Class for adding and removing optional latin ligatures,
|
||||
* conditional on codepoint support by a specified font
|
||||
* (if specified).
|
||||
* Author: Ranjith Unnikrishnan
|
||||
* Created: Mon Nov 18 2013
|
||||
*
|
||||
* (C) Copyright 2013, Google Inc.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include "ligature_table.h"
|
||||
|
||||
#include <utility>
|
||||
|
||||
#include "pango_font_info.h"
|
||||
#include "tlog.h"
|
||||
#include "unichar.h"
|
||||
#include "unicharset.h"
|
||||
#include "unicode/errorcode.h" // from libicu
|
||||
#include "unicode/normlzr.h" // from libicu
|
||||
#include "unicode/unistr.h" // from libicu
|
||||
#include "unicode/utypes.h" // from libicu
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Returns the UTF-8 bytes that UNICHAR produces for the code point ch32.
static string EncodeAsUTF8(const char32 ch32) {
  UNICHAR encoded(ch32);
  const char* bytes = encoded.utf8();
  return string(bytes, encoded.utf8_len());
}
|
||||
|
||||
// Range of optional latin ligature characters in Unicode to build ligatures
|
||||
// from. Note that this range does not contain the custom ligatures that we
|
||||
// encode in the private use area.
|
||||
const int kMinLigature = 0xfb00;
|
||||
const int kMaxLigature = 0xfb4f;
|
||||
|
||||
/* static */
|
||||
SmartPtr<LigatureTable> LigatureTable::instance_;
|
||||
|
||||
/* static */
|
||||
LigatureTable* LigatureTable::Get() {
|
||||
if (instance_ == NULL) {
|
||||
instance_.reset(new LigatureTable());
|
||||
instance_->Init();
|
||||
}
|
||||
return instance_.get();
|
||||
}
|
||||
|
||||
// Starts with empty tables; all length bounds are 0 until Init() fills them.
LigatureTable::LigatureTable()
    : min_lig_length_(0),
      max_lig_length_(0),
      min_norm_length_(0),
      max_norm_length_(0) {
}
|
||||
|
||||
void LigatureTable::Init() {
|
||||
if (norm_to_lig_table_.empty()) {
|
||||
for (char32 lig = kMinLigature; lig <= kMaxLigature; ++lig) {
|
||||
// For each char in the range, convert to utf8, nfkc normalize, and if
|
||||
// the strings are different put the both mappings in the hash_maps.
|
||||
string lig8 = EncodeAsUTF8(lig);
|
||||
icu::UnicodeString unicode_lig8(static_cast<UChar32>(lig));
|
||||
icu::UnicodeString normed8_result;
|
||||
icu::ErrorCode status;
|
||||
icu::Normalizer::normalize(unicode_lig8, UNORM_NFKC, 0, normed8_result,
|
||||
status);
|
||||
string normed8;
|
||||
normed8_result.toUTF8String(normed8);
|
||||
// The icu::Normalizer maps the "LONG S T" ligature to "st". Correct that
|
||||
// here manually so that AddLigatures() will work as desired.
|
||||
if (lig8 == "\uFB05")
|
||||
normed8 = "ſt";
|
||||
int lig_length = lig8.length();
|
||||
int norm_length = normed8.size();
|
||||
if (normed8 != lig8 && lig_length > 1 && norm_length > 1) {
|
||||
norm_to_lig_table_[normed8] = lig8;
|
||||
lig_to_norm_table_[lig8] = normed8;
|
||||
if (min_lig_length_ == 0 || lig_length < min_lig_length_)
|
||||
min_lig_length_ = lig_length;
|
||||
if (lig_length > max_lig_length_)
|
||||
max_lig_length_ = lig_length;
|
||||
if (min_norm_length_ == 0 || norm_length < min_norm_length_)
|
||||
min_norm_length_ = norm_length;
|
||||
if (norm_length > max_norm_length_)
|
||||
max_norm_length_ = norm_length;
|
||||
}
|
||||
}
|
||||
// Add custom extra ligatures.
|
||||
for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != NULL; ++i) {
|
||||
norm_to_lig_table_[UNICHARSET::kCustomLigatures[i][0]] =
|
||||
UNICHARSET::kCustomLigatures[i][1];
|
||||
int norm_length = strlen(UNICHARSET::kCustomLigatures[i][0]);
|
||||
if (min_norm_length_ == 0 || norm_length < min_norm_length_)
|
||||
min_norm_length_ = norm_length;
|
||||
if (norm_length > max_norm_length_)
|
||||
max_norm_length_ = norm_length;
|
||||
|
||||
lig_to_norm_table_[UNICHARSET::kCustomLigatures[i][1]] =
|
||||
UNICHARSET::kCustomLigatures[i][0];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Returns a copy of str in which every character found in
// lig_to_norm_table_ is replaced by its mapped string; all other characters
// are copied through unchanged.
string LigatureTable::RemoveLigatures(const string& str) const {
  string result;
  UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
  // Holds the UTF-8 bytes of one character plus a terminating NUL.
  char utf8_buf[5];
  for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length());
       it != it_end; ++it) {
    int utf8_len = it.get_utf8(utf8_buf);
    utf8_buf[utf8_len] = '\0';
    LigHash::const_iterator found = lig_to_norm_table_.find(utf8_buf);
    if (found == lig_to_norm_table_.end()) {
      result += utf8_buf;
    } else {
      result += found->second;
    }
  }
  return result;
}
|
||||
|
||||
// Returns a copy of str in which every character that matches a ligature in
// UNICHARSET::kCustomLigatures is replaced by that entry's normalized
// string; all other characters are copied through unchanged.
string LigatureTable::RemoveCustomLigatures(const string& str) const {
  string result;
  UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
  // Holds the UTF-8 bytes of one character plus a terminating NUL.
  char utf8_buf[5];
  for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length());
       it != it_end; ++it) {
    int utf8_len = it.get_utf8(utf8_buf);
    utf8_buf[utf8_len] = '\0';
    // Find the first custom ligature equal to this character, if any.
    const char* replacement = NULL;
    for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != NULL; ++i) {
      if (strcmp(utf8_buf, UNICHARSET::kCustomLigatures[i][1]) == 0) {
        replacement = UNICHARSET::kCustomLigatures[i][0];
        break;
      }
    }
    if (replacement != NULL) {
      result += replacement;
    } else {
      result += utf8_buf;
    }
  }
  return result;
}
|
||||
|
||||
// Returns a copy of str in which byte sequences found in norm_to_lig_table_
// are replaced by their ligature. At each position the longest candidate is
// tried first. If font is non-NULL, a ligature is only used when the font
// reports that it can render it.
string LigatureTable::AddLigatures(const string& str,
                                   const PangoFontInfo* font) const {
  string result;
  int len = str.size();
  int pos = 0;
  while (pos < len - min_norm_length_ + 1) {
    int consumed = 0;
    for (int liglen = max_norm_length_; liglen >= min_norm_length_; --liglen) {
      if (pos + liglen > len) continue;  // Candidate would run off the end.
      string lig_cand = str.substr(pos, liglen);
      LigHash::const_iterator it = norm_to_lig_table_.find(lig_cand);
      if (it == norm_to_lig_table_.end()) continue;
      tlog(3, "Considering %s -> %s\n", lig_cand.c_str(),
           it->second.c_str());
      // Test for renderability.
      if (font != NULL &&
          !font->CanRenderString(it->second.data(), it->second.length())) {
        continue;  // Not renderable
      }
      // Found a match so convert it.
      consumed = liglen;
      result += it->second;
      tlog(2, "Substituted %s -> %s\n", lig_cand.c_str(),
           it->second.c_str());
      break;
    }
    if (consumed == 0) {
      // Nothing matched at this position; copy one byte and move on.
      result += str[pos];
      consumed = 1;
    }
    pos += consumed;
  }
  // Copy whatever is left, which is too short to hold any candidate.
  result += str.substr(pos, len - pos);
  return result;
}
|
||||
|
||||
} // namespace tesseract
|
79
training/ligature_table.h
Normal file
79
training/ligature_table.h
Normal file
@ -0,0 +1,79 @@
|
||||
/**********************************************************************
|
||||
* File: ligature_table.h
|
||||
* Description: Class for adding and removing optional latin ligatures,
|
||||
* conditional on codepoint support by a specified font
|
||||
* (if specified).
|
||||
* Author: Ranjith Unnikrishnan
|
||||
* Created: Mon Nov 18 2013
|
||||
*
|
||||
* (C) Copyright 2013, Google Inc.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef TRAININGDATA_LIGATURE_TABLE_H_
|
||||
#define TRAININGDATA_LIGATURE_TABLE_H_
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "hashfn.h"
|
||||
#include "util.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class PangoFontInfo; // defined in pango_font_info.h
|
||||
|
||||
// Map to substitute strings for ligatures.
|
||||
typedef hash_map<string, string, StringHash> LigHash;
|
||||
|
||||
class LigatureTable {
|
||||
public:
|
||||
// Get a static instance of this class.
|
||||
static LigatureTable* Get();
|
||||
|
||||
// Convert the utf8 string so that ligaturizable sequences, such as "fi" get
|
||||
// replaced by the (utf8 code for) appropriate ligature characters. Only do so
|
||||
// if the corresponding ligature character is renderable in the current font.
|
||||
string AddLigatures(const string& str, const PangoFontInfo* font) const;
|
||||
// Remove all ligatures.
|
||||
string RemoveLigatures(const string& str) const;
|
||||
// Remove only custom ligatures (eg. "ct") encoded in the private-use-area.
|
||||
string RemoveCustomLigatures(const string& str) const;
|
||||
|
||||
const LigHash& norm_to_lig_table() const {
|
||||
return norm_to_lig_table_;
|
||||
}
|
||||
const LigHash& lig_to_norm_table() const {
|
||||
return lig_to_norm_table_;
|
||||
}
|
||||
|
||||
protected:
|
||||
LigatureTable();
|
||||
// Initialize the hash tables mapping between ligature strings and the
|
||||
// corresponding ligature characters.
|
||||
void Init();
|
||||
|
||||
static SmartPtr<LigatureTable> instance_;
|
||||
LigHash norm_to_lig_table_;
|
||||
LigHash lig_to_norm_table_;
|
||||
int min_lig_length_;
|
||||
int max_lig_length_;
|
||||
int min_norm_length_;
|
||||
int max_norm_length_;
|
||||
|
||||
private:
|
||||
LigatureTable(const LigatureTable&);
|
||||
void operator=(const LigatureTable&);
|
||||
};
|
||||
|
||||
} // namespace tesseract
|
||||
|
||||
#endif  // TRAININGDATA_LIGATURE_TABLE_H_
|
705
training/pango_font_info.cpp
Normal file
705
training/pango_font_info.cpp
Normal file
@ -0,0 +1,705 @@
|
||||
/**********************************************************************
|
||||
* File: pango_font_info.cpp
|
||||
* Description: Font-related objects and helper functions
|
||||
* Author: Ranjith Unnikrishnan
|
||||
* Created: Mon Nov 18 2013
|
||||
*
|
||||
* (C) Copyright 2013, Google Inc.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include "pango_font_info.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/param.h>
|
||||
#include <algorithm>
|
||||
|
||||
#include "commandlineflags.h"
|
||||
#include "fileio.h"
|
||||
#include "normstrngs.h"
|
||||
#include "tlog.h"
|
||||
#include "unichar.h"
|
||||
#include "util.h"
|
||||
#include "pango/pango-context.h"
|
||||
#include "pango/pango-font.h"
|
||||
#include "pango/pango-glyph-item.h"
|
||||
#include "pango/pango-glyph.h"
|
||||
#include "pango/pango-layout.h"
|
||||
#include "pango/pango-utils.h"
|
||||
#include "pango/pangocairo.h"
|
||||
#include "pango/pangofc-font.h"
|
||||
|
||||
// Directory of fonts that fontconfig is pointed at.
STRING_PARAM_FLAG(fonts_dir, "/auto/ocr-data/tesstraining/fonts",
                  "Overrides system default font location");
// Directory that receives the generated fonts.conf and the fontconfig cache.
STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp",
                  "Overrides fontconfig default temporary dir");
// When set, cache files in fontconfig_tmpdir are deleted before fontconfig
// is initialized.
BOOL_PARAM_FLAG(fontconfig_refresh_cache, false,
                "Does a one-time deletion of cache files from the "
                "fontconfig_tmpdir before initializing fontconfig.");
|
||||
|
||||
#ifndef USE_STD_NAMESPACE
|
||||
#include "ocr/trainingdata/typesetting/legacy_fonts.h"
|
||||
// Restricts the known fonts to the list in legacy_fonts.h.
// The help text is built from adjacent string literals, so the first one
// must end in a space to keep "to" and "the" from running together.
BOOL_PARAM_FLAG(use_only_legacy_fonts, false,
                "Overrides --fonts_dir and sets the known universe of fonts to "
                "the list in legacy_fonts.h");
|
||||
// Compatibility with pango 1.20.
|
||||
#include "pango/pango-glyph-item-private.h"
|
||||
#define pango_glyph_item_iter_init_start _pango_glyph_item_iter_init_start
|
||||
#define pango_glyph_item_iter_next_cluster _pango_glyph_item_iter_next_cluster
|
||||
#else
|
||||
using std::pair;
|
||||
#endif
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Output resolution assumed by default. It is needed only to be able to
// provide font metrics in pixels.
const int kDefaultResolution = 300;
|
||||
|
||||
// Creates an empty font info with no description and the default resolution.
// Clear() sets the remaining members to their empty values.
PangoFontInfo::PangoFontInfo()
    : desc_(NULL), resolution_(kDefaultResolution) {
  Clear();
}
|
||||
|
||||
// Creates a font info from the description string desc. If the string cannot
// be parsed, an error is printed and the object is reset with Clear().
PangoFontInfo::PangoFontInfo(const string& desc)
    : desc_(NULL), resolution_(kDefaultResolution) {
  bool parsed = ParseFontDescriptionName(desc);
  if (parsed) return;
  tprintf("ERROR: Could not parse %s\n", desc.c_str());
  Clear();
}
|
||||
|
||||
void PangoFontInfo::Clear() {
|
||||
font_size_ = 0;
|
||||
is_bold_ = false;
|
||||
is_italic_ = false;
|
||||
is_smallcaps_ = false;
|
||||
is_monospace_ = false;
|
||||
family_name_.clear();
|
||||
font_type_ = UNKNOWN;
|
||||
if (desc_) {
|
||||
pango_font_description_free(desc_);
|
||||
desc_ = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the string form of the held Pango font description, or an empty
// string if there is no description.
string PangoFontInfo::DescriptionName() const {
  if (desc_ == NULL) return "";
  char* pango_str = pango_font_description_to_string(desc_);
  string result(pango_str);
  // The string from Pango is released with g_free once it has been copied.
  g_free(pango_str);
  return result;
}
|
||||
|
||||
// Initializes Fontconfig for use by writing a fake fonts.conf file into the
|
||||
// FLAGS_fontconfigs_tmpdir directory, that points to the supplied
|
||||
// FLAGS_fonts_dir, and then overrides the FONTCONFIG_PATH environment variable
|
||||
// to point to this fonts.conf file.
|
||||
static void InitFontconfig() {
|
||||
static bool init_fontconfig = false;
|
||||
if (init_fontconfig || FLAGS_fonts_dir.empty()) {
|
||||
init_fontconfig = true;
|
||||
return;
|
||||
}
|
||||
if (FLAGS_fontconfig_refresh_cache) {
|
||||
tprintf("Deleting cache files from %s\n", FLAGS_fontconfig_tmpdir.c_str());
|
||||
File::DeleteMatchingFiles(File::JoinPath(
|
||||
FLAGS_fontconfig_tmpdir.c_str(), "*cache-2").c_str());
|
||||
}
|
||||
tprintf("Initializing fontconfig\n");
|
||||
string fonts_dir = File::JoinPath(
|
||||
FLAGS_fonts_dir.c_str(), "google3/ocr/trainingdata/typesetting/testdata");
|
||||
const int MAX_FONTCONF_FILESIZE = 1024;
|
||||
char fonts_conf_template[MAX_FONTCONF_FILESIZE];
|
||||
snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE,
|
||||
"<?xml version=\"1.0\"?>\n"
|
||||
"<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n"
|
||||
"<fontconfig>\n"
|
||||
"<dir>%s</dir>\n"
|
||||
"<cachedir>%s</cachedir>\n"
|
||||
"<config></config>\n"
|
||||
"</fontconfig>", FLAGS_fonts_dir.c_str(),
|
||||
FLAGS_fontconfig_tmpdir.c_str());
|
||||
string fonts_conf_file = File::JoinPath(FLAGS_fontconfig_tmpdir.c_str(),
|
||||
"fonts.conf");
|
||||
File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file);
|
||||
setenv("FONTCONFIG_PATH", FLAGS_fontconfig_tmpdir.c_str(), true);
|
||||
// Fix the locale so that the reported font names are consistent.
|
||||
setenv("LANG", "en_US.utf8", true);
|
||||
init_fontconfig = true;
|
||||
}
|
||||
|
||||
|
||||
// Fills *families and *n_families with the font families reported by the
// default Pango-Cairo font map, after making sure Fontconfig is initialized.
// Callers in this file release the returned array with g_free.
static void ListFontFamilies(PangoFontFamily*** families,
                             int* n_families) {
  InitFontconfig();
  PangoFontMap* default_map = pango_cairo_font_map_get_default();
  DISABLE_HEAP_LEAK_CHECK;
  pango_font_map_list_families(default_map, families, n_families);
}
|
||||
|
||||
// Inspects whether a given font family is monospace. If the font is not
|
||||
// available, it cannot make a decision and returns false by default.
|
||||
static bool IsMonospaceFontFamily(const char* family_name) {
|
||||
PangoFontFamily** families = 0;
|
||||
int n_families = 0;
|
||||
bool is_monospace = false;
|
||||
ListFontFamilies(&families, &n_families);
|
||||
ASSERT_HOST(n_families > 0);
|
||||
bool found = false;
|
||||
for (int i = 0; i < n_families; ++i) {
|
||||
if (!strcasecmp(family_name, pango_font_family_get_name(families[i]))) {
|
||||
is_monospace = pango_font_family_is_monospace(families[i]);
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!found) {
|
||||
tlog(1, "Could not find monospace property of family %s\n", family_name);
|
||||
}
|
||||
g_free(families);
|
||||
return is_monospace;
|
||||
}
|
||||
|
||||
bool PangoFontInfo::ParseFontDescription(const PangoFontDescription *desc) {
|
||||
Clear();
|
||||
const char* family = pango_font_description_get_family(desc);
|
||||
if (!family) {
|
||||
char* desc_str = pango_font_description_to_string(desc);
|
||||
tprintf("WARNING: Could not parse family name from description: '%s'\n",
|
||||
desc_str);
|
||||
g_free(desc_str);
|
||||
return false;
|
||||
}
|
||||
family_name_ = string(family);
|
||||
desc_ = pango_font_description_copy(desc);
|
||||
is_monospace_ = IsMonospaceFontFamily(family);
|
||||
|
||||
// Set font size in points
|
||||
font_size_ = pango_font_description_get_size(desc);
|
||||
if (!pango_font_description_get_size_is_absolute(desc)) {
|
||||
font_size_ /= PANGO_SCALE;
|
||||
}
|
||||
|
||||
PangoStyle style = pango_font_description_get_style(desc);
|
||||
is_italic_ = (PANGO_STYLE_ITALIC == style ||
|
||||
PANGO_STYLE_OBLIQUE == style);
|
||||
is_smallcaps_ = (pango_font_description_get_variant(desc)
|
||||
== PANGO_VARIANT_SMALL_CAPS);
|
||||
|
||||
is_bold_ = (pango_font_description_get_weight(desc) >= PANGO_WEIGHT_BOLD);
|
||||
// We dont have a way to detect whether a font is of type Fraktur. The fonts
|
||||
// we currently use all have "Fraktur" in their family name, so we do a
|
||||
// fragile but functional check for that here.
|
||||
is_fraktur_ = (strcasestr(family, "Fraktur") != NULL);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool PangoFontInfo::ParseFontDescriptionName(const string& name) {
|
||||
PangoFontDescription *desc = pango_font_description_from_string(name.c_str());
|
||||
bool success = ParseFontDescription(desc);
|
||||
pango_font_description_free(desc);
|
||||
return success;
|
||||
}
|
||||
|
||||
// Returns the PangoFont structure corresponding to the closest available font
|
||||
// in the font map. Note that if the font is wholly missing, this could
|
||||
// correspond to a completely different font family and face.
|
||||
PangoFont* PangoFontInfo::ToPangoFont() const {
|
||||
InitFontconfig();
|
||||
PangoFontMap* font_map = pango_cairo_font_map_get_default();
|
||||
PangoContext* context = pango_context_new();
|
||||
pango_cairo_context_set_resolution(context, resolution_);
|
||||
pango_context_set_font_map(context, font_map);
|
||||
PangoFont* font = NULL;
|
||||
{
|
||||
DISABLE_HEAP_LEAK_CHECK;
|
||||
font = pango_font_map_load_font(font_map, context, desc_);
|
||||
}
|
||||
g_object_unref(context);
|
||||
return font;
|
||||
}
|
||||
|
||||
bool PangoFontInfo::CoversUTF8Text(const char* utf8_text, int byte_length) const {
|
||||
PangoFont* font = ToPangoFont();
|
||||
PangoCoverage* coverage = pango_font_get_coverage(font, NULL);
|
||||
for (UNICHAR::const_iterator it = UNICHAR::begin(utf8_text, byte_length);
|
||||
it != UNICHAR::end(utf8_text, byte_length);
|
||||
++it) {
|
||||
if (IsWhitespace(*it) || pango_is_zero_width(*it))
|
||||
continue;
|
||||
if (pango_coverage_get(coverage, *it) != PANGO_COVERAGE_EXACT) {
|
||||
char tmp[5];
|
||||
int len = it.get_utf8(tmp);
|
||||
tmp[len] = '\0';
|
||||
tlog(2, "'%s' (U+%x) not covered by font\n", tmp, *it);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Removes, in place, every character of *utf8_text that the font does not
// cover exactly. Whitespace and zero-width characters are always kept.
// Returns the number of characters dropped. If no font can be loaded, every
// character that is neither whitespace nor zero-width is dropped.
int PangoFontInfo::DropUncoveredChars(string* utf8_text) const {
  PangoFont* font = ToPangoFont();
  PangoCoverage* coverage =
      (font != NULL) ? pango_font_get_coverage(font, NULL) : NULL;
  int num_dropped_chars = 0;
  // Maintain a read iterator and a write pointer into the same string. For
  // space efficiency each covered UTF8 character is copied from the former to
  // the latter, and at the end the string is resized to the right length.
  // The write pointer never gets ahead of the read position.
  char* out = const_cast<char*>(utf8_text->c_str());
  const UNICHAR::const_iterator it_end =
      UNICHAR::end(utf8_text->c_str(), utf8_text->length());
  UNICHAR::const_iterator it =
      UNICHAR::begin(utf8_text->c_str(), utf8_text->length());
  while (it != it_end) {
    const int unicode = *it;
    const char* utf8_char = it.utf8_data();
    const int utf8_len = it.utf8_len();
    const bool keep =
        IsWhitespace(unicode) || pango_is_zero_width(unicode) ||
        (coverage != NULL &&
         pango_coverage_get(coverage, unicode) == PANGO_COVERAGE_EXACT);
    if (!keep && TLOG_IS_ON(2)) {
      char tmp[5];
      int len = it.get_utf8(tmp);
      tmp[len] = '\0';
      tlog(2, "'%s' (U+%x) not covered by font\n", tmp, unicode);
    }
    // Move the iterator forward BEFORE the data gets modified: once earlier
    // characters have been dropped, the copy below can overwrite the leading
    // bytes of the character the iterator is currently positioned on.
    ++it;
    if (!keep) {
      ++num_dropped_chars;
      continue;
    }
    // Source and destination may overlap, so memmove rather than strncpy.
    memmove(out, utf8_char, utf8_len);
    out += utf8_len;
  }
  utf8_text->resize(out - utf8_text->c_str());
  // Release the references obtained above; previously both were leaked.
  if (coverage != NULL) pango_coverage_unref(coverage);
  if (font != NULL) g_object_unref(font);
  return num_dropped_chars;
}
|
||||
|
||||
bool PangoFontInfo::GetSpacingProperties(const string& utf8_char,
|
||||
int* x_bearing, int* x_advance) const {
|
||||
// Convert to equivalent PangoFont structure
|
||||
PangoFont* font = ToPangoFont();
|
||||
// Find the glyph index in the font for the supplied utf8 character.
|
||||
int total_advance = 0;
|
||||
int min_bearing = 0;
|
||||
// Handle multi-unicode strings by reporting the left-most position of the
|
||||
// x-bearing, and right-most position of the x-advance if the string were to
|
||||
// be rendered.
|
||||
const UNICHAR::const_iterator it_begin = UNICHAR::begin(utf8_char.c_str(),
|
||||
utf8_char.length());
|
||||
const UNICHAR::const_iterator it_end = UNICHAR::end(utf8_char.c_str(),
|
||||
utf8_char.length());
|
||||
for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
|
||||
PangoGlyph glyph_index = pango_fc_font_get_glyph(
|
||||
reinterpret_cast<PangoFcFont*>(font), *it);
|
||||
if (!glyph_index) {
|
||||
// Glyph for given unicode character doesn't exist in font.
|
||||
return false;
|
||||
}
|
||||
// Find the ink glyph extents for the glyph
|
||||
PangoRectangle ink_rect, logical_rect;
|
||||
pango_font_get_glyph_extents(font, glyph_index, &ink_rect, &logical_rect);
|
||||
pango_extents_to_pixels(&ink_rect, NULL);
|
||||
pango_extents_to_pixels(&logical_rect, NULL);
|
||||
|
||||
int bearing = total_advance + PANGO_LBEARING(ink_rect);
|
||||
if (it == it_begin || bearing < min_bearing) {
|
||||
min_bearing = bearing;
|
||||
}
|
||||
total_advance += PANGO_RBEARING(logical_rect);
|
||||
}
|
||||
*x_bearing = min_bearing;
|
||||
*x_advance = total_advance;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool PangoFontInfo::CanRenderString(const char* utf8_word, int len) const {
|
||||
vector<string> graphemes;
|
||||
return CanRenderString(utf8_word, len, &graphemes);
|
||||
}
|
||||
|
||||
bool PangoFontInfo::CanRenderString(const char* utf8_word, int len,
|
||||
vector<string>* graphemes) const {
|
||||
if (graphemes) graphemes->clear();
|
||||
// We check for font coverage of the text first, as otherwise Pango could
|
||||
// (undesirably) fall back to another font that does have the required
|
||||
// coverage.
|
||||
if (!CoversUTF8Text(utf8_word, len)) {
|
||||
return false;
|
||||
}
|
||||
// U+25CC dotted circle character that often (but not always) gets rendered
|
||||
// when there is an illegal grapheme sequence.
|
||||
const char32 kDottedCircleGlyph = 9676;
|
||||
bool bad_glyph = false;
|
||||
PangoFontMap* font_map = pango_cairo_font_map_get_default();
|
||||
PangoContext* context = pango_context_new();
|
||||
pango_context_set_font_map(context, font_map);
|
||||
PangoLayout* layout = pango_layout_new(context);
|
||||
if (desc_) {
|
||||
pango_layout_set_font_description(layout, desc_);
|
||||
} else {
|
||||
PangoFontDescription *desc = pango_font_description_from_string(
|
||||
DescriptionName().c_str());
|
||||
pango_layout_set_font_description(layout, desc);
|
||||
pango_font_description_free(desc);
|
||||
}
|
||||
pango_layout_set_text(layout, utf8_word, len);
|
||||
PangoLayoutIter* run_iter = NULL;
|
||||
{ // Fontconfig caches some information here that is not freed before exit.
|
||||
DISABLE_HEAP_LEAK_CHECK;
|
||||
run_iter = pango_layout_get_iter(layout);
|
||||
}
|
||||
do {
|
||||
PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter);
|
||||
if (!run) {
|
||||
tlog(2, "Found end of line NULL run marker\n");
|
||||
continue;
|
||||
}
|
||||
PangoGlyph dotted_circle_glyph;
|
||||
PangoFont* font = run->item->analysis.font;
|
||||
dotted_circle_glyph = pango_fc_font_get_glyph(
|
||||
reinterpret_cast<PangoFcFont*>(font), kDottedCircleGlyph);
|
||||
if (TLOG_IS_ON(2)) {
|
||||
PangoFontDescription* desc = pango_font_describe(font);
|
||||
char* desc_str = pango_font_description_to_string(desc);
|
||||
tlog(2, "Desc of font in run: %s\n", desc_str);
|
||||
g_free(desc_str);
|
||||
pango_font_description_free(desc);
|
||||
}
|
||||
|
||||
PangoGlyphItemIter cluster_iter;
|
||||
gboolean have_cluster;
|
||||
for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter,
|
||||
run, utf8_word);
|
||||
have_cluster && !bad_glyph;
|
||||
have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
|
||||
const int start_byte_index = cluster_iter.start_index;
|
||||
const int end_byte_index = cluster_iter.end_index;
|
||||
int start_glyph_index = cluster_iter.start_glyph;
|
||||
int end_glyph_index = cluster_iter.end_glyph;
|
||||
string cluster_text = string(utf8_word + start_byte_index,
|
||||
end_byte_index - start_byte_index);
|
||||
if (graphemes) graphemes->push_back(cluster_text);
|
||||
if (IsUTF8Whitespace(cluster_text.c_str())) {
|
||||
tlog(2, "Skipping whitespace\n");
|
||||
continue;
|
||||
}
|
||||
if (TLOG_IS_ON(2)) {
|
||||
printf("start_byte=%d end_byte=%d start_glyph=%d end_glyph=%d ",
|
||||
start_byte_index, end_byte_index,
|
||||
start_glyph_index, end_glyph_index);
|
||||
}
|
||||
for (int i = start_glyph_index,
|
||||
step = (end_glyph_index > start_glyph_index) ? 1 : -1;
|
||||
!bad_glyph && i != end_glyph_index; i+= step) {
|
||||
const bool unknown_glyph =
|
||||
(cluster_iter.glyph_item->glyphs->glyphs[i].glyph &
|
||||
PANGO_GLYPH_UNKNOWN_FLAG);
|
||||
const bool illegal_glyph =
|
||||
(cluster_iter.glyph_item->glyphs->glyphs[i].glyph ==
|
||||
dotted_circle_glyph);
|
||||
bad_glyph = unknown_glyph || illegal_glyph;
|
||||
if (TLOG_IS_ON(2)) {
|
||||
printf("(%d=%d)", cluster_iter.glyph_item->glyphs->glyphs[i].glyph,
|
||||
bad_glyph ? 1 : 0);
|
||||
}
|
||||
}
|
||||
if (TLOG_IS_ON(2)) {
|
||||
printf(" '%s'\n", cluster_text.c_str());
|
||||
}
|
||||
if (bad_glyph)
|
||||
tlog(1, "Found illegal glyph!\n");
|
||||
}
|
||||
} while (!bad_glyph && pango_layout_iter_next_run(run_iter));
|
||||
|
||||
pango_layout_iter_free(run_iter);
|
||||
g_object_unref(context);
|
||||
g_object_unref(layout);
|
||||
if (bad_glyph && graphemes) graphemes->clear();
|
||||
return !bad_glyph;
|
||||
}
|
||||
|
||||
|
||||
// ------------------------ FontUtils ------------------------------------
|
||||
|
||||
// Returns whether the specified font description is available in the fonts
|
||||
// directory.
|
||||
//
|
||||
// The generated list of font families and faces includes "synthesized" font
|
||||
// faces that are not truly loadable. Pango versions >=1.18 have a
|
||||
// pango_font_face_is_synthesized method that can be used to prune the list.
|
||||
// Until then, we are restricted to using a hack where we try to load the font
|
||||
// from the font_map, and then check what we loaded to see if it has the
|
||||
// description we expected. If it is not, then the font is deemed unavailable.
|
||||
/* static */
|
||||
bool FontUtils::IsAvailableFont(const char* query_desc) {
|
||||
PangoFontDescription *desc = pango_font_description_from_string(query_desc);
|
||||
PangoFont* selected_font = NULL;
|
||||
{
|
||||
InitFontconfig();
|
||||
PangoFontMap* font_map = pango_cairo_font_map_get_default();
|
||||
PangoContext* context = pango_context_new();
|
||||
pango_context_set_font_map(context, font_map);
|
||||
{
|
||||
DISABLE_HEAP_LEAK_CHECK;
|
||||
selected_font = pango_font_map_load_font(font_map, context, desc);
|
||||
}
|
||||
g_object_unref(context);
|
||||
}
|
||||
PangoFontDescription* selected_desc = pango_font_describe(selected_font);
|
||||
|
||||
bool equal = pango_font_description_equal(desc, selected_desc);
|
||||
tlog(3, "query weight = %d \t selected weight =%d\n",
|
||||
pango_font_description_get_weight(desc),
|
||||
pango_font_description_get_weight(selected_desc));
|
||||
|
||||
char* selected_desc_str = pango_font_description_to_string(selected_desc);
|
||||
tlog(2, "query_desc: '%s' Selected: 's'\n", query_desc, selected_desc_str);
|
||||
|
||||
g_free(selected_desc_str);
|
||||
pango_font_description_free(selected_desc);
|
||||
pango_font_description_free(desc);
|
||||
return equal;
|
||||
}
|
||||
|
||||
// Returns true if query is exactly (case-sensitively) one of the generic
// family names "Sans", "Serif" or "Monospace".
static bool ShouldIgnoreFontFamilyName(const char* query) {
  static const char* const kGenericFamilies[] = {
    "Sans", "Serif", "Monospace"
  };
  const int kNumGenericFamilies =
      sizeof(kGenericFamilies) / sizeof(kGenericFamilies[0]);
  for (int idx = 0; idx < kNumGenericFamilies; ++idx) {
    if (strcmp(kGenericFamilies[idx], query) == 0) {
      return true;
    }
  }
  return false;
}
|
||||
|
||||
// Outputs description names of available fonts.
|
||||
/* static */
|
||||
const vector<string>& FontUtils::ListAvailableFonts() {
|
||||
static vector<string> available_fonts_; // cache list
|
||||
if (available_fonts_.size()) {
|
||||
return available_fonts_;
|
||||
}
|
||||
#ifndef USE_STD_NAMESPACE
|
||||
if (FLAGS_use_only_legacy_fonts) {
|
||||
// Restrict view to list of fonts in legacy_fonts.h
|
||||
tprintf("Using list of legacy fonts only\n");
|
||||
const int kNumFontLists = 4;
|
||||
for (int i = 0; i < kNumFontLists; ++i) {
|
||||
for (int j = 0; kFontlists[i][j] != NULL; ++j) {
|
||||
available_fonts_.push_back(kFontlists[i][j]);
|
||||
}
|
||||
}
|
||||
return available_fonts_;
|
||||
}
|
||||
#endif
|
||||
|
||||
PangoFontFamily** families = 0;
|
||||
int n_families = 0;
|
||||
ListFontFamilies(&families, &n_families);
|
||||
for (int i = 0; i < n_families; ++i) {
|
||||
const char* family_name = pango_font_family_get_name(families[i]);
|
||||
tlog(2, "Listing family %s\n", family_name);
|
||||
if (ShouldIgnoreFontFamilyName(family_name))
|
||||
continue;
|
||||
|
||||
int n_faces;
|
||||
PangoFontFace** faces = NULL;
|
||||
pango_font_family_list_faces(families[i], &faces, &n_faces);
|
||||
for (int j = 0; j < n_faces; ++j) {
|
||||
PangoFontDescription* desc = pango_font_face_describe(faces[j]);
|
||||
char* desc_str = pango_font_description_to_string(desc);
|
||||
if (IsAvailableFont(desc_str)) {
|
||||
available_fonts_.push_back(desc_str);
|
||||
}
|
||||
pango_font_description_free(desc);
|
||||
g_free(desc_str);
|
||||
}
|
||||
g_free(faces);
|
||||
}
|
||||
g_free(families);
|
||||
sort(available_fonts_.begin(), available_fonts_.end());
|
||||
return available_fonts_;
|
||||
}
|
||||
|
||||
|
||||
// Resizes *unichar_bitmap to hold one flag per value in [0, 0x10FFFF] and, for
// every value from 33 upwards accepted by IsInterchangeValid(), sets the flag
// to whether the coverage map reports exact coverage for it.
static void CharCoverageMapToBitmap(PangoCoverage* coverage,
                                    vector<bool>* unichar_bitmap) {
  const int kMinUnicodeValue = 33;
  const int kMaxUnicodeValue = 0x10FFFF;
  vector<bool>& bitmap = *unichar_bitmap;
  bitmap.resize(kMaxUnicodeValue + 1, false);
  // Mark off characters that the font can render.
  for (int code = kMinUnicodeValue; code <= kMaxUnicodeValue; ++code) {
    if (!IsInterchangeValid(code)) {
      continue;
    }
    bitmap[code] =
        (PANGO_COVERAGE_EXACT == pango_coverage_get(coverage, code));
  }
}
|
||||
|
||||
/* static */
|
||||
void FontUtils::GetAllRenderableCharacters(vector<bool>* unichar_bitmap) {
|
||||
const vector<string>& all_fonts = ListAvailableFonts();
|
||||
return GetAllRenderableCharacters(all_fonts, unichar_bitmap);
|
||||
}
|
||||
|
||||
/* static */
|
||||
void FontUtils::GetAllRenderableCharacters(const string& font_name,
|
||||
vector<bool>* unichar_bitmap) {
|
||||
PangoFontInfo font_info(font_name);
|
||||
PangoCoverage* coverage = pango_font_get_coverage(
|
||||
font_info.ToPangoFont(), NULL);
|
||||
CharCoverageMapToBitmap(coverage, unichar_bitmap);
|
||||
}
|
||||
|
||||
/* static */
|
||||
void FontUtils::GetAllRenderableCharacters(const vector<string>& fonts,
|
||||
vector<bool>* unichar_bitmap) {
|
||||
// Form the union of coverage maps from the fonts
|
||||
PangoCoverage* all_coverage = pango_coverage_new();
|
||||
tlog(1, "Processing %d fonts\n", fonts.size());
|
||||
for (int i = 0; i < fonts.size(); ++i) {
|
||||
PangoFontInfo font_info(fonts[i]);
|
||||
PangoCoverage* coverage = pango_font_get_coverage(
|
||||
font_info.ToPangoFont(), NULL);
|
||||
// Mark off characters that any font can render.
|
||||
pango_coverage_max(all_coverage, coverage);
|
||||
}
|
||||
CharCoverageMapToBitmap(all_coverage, unichar_bitmap);
|
||||
pango_coverage_unref(all_coverage);
|
||||
}
|
||||
|
||||
|
||||
// Utilities written to be backward compatible with StringRender
|
||||
|
||||
/* static */
|
||||
int FontUtils::FontScore(const unordered_map<char32, inT64>& ch_map,
|
||||
const string& fontname,
|
||||
int* raw_score,
|
||||
vector<bool>* ch_flags) {
|
||||
PangoFontInfo font_info;
|
||||
if (!font_info.ParseFontDescriptionName(fontname)) {
|
||||
tprintf("ERROR: Could not parse %s\n", fontname.c_str());
|
||||
}
|
||||
PangoFont* font = font_info.ToPangoFont();
|
||||
PangoCoverage* coverage = pango_font_get_coverage(font, NULL);
|
||||
|
||||
if (ch_flags) {
|
||||
ch_flags->clear();
|
||||
ch_flags->reserve(ch_map.size());
|
||||
}
|
||||
*raw_score = 0;
|
||||
int ok_chars = 0;
|
||||
for (unordered_map<char32, inT64>::const_iterator it = ch_map.begin();
|
||||
it != ch_map.end(); ++it) {
|
||||
bool covered = (IsWhitespace(it->first) ||
|
||||
(pango_coverage_get(coverage, it->first)
|
||||
== PANGO_COVERAGE_EXACT));
|
||||
if (covered) {
|
||||
++(*raw_score);
|
||||
ok_chars += it->second;
|
||||
}
|
||||
if (ch_flags) {
|
||||
ch_flags->push_back(covered);
|
||||
}
|
||||
}
|
||||
return ok_chars;
|
||||
}
|
||||
|
||||
|
||||
/* static */
|
||||
string FontUtils::BestFonts(const unordered_map<char32, inT64>& ch_map,
|
||||
vector<pair<const char*, vector<bool> > >* fonts) {
|
||||
const double kMinOKFraction = 0.99;
|
||||
// Weighted fraction of characters that must be renderable in a font to make
|
||||
// it OK even if the raw count is not good.
|
||||
const double kMinWeightedFraction = 0.99995;
|
||||
|
||||
fonts->clear();
|
||||
vector<vector<bool> > font_flags;
|
||||
vector<int> font_scores;
|
||||
vector<int> raw_scores;
|
||||
int most_ok_chars = 0;
|
||||
int best_raw_score = 0;
|
||||
const vector<string>& font_names = FontUtils::ListAvailableFonts();
|
||||
for (int i = 0; i < font_names.size(); ++i) {
|
||||
vector<bool> ch_flags;
|
||||
int raw_score = 0;
|
||||
int ok_chars = FontScore(ch_map, font_names[i], &raw_score, &ch_flags);
|
||||
most_ok_chars = MAX(ok_chars, most_ok_chars);
|
||||
best_raw_score = MAX(raw_score, best_raw_score);
|
||||
|
||||
font_flags.push_back(ch_flags);
|
||||
font_scores.push_back(ok_chars);
|
||||
raw_scores.push_back(raw_score);
|
||||
}
|
||||
|
||||
// Now select the fonts with a score above a threshold fraction
|
||||
// of both the raw and weighted best scores. To prevent bogus fonts being
|
||||
// selected for CJK, we require a high fraction (kMinOKFraction = 0.99) of
|
||||
// BOTH weighted and raw scores.
|
||||
// In low character-count scripts, the issue is more getting enough fonts,
|
||||
// when only 1 or 2 might have all those rare dingbats etc in them, so we
|
||||
// allow a font with a very high weighted (coverage) score
|
||||
// (kMinWeightedFraction = 0.99995) to be used even if its raw score is poor.
|
||||
int least_good_enough = static_cast<int>(most_ok_chars * kMinOKFraction);
|
||||
int least_raw_enough = static_cast<int>(best_raw_score * kMinOKFraction);
|
||||
int override_enough = static_cast<int>(most_ok_chars * kMinWeightedFraction);
|
||||
|
||||
string font_list;
|
||||
for (int i = 0; i < font_names.size(); ++i) {
|
||||
int score = font_scores[i];
|
||||
int raw_score = raw_scores[i];
|
||||
if ((score >= least_good_enough && raw_score >= least_raw_enough) ||
|
||||
score >= override_enough) {
|
||||
fonts->push_back(make_pair(font_names[i].c_str(), font_flags[i]));
|
||||
tlog(1, "OK font %s = %.4f%%, raw = %d = %.2f%%\n",
|
||||
font_names[i].c_str(),
|
||||
100.0 * score / most_ok_chars,
|
||||
raw_score, 100.0 * raw_score / best_raw_score);
|
||||
font_list += font_names[i];
|
||||
font_list += "\n";
|
||||
} else if (score >= least_good_enough || raw_score >= least_raw_enough) {
|
||||
tlog(1, "Runner-up font %s = %.4f%%, raw = %d = %.2f%%\n",
|
||||
font_names[i].c_str(),
|
||||
100.0 * score / most_ok_chars,
|
||||
raw_score, 100.0 * raw_score / best_raw_score);
|
||||
}
|
||||
}
|
||||
return font_list;
|
||||
}
|
||||
|
||||
/* static */
|
||||
bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len,
|
||||
string* font_name, vector<string>* graphemes) {
|
||||
return SelectFont(utf8_word, utf8_len, ListAvailableFonts(), font_name,
|
||||
graphemes);
|
||||
}
|
||||
|
||||
/* static */
|
||||
bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len,
|
||||
const vector<string>& all_fonts,
|
||||
string* font_name, vector<string>* graphemes) {
|
||||
if (font_name) font_name->clear();
|
||||
if (graphemes) graphemes->clear();
|
||||
for (int i = 0; i < all_fonts.size(); ++i) {
|
||||
PangoFontInfo font;
|
||||
vector<string> found_graphemes;
|
||||
ASSERT_HOST_MSG(font.ParseFontDescriptionName(all_fonts[i]),
|
||||
"Could not parse font desc name %s\n",
|
||||
all_fonts[i].c_str());
|
||||
if (font.CanRenderString(utf8_word, utf8_len, &found_graphemes)) {
|
||||
if (graphemes) graphemes->swap(found_graphemes);
|
||||
if (font_name) *font_name = all_fonts[i];
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
187
training/pango_font_info.h
Normal file
187
training/pango_font_info.h
Normal file
@ -0,0 +1,187 @@
|
||||
/**********************************************************************
|
||||
* File: pango_font_info.h
|
||||
* Description: Font-related objects and helper functions
|
||||
* Author: Ranjith Unnikrishnan
|
||||
* Created: Mon Nov 18 2013
|
||||
*
|
||||
* (C) Copyright 2013, Google Inc.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef TESSERACT_TRAINING_PANGO_FONT_INFO_H_
|
||||
#define TESSERACT_TRAINING_PANGO_FONT_INFO_H_
|
||||
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "hashfn.h"
|
||||
#include "host.h"
|
||||
#include "util.h"
|
||||
#include "pango/pango-font.h"
|
||||
|
||||
typedef signed int char32;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Data holder class for a font, intented to avoid having to work with Pango or
|
||||
// FontConfig-specific objects directly.
|
||||
class PangoFontInfo {
|
||||
public:
|
||||
enum FontTypeEnum {
|
||||
UNKNOWN,
|
||||
SERIF,
|
||||
SANS_SERIF,
|
||||
DECORATIVE,
|
||||
};
|
||||
PangoFontInfo();
|
||||
// Initialize from parsing a font description name, defined as a string of the
|
||||
// format:
|
||||
// "FamilyName [FaceName] [PointSize]"
|
||||
// where a missing FaceName implies the default regular face.
|
||||
// eg. "Arial Italic 12", "Verdana"
|
||||
//
|
||||
// FaceName is a combination of:
|
||||
// [StyleName] [Variant] [Weight] [Stretch]
|
||||
// with (all optional) Pango-defined values of:
|
||||
// StyleName: Oblique, Italic
|
||||
// Variant : Small-Caps
|
||||
// Weight : Ultra-Light, Light, Medium, Semi-Bold, Bold, Ultra-Bold, Heavy
|
||||
// Stretch : Ultra-Condensed, Extra-Condensed, Condensed, Semi-Condensed,
|
||||
// Semi-Expanded, Expanded, Extra-Expanded, Ultra-Expanded.
|
||||
explicit PangoFontInfo(const string& name);
|
||||
bool ParseFontDescriptionName(const string& name);
|
||||
|
||||
// Returns true if the font have codepoint coverage for the specified text.
|
||||
bool CoversUTF8Text(const char* utf8_text, int byte_length) const;
|
||||
// Modifies string to remove unicode points that are not covered by the
|
||||
// font. Returns the number of characters dropped.
|
||||
int DropUncoveredChars(string* utf8_text) const;
|
||||
|
||||
// Returns true if the entire string can be rendered by the font with full
|
||||
// character coverage and no unknown glyph or dotted-circle glyph
|
||||
// substitutions on encountering a badly formed unicode sequence.
|
||||
// If true, returns individual graphemes. Any whitespace characters in the
|
||||
// original string are also included in the list.
|
||||
bool CanRenderString(const char* utf8_word, int len,
|
||||
vector<string>* graphemes) const;
|
||||
bool CanRenderString(const char* utf8_word, int len) const;
|
||||
|
||||
// Retrieves the x_bearing and x_advance for the given utf8 character in the
|
||||
// font. Returns false if the glyph for the character could not be found in
|
||||
// the font.
|
||||
// Ref: http://freetype.sourceforge.net/freetype2/docs/glyphs/glyphs-3.html
|
||||
bool GetSpacingProperties(const string& utf8_char,
|
||||
int* x_bearing, int* x_advance) const;
|
||||
|
||||
// Accessors
|
||||
string DescriptionName() const;
|
||||
// Font Family name eg. "Arial"
|
||||
const string& family_name() const { return family_name_; }
|
||||
// Size in points (1/72"), rounded to the nearest integer.
|
||||
const int font_size() const { return font_size_; }
|
||||
const bool is_bold() const { return is_bold_; }
|
||||
const bool is_italic() const { return is_italic_; }
|
||||
const bool is_smallcaps() const { return is_smallcaps_; }
|
||||
const bool is_monospace() const { return is_monospace_; }
|
||||
const bool is_fraktur() const { return is_fraktur_; }
|
||||
const FontTypeEnum font_type() const { return font_type_; }
|
||||
|
||||
const int resolution() const { return resolution_; }
|
||||
void set_resolution(const int resolution) {
|
||||
resolution_ = resolution;
|
||||
}
|
||||
|
||||
private:
|
||||
friend class FontUtils;
|
||||
void Clear();
|
||||
bool ParseFontDescription(const PangoFontDescription* desc);
|
||||
// Returns the PangoFont structure corresponding to the closest available font
|
||||
// in the font map.
|
||||
PangoFont* ToPangoFont() const;
|
||||
|
||||
// Font properties set automatically from parsing the font description name.
|
||||
string family_name_;
|
||||
int font_size_;
|
||||
bool is_bold_;
|
||||
bool is_italic_;
|
||||
bool is_smallcaps_;
|
||||
bool is_monospace_;
|
||||
bool is_fraktur_;
|
||||
FontTypeEnum font_type_;
|
||||
// The Pango description that was used to initialize the instance.
|
||||
PangoFontDescription* desc_;
|
||||
// Default output resolution to assume for GetSpacingProperties() and any
|
||||
// other methods that returns pixel values.
|
||||
int resolution_;
|
||||
|
||||
private:
|
||||
PangoFontInfo(const PangoFontInfo&);
|
||||
void operator=(const PangoFontInfo&);
|
||||
};
|
||||
|
||||
// Static utility methods for querying font availability and font-selection
|
||||
// based on codepoint coverage.
|
||||
class FontUtils {
|
||||
public:
|
||||
// Returns true if the font of the given description name is available in the
|
||||
// target directory specified by --fonts_dir
|
||||
static bool IsAvailableFont(const char* font_desc);
|
||||
// Outputs description names of available fonts.
|
||||
static const vector<string>& ListAvailableFonts();
|
||||
|
||||
// Picks font among available fonts that covers and can render the given word,
|
||||
// and returns the font description name and the decomposition of the word to
|
||||
// graphemes. Returns false if no suitable font was found.
|
||||
static bool SelectFont(const char* utf8_word, const int utf8_len,
|
||||
string* font_name, vector<string>* graphemes);
|
||||
|
||||
// Picks font among all_fonts that covers and can render the given word,
|
||||
// and returns the font description name and the decomposition of the word to
|
||||
// graphemes. Returns false if no suitable font was found.
|
||||
static bool SelectFont(const char* utf8_word, const int utf8_len,
|
||||
const vector<string>& all_fonts,
|
||||
string* font_name, vector<string>* graphemes);
|
||||
|
||||
// Returns a bitmask where the value of true at index 'n' implies that unicode
|
||||
// value 'n' is renderable by at least one available font.
|
||||
static void GetAllRenderableCharacters(vector<bool>* unichar_bitmap);
|
||||
// Variant of the above function that inspects only the provided font names.
|
||||
static void GetAllRenderableCharacters(const vector<string>& font_names,
|
||||
vector<bool>* unichar_bitmap);
|
||||
static void GetAllRenderableCharacters(const string& font_name,
|
||||
vector<bool>* unichar_bitmap);
|
||||
|
||||
// NOTE: The following utilities were written to be backward compatible with
|
||||
// StringRender.
|
||||
|
||||
// BestFonts returns a font name and a bit vector of the characters it
|
||||
// can render for the fonts that score within some fraction of the best
|
||||
// font on the characters in the given hash map.
|
||||
// In the flags vector, each flag is set according to whether the
|
||||
// corresponding character (in order of iterating ch_map) can be rendered.
|
||||
// The return string is a list of the acceptable fonts that were used.
|
||||
static string BestFonts(const unordered_map<char32, inT64>& ch_map,
|
||||
vector<std::pair<const char*, vector<bool> > >* font_flag);
|
||||
|
||||
// FontScore returns the weighted renderability score of the given
|
||||
// hash map character table in the given font. The unweighted score
|
||||
// is also returned in raw_score.
|
||||
// The values in the bool vector ch_flags correspond to whether the
|
||||
// corresponding character (in order of iterating ch_map) can be rendered.
|
||||
static int FontScore(const unordered_map<char32, inT64>& ch_map,
|
||||
const string& fontname, int* raw_score,
|
||||
vector<bool>* ch_flags);
|
||||
};
|
||||
} // namespace tesseract
|
||||
|
||||
#endif // TESSERACT_TRAINING_PANGO_FONT_INFO_H_
|
807
training/stringrenderer.cpp
Normal file
807
training/stringrenderer.cpp
Normal file
@ -0,0 +1,807 @@
|
||||
/**********************************************************************
|
||||
* File: stringrenderer.cpp
|
||||
* Description: Class for rendering UTF-8 text to an image, and retrieving
|
||||
* bounding boxes around each grapheme cluster.
|
||||
* Author: Ranjith Unnikrishnan
|
||||
* Created: Mon Nov 18 2013
|
||||
*
|
||||
* (C) Copyright 2013, Google Inc.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include "stringrenderer.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <algorithm>
|
||||
#include <map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "allheaders.h" // from leptonica
|
||||
#include "boxchar.h"
|
||||
#include "ligature_table.h"
|
||||
#include "normstrngs.h"
|
||||
#include "pango/pango-font.h"
|
||||
#include "pango/pango-glyph-item.h"
|
||||
#include "tlog.h"
|
||||
#include "unichar.h"
|
||||
#include "unicode/uchar.h" // from libicu
|
||||
#include "util.h"
|
||||
|
||||
#ifndef USE_STD_NAMESPACE
|
||||
// Compatibility with pango 1.20.
|
||||
#include "pango/pango-glyph-item-private.h"
|
||||
#define pango_glyph_item_iter_init_start _pango_glyph_item_iter_init_start
|
||||
#define pango_glyph_item_iter_next_cluster _pango_glyph_item_iter_next_cluster
|
||||
#else
|
||||
using std::map;
|
||||
using std::max;
|
||||
using std::min;
|
||||
using std::swap;
|
||||
#endif
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
static const int kDefaultOutputResolution = 300;
|
||||
|
||||
// Word joiner (U+2060) inserted after letters in ngram mode, as per
|
||||
// recommendation in http://unicode.org/reports/tr14/ to avoid line-breaks at
|
||||
// hyphens and other non-alpha characters.
|
||||
static const char* kWordJoinerUTF8 = "\u2060";
|
||||
static const char32 kWordJoiner = 0x2060;
|
||||
|
||||
static bool IsCombiner(int ch) {
|
||||
const int char_type = u_charType(ch);
|
||||
return ((char_type == U_NON_SPACING_MARK) ||
|
||||
(char_type == U_ENCLOSING_MARK) ||
|
||||
(char_type == U_COMBINING_SPACING_MARK));
|
||||
}
|
||||
|
||||
// Returns the UTF-8 byte sequence for the single code point ch32, as produced
// by UNICHAR.
static string EncodeAsUTF8(const char32 ch32) {
  UNICHAR encoded(ch32);
  const char* utf8_bytes = encoded.utf8();
  const int num_bytes = encoded.utf8_len();
  return string(utf8_bytes, num_bytes);
}
|
||||
|
||||
|
||||
/* static */
|
||||
// Copies the pixels of a Cairo image surface into a newly created 32 bpp Pix
// of the same width and height.
//
// Returns NULL (after printing a message) if the surface is not in
// CAIRO_FORMAT_ARGB32, if its pixel buffer is unavailable, or if the Pix
// cannot be allocated. The caller owns the returned Pix.
Pix* CairoARGB32ToPixFormat(cairo_surface_t *surface) {
  if (cairo_image_surface_get_format(surface) != CAIRO_FORMAT_ARGB32) {
    printf("Unexpected surface format %d\n",
           cairo_image_surface_get_format(surface));
    return NULL;
  }
  // Cairo requires a flush before the raw pixel data is read directly.
  cairo_surface_flush(surface);
  const unsigned char* src = cairo_image_surface_get_data(surface);
  if (src == NULL) {
    printf("Cairo surface has no accessible pixel data\n");
    return NULL;
  }
  const int width = cairo_image_surface_get_width(surface);
  const int height = cairo_image_surface_get_height(surface);
  const int byte_stride = cairo_image_surface_get_stride(surface);
  Pix* pix = pixCreate(width, height, 32);
  if (pix == NULL) {
    printf("Failed to allocate a %d x %d pix\n", width, height);
    return NULL;
  }

  for (int i = 0; i < height; ++i) {
    // Each row is written one byte past the start of the destination row.
    // NOTE(review): presumably this shifts Cairo's ARGB byte layout into
    // Leptonica's RGBA layout on little-endian hosts -- confirm.
    // The last row copies one byte less so that the shifted write does not
    // run past the end of the destination buffer.
    memcpy(reinterpret_cast<unsigned char*>(pix->data + i * pix->wpl) + 1,
           src + i * byte_stride,
           byte_stride - ((i == height - 1) ? 1 : 0));
  }
  return pix;
}
|
||||
|
||||
// Creates a renderer for pages of page_width x page_height pixels using the
// font described by font_desc. Margins default to 50 pixels, the pen color to
// black and the output resolution to kDefaultOutputResolution.
StringRenderer::StringRenderer(const string& font_desc, int page_width,
                               int page_height) :
    page_width_(page_width), page_height_(page_height),
    h_margin_(50), v_margin_(50), char_spacing_(0), leading_(0),
    vertical_text_(false), gravity_hint_strong_(false),
    render_fullwidth_latin_(false), drop_uncovered_chars_(true),
    strip_unrenderable_words_(false), add_ligatures_(false),
    output_word_boxes_(false), surface_(NULL), cr_(NULL),
    layout_(NULL), start_box_(0), page_(0), box_padding_(0),
    total_chars_(0), font_index_(0), last_offset_(0) {
  pen_color_[0] = 0.0;
  pen_color_[1] = 0.0;
  pen_color_[2] = 0.0;
  page_boxes_ = NULL;
  // set_font() forwards resolution_ to font_, so resolution_ must hold a
  // defined value before it is called.
  resolution_ = kDefaultOutputResolution;
  set_font(font_desc);
  set_resolution(kDefaultOutputResolution);
}
|
||||
|
||||
// Parses desc into font_ and re-applies the current resolution to it.
// Returns the result of parsing the description.
bool StringRenderer::set_font(const string& desc) {
  const bool parsed_ok = font_.ParseFontDescriptionName(desc);
  // The resolution is applied whether or not parsing succeeded.
  font_.set_resolution(resolution_);
  return parsed_ok;
}
|
||||
|
||||
// Records the output resolution and passes the same value on to font_.
void StringRenderer::set_resolution(const int resolution) {
  font_.set_resolution(resolution);
  resolution_ = resolution;
}
|
||||
|
||||
// Releases the Pango/Cairo objects and all accumulated boxes.
StringRenderer::~StringRenderer() {
  FreePangoCairo();
  ClearBoxes();
}
|
||||
|
||||
void StringRenderer::InitPangoCairo() {
|
||||
FreePangoCairo();
|
||||
surface_ = cairo_image_surface_create(CAIRO_FORMAT_ARGB32, page_width_,
|
||||
page_height_);
|
||||
cr_ = cairo_create(surface_);
|
||||
layout_ = pango_cairo_create_layout(cr_);
|
||||
|
||||
if (vertical_text_) {
|
||||
PangoContext* context = pango_layout_get_context(layout_);
|
||||
pango_context_set_base_gravity(context, PANGO_GRAVITY_EAST);
|
||||
if (gravity_hint_strong_) {
|
||||
pango_context_set_gravity_hint(context, PANGO_GRAVITY_HINT_STRONG);
|
||||
}
|
||||
pango_layout_context_changed(layout_);
|
||||
}
|
||||
|
||||
SetLayoutProperties();
|
||||
}
|
||||
|
||||
void StringRenderer::SetLayoutProperties() {
|
||||
string font_desc = font_.DescriptionName();
|
||||
// Specify the font via a description name
|
||||
PangoFontDescription *desc =
|
||||
pango_font_description_from_string(font_desc.c_str());
|
||||
// Assign the font description to the layout
|
||||
pango_layout_set_font_description(layout_, desc);
|
||||
pango_font_description_free(desc); // free the description
|
||||
pango_cairo_context_set_resolution(pango_layout_get_context(layout_),
|
||||
resolution_);
|
||||
|
||||
int max_width = page_width_ - 2 * h_margin_;
|
||||
int max_height = page_height_ - 2 * v_margin_;
|
||||
tlog(3, "max_width = %d, max_height = %d\n", max_width, max_height);
|
||||
if (vertical_text_) {
|
||||
swap(max_width, max_height);
|
||||
}
|
||||
pango_layout_set_width(layout_, max_width * PANGO_SCALE);
|
||||
pango_layout_set_wrap(layout_, PANGO_WRAP_WORD);
|
||||
|
||||
// Adjust character spacing
|
||||
PangoAttrList* attr_list = pango_attr_list_new();
|
||||
if (char_spacing_) {
|
||||
PangoAttribute* spacing_attr = pango_attr_letter_spacing_new(
|
||||
static_cast<int>(char_spacing_ * PANGO_SCALE + 0.5));
|
||||
spacing_attr->start_index = 0;
|
||||
spacing_attr->end_index = static_cast<guint>(-1);
|
||||
pango_attr_list_change(attr_list, spacing_attr);
|
||||
}
|
||||
pango_layout_set_attributes(layout_, attr_list);
|
||||
pango_attr_list_unref(attr_list);
|
||||
// Adjust line spacing
|
||||
if (leading_) {
|
||||
pango_layout_set_spacing(layout_, leading_ * PANGO_SCALE);
|
||||
}
|
||||
}
|
||||
|
||||
void StringRenderer::FreePangoCairo() {
|
||||
if (layout_) {
|
||||
g_object_unref(layout_);
|
||||
layout_ = NULL;
|
||||
}
|
||||
if (cr_) {
|
||||
cairo_destroy(cr_);
|
||||
cr_ = NULL;
|
||||
}
|
||||
if (surface_) {
|
||||
cairo_surface_destroy(surface_);
|
||||
surface_ = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Returns offset in utf8 bytes to first page.
|
||||
int StringRenderer::FindFirstPageBreakOffset(const char* text,
|
||||
int text_length) {
|
||||
if (!text_length) return 0;
|
||||
const int max_height = (page_height_ - 2 * v_margin_);
|
||||
const int max_width = (page_width_ - 2 * h_margin_);
|
||||
const int max_layout_height = vertical_text_ ? max_width : max_height;
|
||||
|
||||
UNICHAR::const_iterator it = UNICHAR::begin(text, text_length);
|
||||
const UNICHAR::const_iterator it_end = UNICHAR::end(text, text_length);
|
||||
const int kMaxUnicodeBufLength = 15000;
|
||||
for (int i = 0; i < kMaxUnicodeBufLength && it != it_end; ++it, ++i);
|
||||
int buf_length = it.utf8_data() - text;
|
||||
tlog(1, "len = %d buf_len = %d\n", text_length, buf_length);
|
||||
pango_layout_set_text(layout_, text, buf_length);
|
||||
|
||||
PangoLayoutIter* line_iter = NULL;
|
||||
{ // Fontconfig caches some info here that is not freed before exit.
|
||||
DISABLE_HEAP_LEAK_CHECK;
|
||||
line_iter = pango_layout_get_iter(layout_);
|
||||
}
|
||||
bool first_page = true;
|
||||
int page_top = 0;
|
||||
int offset = buf_length;
|
||||
do {
|
||||
// Get bounding box of the current line
|
||||
PangoRectangle line_ink_rect;
|
||||
pango_layout_iter_get_line_extents(line_iter, &line_ink_rect, NULL);
|
||||
pango_extents_to_pixels(&line_ink_rect, NULL);
|
||||
PangoLayoutLine* line = pango_layout_iter_get_line_readonly(line_iter);
|
||||
if (first_page) {
|
||||
page_top = line_ink_rect.y;
|
||||
first_page = false;
|
||||
}
|
||||
int line_bottom = line_ink_rect.y + line_ink_rect.height;
|
||||
if (line_bottom - page_top > max_layout_height) {
|
||||
offset = line->start_index;
|
||||
tlog(1, "Found offset = %d\n", offset);
|
||||
break;
|
||||
}
|
||||
} while (pango_layout_iter_next_line(line_iter));
|
||||
pango_layout_iter_free(line_iter);
|
||||
return offset;
|
||||
}
|
||||
|
||||
// Returns a read-only reference to the BoxChars accumulated so far. The
// renderer keeps ownership of the pointed-to objects (they are deleted by
// ClearBoxes()).
const vector<BoxChar*>& StringRenderer::GetBoxes() const
{
  return boxchars_;
}
|
||||
|
||||
// Returns the renderer's array of page bounding boxes, which is NULL until it
// has been created. The renderer keeps ownership (it is destroyed by
// ClearBoxes()).
Boxa* StringRenderer::GetPageBoxes() const
{
  return page_boxes_;
}
|
||||
|
||||
// Rotates the boxes from start_box_ to the end of boxchars_ by rotation about
// the center of the page.
void StringRenderer::RotatePageBoxes(float rotation) {
  const int center_x = page_width_ / 2;
  const int center_y = page_height_ / 2;
  BoxChar::RotateBoxes(rotation, center_x, center_y,
                       start_box_, boxchars_.size(), &boxchars_);
}
|
||||
|
||||
|
||||
void StringRenderer::ClearBoxes() {
|
||||
for (int i = 0; i < boxchars_.size(); ++i)
|
||||
delete boxchars_[i];
|
||||
boxchars_.clear();
|
||||
boxaDestroy(&page_boxes_);
|
||||
}
|
||||
|
||||
// Writes all accumulated boxes to filename via
// BoxChar::WriteTesseractBoxFile, passing the page height along.
void StringRenderer::WriteAllBoxes(const string& filename) const
{
  BoxChar::WriteTesseractBoxFile(filename, page_height_, boxchars_);
}
|
||||
|
||||
// Returns cluster strings in logical order.
|
||||
bool StringRenderer::GetClusterStrings(vector<string>* cluster_text) {
|
||||
map<int, string> start_byte_to_text;
|
||||
PangoLayoutIter* run_iter = pango_layout_get_iter(layout_);
|
||||
const char* full_text = pango_layout_get_text(layout_);
|
||||
do {
|
||||
PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter);
|
||||
if (!run) {
|
||||
// End of line NULL run marker
|
||||
tlog(2, "Found end of line marker\n");
|
||||
continue;
|
||||
}
|
||||
PangoGlyphItemIter cluster_iter;
|
||||
gboolean have_cluster;
|
||||
for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter,
|
||||
run, full_text);
|
||||
have_cluster;
|
||||
have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
|
||||
const int start_byte_index = cluster_iter.start_index;
|
||||
const int end_byte_index = cluster_iter.end_index;
|
||||
string text = string(full_text + start_byte_index,
|
||||
end_byte_index - start_byte_index);
|
||||
if (IsUTF8Whitespace(text.c_str())) {
|
||||
tlog(2, "Found whitespace\n");
|
||||
text = " ";
|
||||
}
|
||||
tlog(2, "start_byte=%d end_byte=%d : '%s'\n", start_byte_index,
|
||||
end_byte_index, text.c_str());
|
||||
if (add_ligatures_) {
|
||||
// Make sure the output box files have ligatured text in case the font
|
||||
// decided to use an unmapped glyph.
|
||||
text = LigatureTable::Get()->AddLigatures(text, NULL);
|
||||
}
|
||||
start_byte_to_text[start_byte_index] = text;
|
||||
}
|
||||
} while (pango_layout_iter_next_run(run_iter));
|
||||
pango_layout_iter_free(run_iter);
|
||||
|
||||
cluster_text->clear();
|
||||
for (map<int, string>::const_iterator it = start_byte_to_text.begin();
|
||||
it != start_byte_to_text.end(); ++it) {
|
||||
cluster_text->push_back(it->second);
|
||||
}
|
||||
return cluster_text->size();
|
||||
}
|
||||
|
||||
// Merges an array of BoxChars into words based on the identification of
|
||||
// BoxChars containing the space character as inter-word separators.
|
||||
//
|
||||
// Sometime two adjacent characters in the sequence may be detected as lying on
|
||||
// different lines based on their spatial positions. This may be the result of a
|
||||
// newline character at end of the last word on a line in the source text, or of
|
||||
// a discretionary line-break created by Pango at intra-word locations like
|
||||
// hyphens. When this is detected the word is split at that location into
|
||||
// multiple BoxChars. Otherwise, each resulting BoxChar will contain a word and
|
||||
// its bounding box.
|
||||
static void MergeBoxCharsToWords(vector<BoxChar*>* boxchars) {
|
||||
vector<BoxChar*> result;
|
||||
bool started_word = false;
|
||||
for (int i = 0; i < boxchars->size(); ++i) {
|
||||
if (boxchars->at(i)->ch() == " " ||
|
||||
boxchars->at(i)->box() == NULL) {
|
||||
result.push_back(boxchars->at(i));
|
||||
boxchars->at(i) = NULL;
|
||||
started_word = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!started_word) {
|
||||
// Begin new word
|
||||
started_word = true;
|
||||
result.push_back(boxchars->at(i));
|
||||
boxchars->at(i) = NULL;
|
||||
} else {
|
||||
BoxChar* last_boxchar = result.back();
|
||||
// Compute bounding box union
|
||||
const Box* box = boxchars->at(i)->box();
|
||||
Box* last_box = last_boxchar->mutable_box();
|
||||
int left = min(last_box->x, box->x);
|
||||
int right = max(last_box->x + last_box->w, box->x + box->w);
|
||||
int top = min(last_box->y, box->y);
|
||||
int bottom = max(last_box->y + last_box->h, box->y + box->h);
|
||||
// Conclude that the word was broken to span multiple lines based on the
|
||||
// size of the merged bounding box in relation to those of the individual
|
||||
// characters seen so far.
|
||||
if (right - left > last_box->w + 5 * box->w) {
|
||||
tlog(1, "Found line break after '%s'", last_boxchar->ch().c_str());
|
||||
// Insert a fake interword space and start a new word with the current
|
||||
// boxchar.
|
||||
result.push_back(new BoxChar(" ", 1));
|
||||
result.push_back(boxchars->at(i));
|
||||
boxchars->at(i) = NULL;
|
||||
continue;
|
||||
}
|
||||
// Append to last word
|
||||
last_boxchar->mutable_ch()->append(boxchars->at(i)->ch());
|
||||
last_box->x = left;
|
||||
last_box->w = right - left;
|
||||
last_box->y = top;
|
||||
last_box->h = bottom - top;
|
||||
delete boxchars->at(i);
|
||||
boxchars->at(i) = NULL;
|
||||
}
|
||||
}
|
||||
boxchars->swap(result);
|
||||
}
|
||||
|
||||
|
||||
void StringRenderer::ComputeClusterBoxes() {
|
||||
const char* text = pango_layout_get_text(layout_);
|
||||
PangoLayoutIter* cluster_iter = pango_layout_get_iter(layout_);
|
||||
|
||||
// Do a first pass to store cluster start indexes.
|
||||
vector<int> cluster_start_indices;
|
||||
do {
|
||||
cluster_start_indices.push_back(pango_layout_iter_get_index(cluster_iter));
|
||||
tlog(3, "Added %d\n", cluster_start_indices.back());
|
||||
} while (pango_layout_iter_next_cluster(cluster_iter));
|
||||
pango_layout_iter_free(cluster_iter);
|
||||
cluster_start_indices.push_back(strlen(text));
|
||||
tlog(3, "Added last index %d\n", cluster_start_indices.back());
|
||||
// Sort the indices and create a map from start to end indices.
|
||||
sort(cluster_start_indices.begin(), cluster_start_indices.end());
|
||||
map<int, int> cluster_start_to_end_index;
|
||||
for (int i = 0; i < cluster_start_indices.size() - 1; ++i) {
|
||||
cluster_start_to_end_index[cluster_start_indices[i]]
|
||||
= cluster_start_indices[i + 1];
|
||||
}
|
||||
|
||||
// Iterate again to compute cluster boxes and their text with the obtained
|
||||
// cluster extent information.
|
||||
cluster_iter = pango_layout_get_iter(layout_);
|
||||
// Store BoxChars* sorted by their byte start positions
|
||||
map<int, BoxChar*> start_byte_to_box;
|
||||
do {
|
||||
PangoRectangle cluster_rect;
|
||||
pango_layout_iter_get_cluster_extents(cluster_iter, &cluster_rect,
|
||||
NULL);
|
||||
pango_extents_to_pixels(&cluster_rect, NULL);
|
||||
const int start_byte_index = pango_layout_iter_get_index(cluster_iter);
|
||||
const int end_byte_index = cluster_start_to_end_index[start_byte_index];
|
||||
string cluster_text = string(text + start_byte_index,
|
||||
end_byte_index - start_byte_index);
|
||||
if (cluster_text.size() && cluster_text[0] == '\n') {
|
||||
tlog(2, "Skipping newlines at start of text.\n");
|
||||
continue;
|
||||
}
|
||||
if (!cluster_rect.width || !cluster_rect.height ||
|
||||
IsUTF8Whitespace(cluster_text.c_str())) {
|
||||
tlog(2, "Skipping whitespace with boxdim (%d,%d) '%s'\n",
|
||||
cluster_rect.width, cluster_rect.height, cluster_text.c_str());
|
||||
BoxChar* boxchar = new BoxChar(" ", 1);
|
||||
boxchar->set_page(page_);
|
||||
start_byte_to_box[start_byte_index] = boxchar;
|
||||
continue;
|
||||
}
|
||||
// Prepare a boxchar for addition at this byte position.
|
||||
tlog(2, "[%d %d], %d, %d : start_byte=%d end_byte=%d : '%s'\n",
|
||||
cluster_rect.x, cluster_rect.y,
|
||||
cluster_rect.width, cluster_rect.height,
|
||||
start_byte_index, end_byte_index,
|
||||
cluster_text.c_str());
|
||||
ASSERT_HOST_MSG(cluster_rect.width,
|
||||
"cluster_text:%s start_byte_index:%d\n",
|
||||
cluster_text.c_str(), start_byte_index);
|
||||
ASSERT_HOST_MSG(cluster_rect.height,
|
||||
"cluster_text:%s start_byte_index:%d\n",
|
||||
cluster_text.c_str(), start_byte_index);
|
||||
if (box_padding_) {
|
||||
cluster_rect.x = max(0, cluster_rect.x - box_padding_);
|
||||
cluster_rect.width += 2 * box_padding_;
|
||||
cluster_rect.y = max(0, cluster_rect.y - box_padding_);
|
||||
cluster_rect.height += 2 * box_padding_;
|
||||
}
|
||||
if (add_ligatures_) {
|
||||
// Make sure the output box files have ligatured text in case the font
|
||||
// decided to use an unmapped glyph.
|
||||
cluster_text = LigatureTable::Get()->AddLigatures(cluster_text, NULL);
|
||||
}
|
||||
BoxChar* boxchar = new BoxChar(cluster_text.c_str(), cluster_text.size());
|
||||
boxchar->set_page(page_);
|
||||
boxchar->AddBox(cluster_rect.x, cluster_rect.y,
|
||||
cluster_rect.width, cluster_rect.height);
|
||||
start_byte_to_box[start_byte_index] = boxchar;
|
||||
} while (pango_layout_iter_next_cluster(cluster_iter));
|
||||
pango_layout_iter_free(cluster_iter);
|
||||
|
||||
// There is a subtle bug in the cluster text reported by the PangoLayoutIter
|
||||
// on ligatured characters (eg. The word "Lam-Aliph" in arabic). To work
|
||||
// around this, we use text reported using the PangoGlyphIter which is
|
||||
// accurate.
|
||||
// TODO(ranjith): Revisit whether this is still needed in newer versions of
|
||||
// pango.
|
||||
vector<string> cluster_text;
|
||||
if (GetClusterStrings(&cluster_text)) {
|
||||
ASSERT_HOST(cluster_text.size() == start_byte_to_box.size());
|
||||
int ind = 0;
|
||||
for (map<int, BoxChar*>::iterator it = start_byte_to_box.begin();
|
||||
it != start_byte_to_box.end(); ++it, ++ind) {
|
||||
it->second->mutable_ch()->swap(cluster_text[ind]);
|
||||
}
|
||||
}
|
||||
|
||||
// Append to the boxchars list in byte order.
|
||||
vector<BoxChar*> page_boxchars;
|
||||
page_boxchars.reserve(start_byte_to_box.size());
|
||||
string last_ch;
|
||||
for (map<int, BoxChar*>::const_iterator it = start_byte_to_box.begin();
|
||||
it != start_byte_to_box.end(); ++it) {
|
||||
if (it->second->ch() == kWordJoinerUTF8) {
|
||||
// Skip zero-width joiner characters (ZWJs) here.
|
||||
delete it->second;
|
||||
} else {
|
||||
page_boxchars.push_back(it->second);
|
||||
}
|
||||
}
|
||||
CorrectBoxPositionsToLayout(&page_boxchars);
|
||||
|
||||
if (render_fullwidth_latin_) {
|
||||
for (map<int, BoxChar*>::iterator it = start_byte_to_box.begin();
|
||||
it != start_byte_to_box.end(); ++it) {
|
||||
// Convert fullwidth Latin characters to their halfwidth forms.
|
||||
string half(ConvertFullwidthLatinToBasicLatin(it->second->ch()));
|
||||
it->second->mutable_ch()->swap(half);
|
||||
}
|
||||
}
|
||||
|
||||
// Merge the character boxes into word boxes if we are rendering n-grams.
|
||||
if (output_word_boxes_) {
|
||||
MergeBoxCharsToWords(&page_boxchars);
|
||||
}
|
||||
|
||||
boxchars_.insert(boxchars_.end(), page_boxchars.begin(), page_boxchars.end());
|
||||
|
||||
// Compute the page bounding box
|
||||
Box* page_box = NULL;
|
||||
Boxa* all_boxes = NULL;
|
||||
for (int i = 0; i < page_boxchars.size(); ++i) {
|
||||
if (page_boxchars[i]->box() == NULL) continue;
|
||||
if (all_boxes == NULL)
|
||||
all_boxes = boxaCreate(0);
|
||||
boxaAddBox(all_boxes, page_boxchars[i]->mutable_box(), L_CLONE);
|
||||
}
|
||||
boxaGetExtent(all_boxes, NULL, NULL, &page_box);
|
||||
boxaDestroy(&all_boxes);
|
||||
if (page_boxes_ == NULL)
|
||||
page_boxes_ = boxaCreate(0);
|
||||
boxaAddBox(page_boxes_, page_box, L_INSERT);
|
||||
}
|
||||
|
||||
|
||||
// Moves the boxes from layout coordinates to page coordinates. Horizontal
// text is shifted by the margins. Vertical text is shifted to the top-right
// margin corner and then rotated about that corner by the negated rotation of
// the layout's base gravity.
void StringRenderer::CorrectBoxPositionsToLayout(vector<BoxChar*>* boxchars) {
  if (!vertical_text_) {
    BoxChar::TranslateBoxes(h_margin_, v_margin_, boxchars);
    return;
  }
  PangoContext* pango_context = pango_layout_get_context(layout_);
  const double rotation = - pango_gravity_to_rotation(
      pango_context_get_base_gravity(pango_context));
  BoxChar::TranslateBoxes(page_width_ - h_margin_, v_margin_, boxchars);
  BoxChar::RotateBoxes(rotation, page_width_ - h_margin_, v_margin_,
                       0, boxchars->size(), boxchars);
}
|
||||
|
||||
// Removes from *utf8_text every whitespace-delimited word that font_ cannot
// render, keeping all whitespace as is. Returns the number of words removed
// and reports it via tprintf when non-zero.
int StringRenderer::StripUnrenderableWords(string* utf8_text) const {
  string kept_text;
  const char* input = utf8_text->c_str();
  int pos = 0;
  int dropped_words = 0;
  while (pos < utf8_text->length()) {
    // Copy the whitespace run in front of the next word.
    const int space_len = SpanUTF8Whitespace(input + pos);
    kept_text.append(input + pos, space_len);
    pos += space_len;
    if (pos == utf8_text->length()) break;

    // Keep the word only if the font can render it.
    const int word_len = SpanUTF8NotWhitespace(input + pos);
    if (!font_.CanRenderString(input + pos, word_len)) {
      ++dropped_words;
    } else {
      kept_text.append(input + pos, word_len);
    }
    pos += word_len;
  }
  utf8_text->swap(kept_text);

  if (dropped_words > 0) {
    tprintf("Stripped %d unrenderable words\n", dropped_words);
  }
  return dropped_words;
}
|
||||
|
||||
int StringRenderer::RenderToBinaryImage(const char* text, int text_length,
|
||||
int threshold, Pix** pix) {
|
||||
Pix *orig_pix = NULL;
|
||||
int offset = RenderToImage(text, text_length, &orig_pix);
|
||||
if (orig_pix) {
|
||||
Pix* gray_pix = pixConvertTo8(orig_pix, false);
|
||||
pixDestroy(&orig_pix);
|
||||
*pix = pixThresholdToBinary(gray_pix, threshold);
|
||||
pixDestroy(&gray_pix);
|
||||
} else {
|
||||
*pix = orig_pix;
|
||||
}
|
||||
return offset;
|
||||
}
|
||||
|
||||
// Add word joiner (WJ) characters between adjacent non-space characters except
|
||||
// immediately before a combiner.
|
||||
/* static */
|
||||
string StringRenderer::InsertWordJoiners(const string& text) {
|
||||
string out_str;
|
||||
const UNICHAR::const_iterator it_end = UNICHAR::end(text.c_str(),
|
||||
text.length());
|
||||
for (UNICHAR::const_iterator it = UNICHAR::begin(text.c_str(), text.length());
|
||||
it != it_end; ++it) {
|
||||
// Add the symbol to the output string.
|
||||
out_str.append(it.utf8_data(), it.utf8_len());
|
||||
// Check the next symbol.
|
||||
UNICHAR::const_iterator next_it = it;
|
||||
++next_it;
|
||||
bool next_char_is_boundary = (next_it == it_end || *next_it == ' ');
|
||||
bool next_char_is_combiner = (next_it == it_end) ?
|
||||
false : IsCombiner(*next_it);
|
||||
if (*it != ' ' && *it != '\n' && !next_char_is_boundary &&
|
||||
!next_char_is_combiner) {
|
||||
out_str += kWordJoinerUTF8;
|
||||
}
|
||||
}
|
||||
return out_str;
|
||||
}
|
||||
|
||||
// Convert halfwidth Basic Latin characters to their fullwidth forms.
|
||||
string StringRenderer::ConvertBasicLatinToFullwidthLatin(const string& str) {
|
||||
string full_str;
|
||||
const UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(),
|
||||
str.length());
|
||||
for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length());
|
||||
it != it_end; ++it) {
|
||||
// Convert printable and non-space 7-bit ASCII characters to
|
||||
// their fullwidth forms.
|
||||
if (IsInterchangeValid7BitAscii(*it) && isprint(*it) && !isspace(*it)) {
|
||||
// Convert by adding 0xFEE0 to the codepoint of 7-bit ASCII.
|
||||
char32 full_char = *it + 0xFEE0;
|
||||
full_str.append(EncodeAsUTF8(full_char));
|
||||
} else {
|
||||
full_str.append(it.utf8_data(), it.utf8_len());
|
||||
}
|
||||
}
|
||||
return full_str;
|
||||
}
|
||||
|
||||
// Convert fullwidth Latin characters to their halfwidth forms.
|
||||
string StringRenderer::ConvertFullwidthLatinToBasicLatin(const string& str) {
|
||||
string half_str;
|
||||
UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
|
||||
for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length());
|
||||
it != it_end; ++it) {
|
||||
char32 half_char = FullwidthToHalfwidth(*it);
|
||||
// Convert fullwidth Latin characters to their halfwidth forms
|
||||
// only if halfwidth forms are printable and non-space 7-bit ASCII.
|
||||
if (IsInterchangeValid7BitAscii(half_char) &&
|
||||
isprint(half_char) && !isspace(half_char)) {
|
||||
half_str.append(EncodeAsUTF8(half_char));
|
||||
} else {
|
||||
half_str.append(it.utf8_data(), it.utf8_len());
|
||||
}
|
||||
}
|
||||
return half_str;
|
||||
}
|
||||
|
||||
// Returns offset to end of text substring rendered in this method.
|
||||
int StringRenderer::RenderToImage(const char* text, int text_length,
|
||||
Pix** pix) {
|
||||
if (pix && *pix) pixDestroy(pix);
|
||||
InitPangoCairo();
|
||||
|
||||
const int page_offset = FindFirstPageBreakOffset(text, text_length);
|
||||
if (!page_offset) {
|
||||
return 0;
|
||||
}
|
||||
start_box_ = boxchars_.size();
|
||||
|
||||
if (!vertical_text_) {
|
||||
// Translate by the specified margin
|
||||
cairo_translate(cr_, h_margin_, v_margin_);
|
||||
} else {
|
||||
// Vertical text rendering is achieved by a two-step process of first
|
||||
// performing regular horizontal layout with character orientation set to
|
||||
// EAST, and then translating and rotating the layout before rendering onto
|
||||
// the desired image surface. The settings required for the former step are
|
||||
// done within InitPangoCairo().
|
||||
//
|
||||
// Translate to the top-right margin of page
|
||||
cairo_translate(cr_, page_width_ - h_margin_, v_margin_);
|
||||
// Rotate the layout
|
||||
double rotation = - pango_gravity_to_rotation(
|
||||
pango_context_get_base_gravity(pango_layout_get_context(layout_)));
|
||||
tlog(2, "Rotating by %f radians\n", rotation);
|
||||
cairo_rotate(cr_, rotation);
|
||||
pango_cairo_update_layout(cr_, layout_);
|
||||
}
|
||||
string page_text(text, page_offset);
|
||||
if (render_fullwidth_latin_) {
|
||||
// Convert Basic Latin to their fullwidth forms.
|
||||
page_text = ConvertBasicLatinToFullwidthLatin(page_text);
|
||||
}
|
||||
if (strip_unrenderable_words_) {
|
||||
StripUnrenderableWords(&page_text);
|
||||
}
|
||||
if (drop_uncovered_chars_ &&
|
||||
!font_.CoversUTF8Text(page_text.c_str(), page_text.length())) {
|
||||
int num_dropped = font_.DropUncoveredChars(&page_text);
|
||||
if (num_dropped) {
|
||||
tprintf("WARNING: Dropped %d uncovered characters\n", num_dropped);
|
||||
}
|
||||
}
|
||||
if (add_ligatures_) {
|
||||
// Add ligatures wherever possible, including custom ligatures.
|
||||
page_text = LigatureTable::Get()->AddLigatures(page_text, &font_);
|
||||
}
|
||||
|
||||
pango_layout_set_text(layout_, page_text.c_str(), page_text.length());
|
||||
|
||||
if (pix) {
|
||||
// Set a white background for the target image surface.
|
||||
cairo_set_source_rgb(cr_, 1.0, 1.0, 1.0); // sets drawing colour to white
|
||||
// Fill the surface with the active colour (if you don't do this, you will
|
||||
// be given a surface with a transparent background to draw on)
|
||||
cairo_paint(cr_);
|
||||
// Set the ink color to black
|
||||
cairo_set_source_rgb(cr_, pen_color_[0], pen_color_[1], pen_color_[2]);
|
||||
// If the target surface or transformation properties of the cairo instance
|
||||
// have changed, update the pango layout to reflect this
|
||||
pango_cairo_update_layout(cr_, layout_);
|
||||
// Draw the pango layout onto the cairo surface
|
||||
pango_cairo_show_layout(cr_, layout_);
|
||||
*pix = CairoARGB32ToPixFormat(surface_);
|
||||
}
|
||||
ComputeClusterBoxes();
|
||||
FreePangoCairo();
|
||||
// Update internal state variables.
|
||||
++page_;
|
||||
return page_offset;
|
||||
}
|
||||
|
||||
// Render a string to an image, returning it as an 8 bit pix. Behaves as
|
||||
// RenderString, except that it ignores the font set at construction and works
|
||||
// through all the fonts, returning 0 until they are exhausted, at which point
|
||||
// it returns the value it should have returned all along, but no pix this time.
|
||||
// Fonts that don't contain a large proportion of the characters in the string
|
||||
// get skipped.
|
||||
// Fonts that work each get rendered and the font name gets added
|
||||
// to the image.
|
||||
// NOTE that no boxes are produced by this function.
|
||||
//
|
||||
// Example usage: To render a null terminated char-array "txt"
|
||||
//
|
||||
// int offset = 0;
|
||||
// do {
|
||||
// Pix *pix;
|
||||
// offset += renderer.RenderAllFontsToImage(txt + offset,
|
||||
// strlen(txt + offset), NULL, &pix);
|
||||
// ...
|
||||
//   } while (offset < strlen(txt));
|
||||
//
|
||||
int StringRenderer::RenderAllFontsToImage(const char* text, int text_length,
|
||||
string* font_used, Pix** image) {
|
||||
// Select a suitable font to render the title with.
|
||||
const char kTitleTemplate[] = "%s : %d hits = %.2f%%, raw = %d = %.2f%%";
|
||||
string title_font;
|
||||
if (!FontUtils::SelectFont(kTitleTemplate, strlen(kTitleTemplate),
|
||||
&title_font, NULL)) {
|
||||
tprintf("WARNING: Could not find a font to render image title with!\n");
|
||||
title_font = "Arial";
|
||||
}
|
||||
title_font += " 8";
|
||||
tlog(1, "Selected title font: %s\n", title_font.c_str());
|
||||
if (font_used) font_used->clear();
|
||||
|
||||
string orig_font = font_.DescriptionName();
|
||||
if (char_map_.empty()) {
|
||||
total_chars_ = 0;
|
||||
// Fill the hash table and use that for computing which fonts to use.
|
||||
for (UNICHAR::const_iterator it = UNICHAR::begin(text, text_length);
|
||||
it != UNICHAR::end(text, text_length); ++it) {
|
||||
++total_chars_;
|
||||
++char_map_[*it];
|
||||
}
|
||||
tprintf("Total chars = %d\n", total_chars_);
|
||||
}
|
||||
const vector<string>& all_fonts = FontUtils::ListAvailableFonts();
|
||||
for (int i = font_index_; i < all_fonts.size(); ++i) {
|
||||
++font_index_;
|
||||
int raw_score = 0;
|
||||
int ok_chars = FontUtils::FontScore(char_map_, all_fonts[i], &raw_score,
|
||||
NULL);
|
||||
if (ok_chars > 0 && ok_chars == total_chars_) {
|
||||
set_font(all_fonts[i]);
|
||||
int offset = RenderToBinaryImage(text, text_length, 128, image);
|
||||
ClearBoxes(); // Get rid of them as they are garbage.
|
||||
const int kMaxTitleLength = 1024;
|
||||
char title[kMaxTitleLength];
|
||||
snprintf(title, kMaxTitleLength, kTitleTemplate,
|
||||
all_fonts[i].c_str(), ok_chars,
|
||||
100.0 * ok_chars / total_chars_, raw_score,
|
||||
100.0 * raw_score / char_map_.size());
|
||||
tprintf("%s\n", title);
|
||||
// This is a good font! Store the offset to return once we've tried all
|
||||
// the fonts.
|
||||
if (offset) {
|
||||
last_offset_ = offset;
|
||||
if (font_used) *font_used = all_fonts[i];
|
||||
}
|
||||
// Add the font to the image.
|
||||
set_font(title_font);
|
||||
v_margin_ /= 8;
|
||||
Pix* title_image = NULL;
|
||||
RenderToBinaryImage(title, strlen(title), 128, &title_image);
|
||||
pixOr(*image, *image, title_image);
|
||||
pixDestroy(&title_image);
|
||||
|
||||
v_margin_ *= 8;
|
||||
set_font(orig_font);
|
||||
// We return the real offset only after cycling through the list of fonts.
|
||||
return 0;
|
||||
} else {
|
||||
tprintf("Font %s failed with %d hits = %.2f%%\n",
|
||||
all_fonts[i].c_str(), ok_chars, 100.0 * ok_chars / total_chars_);
|
||||
}
|
||||
}
|
||||
*image = NULL;
|
||||
font_index_ = 0;
|
||||
char_map_.clear();
|
||||
return last_offset_;
|
||||
}
|
||||
|
||||
} // namespace tesseract
|
203
training/stringrenderer.h
Normal file
203
training/stringrenderer.h
Normal file
@ -0,0 +1,203 @@
|
||||
/**********************************************************************
|
||||
* File: stringrenderer.h
|
||||
* Description: Class for rendering UTF-8 text to an image, and retrieving
|
||||
* bounding boxes around each grapheme cluster.
|
||||
*
|
||||
* Instances are created using a font description string
|
||||
* (eg. "Arial Italic 12"; see pango_font_info.h for the format)
|
||||
* and the page dimensions. Other renderer properties such as
|
||||
* spacing, ligaturization, as well as preprocessing behavior such
|
||||
* as removal of unrenderable words and a special n-gram mode may
|
||||
* be set using respective set_* methods.
|
||||
*
|
||||
* Author: Ranjith Unnikrishnan
|
||||
* Created: Mon Nov 18 2013
|
||||
*
|
||||
* (C) Copyright 2013, Google Inc.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef TESSERACT_TRAINING_STRINGRENDERER_H_
|
||||
#define TESSERACT_TRAINING_STRINGRENDERER_H_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "hashfn.h"
|
||||
#include "host.h"
|
||||
#include "pango_font_info.h"
|
||||
#include "pango/pango-layout.h"
|
||||
#include "pango/pangocairo.h"
|
||||
|
||||
struct Boxa;
|
||||
struct Pix;
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class BoxChar;
|
||||
|
||||
class StringRenderer {
|
||||
public:
|
||||
StringRenderer(const string& font_desc, int page_width, int page_height);
|
||||
~StringRenderer();
|
||||
|
||||
// Renders the text with the chosen font and returns the byte offset upto
|
||||
// which the text could be rendered so as to fit the specified page
|
||||
// dimensions.
|
||||
int RenderToImage(const char* text, int text_length, Pix** pix);
|
||||
int RenderToBinaryImage(const char* text, int text_length, int threshold,
|
||||
Pix** pix);
|
||||
// Renders a line of text with all available fonts that were able to render
|
||||
// the text.
|
||||
int RenderAllFontsToImage(const char* text, int text_length,
|
||||
string* font_used, Pix** pix);
|
||||
|
||||
bool set_font(const string& desc);
|
||||
void set_char_spacing(double char_spacing) {
|
||||
char_spacing_ = char_spacing;
|
||||
}
|
||||
void set_leading(int leading) {
|
||||
leading_ = leading;
|
||||
}
|
||||
void set_resolution(const int resolution);
|
||||
void set_vertical_text(bool vertical_text) {
|
||||
vertical_text_ = vertical_text;
|
||||
}
|
||||
void set_gravity_hint_strong(bool gravity_hint_strong) {
|
||||
gravity_hint_strong_ = gravity_hint_strong;
|
||||
}
|
||||
void set_render_fullwidth_latin(bool render_fullwidth_latin) {
|
||||
render_fullwidth_latin_ = render_fullwidth_latin;
|
||||
}
|
||||
void set_page(int page) {
|
||||
page_ = page;
|
||||
}
|
||||
void set_box_padding(int val) {
|
||||
box_padding_ = val;
|
||||
}
|
||||
void set_drop_uncovered_chars(bool val) {
|
||||
drop_uncovered_chars_ = val;
|
||||
}
|
||||
void set_strip_unrenderable_words(bool val) {
|
||||
strip_unrenderable_words_ = val;
|
||||
}
|
||||
void set_output_word_boxes(bool val) {
|
||||
output_word_boxes_ = val;
|
||||
}
|
||||
// Before rendering the string, replace latin characters with their optional
|
||||
// ligatured forms (such as "fi", "ffi" etc.) if the font_ covers those
|
||||
// unicodes.
|
||||
void set_add_ligatures(bool add_ligatures) {
|
||||
add_ligatures_ = add_ligatures;
|
||||
}
|
||||
// Set the rgb value of the text ink. Values range in [0, 1.0]
|
||||
void set_pen_color(double r, double g, double b) {
|
||||
pen_color_[0] = r;
|
||||
pen_color_[1] = g;
|
||||
pen_color_[2] = b;
|
||||
}
|
||||
void set_h_margin(const int h_margin) {
|
||||
h_margin_ = h_margin;
|
||||
}
|
||||
void set_v_margin(const int v_margin) {
|
||||
v_margin_ = v_margin;
|
||||
}
|
||||
const PangoFontInfo& font() const {
|
||||
return font_;
|
||||
}
|
||||
const int h_margin() const {
|
||||
return h_margin_;
|
||||
}
|
||||
const int v_margin() const {
|
||||
return v_margin_;
|
||||
}
|
||||
|
||||
// Get the boxchars of all clusters rendered thus far (or since the last call
|
||||
// to ClearBoxes()).
|
||||
const vector<BoxChar*>& GetBoxes() const;
|
||||
// Get the rendered page bounding boxes of all pages created thus far (or
|
||||
// since last call to ClearBoxes()).
|
||||
Boxa* GetPageBoxes() const;
|
||||
|
||||
// Rotate the boxes on the most recent page by the given rotation.
|
||||
void RotatePageBoxes(float rotation);
|
||||
// Delete all boxes.
|
||||
void ClearBoxes();
|
||||
void WriteAllBoxes(const string& filename) const;
|
||||
// Removes space-delimited words from the string that are not renderable by
|
||||
// the current font and returns the count of such words.
|
||||
int StripUnrenderableWords(string* utf8_text) const;
|
||||
|
||||
// Insert a Word Joiner symbol (U+2060) between adjacent characters, excluding
|
||||
// spaces and combining types, in each word before rendering to ensure words
|
||||
// are not broken across lines. The output boxchars will not contain the
|
||||
// joiner.
|
||||
static string InsertWordJoiners(const string& text);
|
||||
|
||||
// Helper functions to convert fullwidth Latin and halfwidth Basic Latin.
|
||||
static string ConvertBasicLatinToFullwidthLatin(const string& text);
|
||||
static string ConvertFullwidthLatinToBasicLatin(const string& text);
|
||||
|
||||
protected:
|
||||
// Init and free local renderer objects.
|
||||
void InitPangoCairo();
|
||||
void SetLayoutProperties();
|
||||
void FreePangoCairo();
|
||||
// Compute bounding boxes around grapheme clusters.
|
||||
void ComputeClusterBoxes();
|
||||
void CorrectBoxPositionsToLayout(vector<BoxChar*>* boxchars);
|
||||
bool GetClusterStrings(vector<string>* cluster_text);
|
||||
int FindFirstPageBreakOffset(const char* text, int text_length);
|
||||
|
||||
PangoFontInfo font_;
|
||||
// Page properties
|
||||
int page_width_, page_height_, h_margin_, v_margin_;
|
||||
// Text rendering properties
|
||||
int pen_color_[3];
|
||||
double char_spacing_;
|
||||
int leading_, resolution_;
|
||||
bool vertical_text_;
|
||||
bool gravity_hint_strong_;
|
||||
bool render_fullwidth_latin_;
|
||||
// Text filtering options
|
||||
bool drop_uncovered_chars_;
|
||||
bool strip_unrenderable_words_;
|
||||
bool add_ligatures_;
|
||||
bool output_word_boxes_;
|
||||
// Pango and cairo specific objects
|
||||
cairo_surface_t* surface_;
|
||||
cairo_t* cr_;
|
||||
PangoLayout* layout_;
|
||||
// Internal state of current page number, updated on successive calls to
|
||||
// RenderToImage()
|
||||
int start_box_;
|
||||
int page_;
|
||||
// Boxes and associated text for all pages rendered with RenderToImage() since
|
||||
// the last call to ClearBoxes().
|
||||
vector<BoxChar*> boxchars_;
|
||||
int box_padding_;
|
||||
// Bounding boxes for pages since the last call to ClearBoxes().
|
||||
Boxa* page_boxes_;
|
||||
|
||||
// Objects cached for subsequent calls to RenderAllFontsToImage()
|
||||
hash_map<char32, inT64> char_map_; // Time-saving char histogram.
|
||||
int total_chars_; // Number in the string to be rendered.
|
||||
int font_index_; // Index of next font to use in font list.
|
||||
int last_offset_; // Offset returned from last successful rendering
|
||||
|
||||
private:
|
||||
StringRenderer(const StringRenderer&);
|
||||
void operator=(const StringRenderer&);
|
||||
};
|
||||
} // namespace tesseract
|
||||
|
||||
#endif  // TESSERACT_TRAINING_STRINGRENDERER_H_
|
601
training/text2image.cpp
Normal file
601
training/text2image.cpp
Normal file
@ -0,0 +1,601 @@
|
||||
/**********************************************************************
|
||||
* File: text2image.cpp
|
||||
* Description: Program to generate OCR training pages. Given a text file it
|
||||
* outputs an image with a given font and degradation.
|
||||
*
|
||||
* Note that since the results depend on the fonts available on
|
||||
* your system, running the code on a different machine, or
|
||||
* different OS, or even at a different time on the same machine,
|
||||
* may produce different fonts even if --font is given explicitly.
|
||||
* To see names of available fonts, use --list_available_fonts with
|
||||
* the appropriate --fonts_dir path.
|
||||
* Specifying --use_only_legacy_fonts will restrict the available
|
||||
* fonts to those listed in legacy_fonts.h
|
||||
*
|
||||
* Authors: Ranjith Unnikrishnan, Ray Smith
|
||||
* Created: Tue Nov 19 2013
|
||||
*
|
||||
* (C) Copyright 2013, Google Inc.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "allheaders.h" // from leptonica
|
||||
#include "boxchar.h"
|
||||
#include "commandlineflags.h"
|
||||
#include "degradeimage.h"
|
||||
#include "errcode.h"
|
||||
#include "fileio.h"
|
||||
#include "normstrngs.h"
|
||||
#include "stringrenderer.h"
|
||||
#include "tlog.h"
|
||||
#include "unicharset.h"
|
||||
#include "util.h"
|
||||
|
||||
#ifdef USE_STD_NAMESPACE
|
||||
using std::make_pair;
|
||||
using std::map;
|
||||
using std::pair;
|
||||
#endif
|
||||
|
||||
// The text input file.
|
||||
STRING_PARAM_FLAG(text, "", "File name of text input to process");
|
||||
|
||||
// The text output file.
|
||||
STRING_PARAM_FLAG(outputbase, "", "Basename for output image/box file");
|
||||
|
||||
// Degrade the rendered image to mimic scanner quality.
|
||||
BOOL_PARAM_FLAG(degrade_image, true,
|
||||
"Degrade rendered image with speckle noise, dilation/erosion "
|
||||
"and rotation");
|
||||
|
||||
// Degradation to apply to the image.
|
||||
INT_PARAM_FLAG(exposure, 0, "Exposure level in photocopier");
|
||||
|
||||
// Output image resolution.
|
||||
INT_PARAM_FLAG(resolution, 300, "Pixels per inch");
|
||||
|
||||
// Width of output image (in pixels).
|
||||
INT_PARAM_FLAG(xsize, 3600, "Width of output image");
|
||||
|
||||
// Max height of output image (in pixels).
|
||||
INT_PARAM_FLAG(ysize, 4800, "Height of output image");
|
||||
|
||||
// Margin around text (in pixels).
|
||||
INT_PARAM_FLAG(margin, 100, "Margin round edges of image");
|
||||
|
||||
// Size of text (in points).
|
||||
INT_PARAM_FLAG(ptsize, 12, "Size of printed text");
|
||||
|
||||
// Inter-character space (in ems).
|
||||
DOUBLE_PARAM_FLAG(char_spacing, 0, "Inter-character space in ems");
|
||||
|
||||
// Inter-line space (in pixels).
|
||||
INT_PARAM_FLAG(leading, 12, "Inter-line space (in pixels)");
|
||||
|
||||
// Layout and glyph orientation on rendering.
|
||||
STRING_PARAM_FLAG(writing_mode, "horizontal",
|
||||
"Specify one of the following writing"
|
||||
" modes.\n"
|
||||
"'horizontal' : Render regular horizontal text. (default)\n"
|
||||
"'vertical' : Render vertical text. Glyph orientation is"
|
||||
" selected by Pango.\n"
|
||||
"'vertical-upright' : Render vertical text. Glyph "
|
||||
" orientation is set to be upright.");
|
||||
|
||||
INT_PARAM_FLAG(box_padding, 0, "Padding around produced bounding boxes");
|
||||
|
||||
BOOL_PARAM_FLAG(strip_unrenderable_words, false,
|
||||
"Remove unrenderable words from source text");
|
||||
|
||||
// Font name.
|
||||
STRING_PARAM_FLAG(font, "Arial", "Font description name to use");
|
||||
|
||||
BOOL_PARAM_FLAG(ligatures, false,
|
||||
"Rebuild and render ligatures");
|
||||
|
||||
BOOL_PARAM_FLAG(find_fonts, false,
|
||||
"Search for all fonts that can render the text");
|
||||
BOOL_PARAM_FLAG(render_per_font, true,
|
||||
"If find_fonts==true, render each font to its own image. "
|
||||
"Image filenames are of the form output_name.font_name.tif");
|
||||
|
||||
BOOL_PARAM_FLAG(list_available_fonts, false, "List available fonts and quit.");
|
||||
|
||||
BOOL_PARAM_FLAG(render_ngrams, false, "Put each space-separated entity from the"
|
||||
" input file into one bounding box. The ngrams in the input"
|
||||
" file will be randomly permuted before rendering (so that"
|
||||
" there is sufficient variety of characters on each line).");
|
||||
|
||||
BOOL_PARAM_FLAG(output_word_boxes, false,
|
||||
"Output word bounding boxes instead of character boxes. "
|
||||
"This is used for Cube training, and implied by "
|
||||
"--render_ngrams.");
|
||||
|
||||
STRING_PARAM_FLAG(unicharset_file, "",
|
||||
"File with characters in the unicharset. If --render_ngrams"
|
||||
" is true and --unicharset_file is specified, ngrams with"
|
||||
" characters that are not in unicharset will be omitted");
|
||||
|
||||
BOOL_PARAM_FLAG(bidirectional_rotation, false,
|
||||
"Rotate the generated characters both ways.");
|
||||
|
||||
BOOL_PARAM_FLAG(only_extract_font_properties, false,
|
||||
"Assumes that the input file contains a list of ngrams. Renders"
|
||||
" each ngram, extracts spacing properties and records them in"
|
||||
" output_base/[font_name].fontinfo file.");
|
||||
|
||||
// Use these flags to output zero-padded, square individual character images
|
||||
BOOL_PARAM_FLAG(output_individual_glyph_images, false,
|
||||
"If true also outputs individual character images");
|
||||
INT_PARAM_FLAG(glyph_resized_size, 0,
|
||||
"Each glyph is square with this side length in pixels");
|
||||
INT_PARAM_FLAG(glyph_num_border_pixels_to_pad, 0,
|
||||
"Final_size=glyph_resized_size+2*glyph_num_border_pixels_to_pad");
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
struct SpacingProperties {
|
||||
SpacingProperties() : x_gap_before(0), x_gap_after(0) {}
|
||||
SpacingProperties(int b, int a) : x_gap_before(b), x_gap_after(a) {}
|
||||
// These values are obtained from FT_Glyph_Metrics struct
|
||||
// used by the FreeType font engine.
|
||||
int x_gap_before; // horizontal x bearing
|
||||
int x_gap_after; // horizontal advance - x_gap_before - width
|
||||
map<string, int> kerned_x_gaps;
|
||||
};
|
||||
|
||||
static bool IsWhitespaceBox(const BoxChar* boxchar) {
|
||||
return (boxchar->box() == NULL ||
|
||||
SpanUTF8Whitespace(boxchar->ch().c_str()));
|
||||
}
|
||||
|
||||
// Returns a copy of |in| in which every non-overlapping occurrence of
// |oldsub|, found scanning left to right, is replaced by |newsub|.
// An empty |oldsub| replaces nothing and |in| is returned unchanged.
static string StringReplace(const string& in,
                            const string& oldsub, const string& newsub) {
  // An empty pattern matches at every position without advancing the scan,
  // which would loop forever while growing the output.
  if (oldsub.empty()) return in;

  string out;
  // Positions are kept as size_type so that the comparison with npos is exact
  // and long inputs are not truncated.
  string::size_type start_pos = 0;
  string::size_type pos;
  while ((pos = in.find(oldsub, start_pos)) != string::npos) {
    out.append(in, start_pos, pos - start_pos);  // Text before the match.
    out.append(newsub);
    start_pos = pos + oldsub.length();
  }
  // Remainder after the last match (the whole input if nothing matched).
  out.append(in, start_pos, string::npos);
  return out;
}
|
||||
|
||||
// Assumes that each word (whitespace-separated entity) in text is a bigram.
|
||||
// Renders the bigrams and calls FontInfo::GetSpacingProperties() to
|
||||
// obtain spacing information. Produces the output .fontinfo file with a line
|
||||
// per unichar of the form:
|
||||
// unichar space_before space_after kerned1 kerned_space1 kerned2 ...
|
||||
// Fox example, if unichar "A" has spacing of 0 pixels before and -1 pixels
|
||||
// after, is kerned with "V" resulting in spacing of "AV" to be -7 and kerned
|
||||
// with "T", such that "AT" has spacing of -5, the entry/line for unichar "A"
|
||||
// in .fontinfo file will be:
|
||||
// A 0 -1 T -5 V -7
|
||||
void ExtractFontProperties(const string &utf8_text,
|
||||
StringRenderer *render,
|
||||
const string &output_base) {
|
||||
map<string, SpacingProperties> spacing_map;
|
||||
map<string, SpacingProperties>::iterator spacing_map_it0;
|
||||
map<string, SpacingProperties>::iterator spacing_map_it1;
|
||||
int x_bearing, x_advance;
|
||||
int len = utf8_text.length();
|
||||
int offset = 0;
|
||||
const char* text = utf8_text.c_str();
|
||||
while (offset < len) {
|
||||
offset += render->RenderToImage(text + offset, strlen(text + offset), NULL);
|
||||
const vector<BoxChar*> &boxes = render->GetBoxes();
|
||||
|
||||
// If the page break split a bigram, correct the offset so we try the bigram
|
||||
// on the next iteration.
|
||||
if (boxes.size() > 2 && !IsWhitespaceBox(boxes[boxes.size() - 1]) &&
|
||||
IsWhitespaceBox(boxes[boxes.size() - 2])) {
|
||||
if (boxes.size() > 3) {
|
||||
tprintf("WARNING: Adjusting to bad page break after '%s%s'\n",
|
||||
boxes[boxes.size() - 4]->ch().c_str(),
|
||||
boxes[boxes.size() - 3]->ch().c_str());
|
||||
}
|
||||
offset -= boxes[boxes.size() - 1]->ch().size();
|
||||
}
|
||||
|
||||
for (int b = 0; b < boxes.size(); b += 2) {
|
||||
while (b < boxes.size() && IsWhitespaceBox(boxes[b])) ++b;
|
||||
if (b + 1 >= boxes.size()) break;
|
||||
const string &ch0 = boxes[b]->ch();
|
||||
// We encountered a ligature. This happens in at least two scenarios:
|
||||
// One is when the rendered bigram forms a grapheme cluster (eg. the
|
||||
// second character in the bigram is a combining vowel), in which case we
|
||||
// correctly output only one bounding box.
|
||||
// A second far less frequent case is when caused some fonts like 'DejaVu
|
||||
// Sans Ultra-Light' force Pango to render a ligatured character even if
|
||||
// the input consists of the separated characters. NOTE(ranjith): As per
|
||||
// behdad@ this is not currently controllable at the level of the Pango
|
||||
// API.
|
||||
// Safeguard against these cases here by just skipping the bigram.
|
||||
if (IsWhitespaceBox(boxes[b+1])) {
|
||||
tprintf("WARNING: Found unexpected ligature: %s\n", ch0.c_str());
|
||||
continue;
|
||||
}
|
||||
int xgap = (boxes[b+1]->box()->x -
|
||||
(boxes[b]->box()->x + boxes[b]->box()->w));
|
||||
spacing_map_it0 = spacing_map.find(ch0);
|
||||
int ok_count = 0;
|
||||
if (spacing_map_it0 == spacing_map.end() &&
|
||||
render->font().GetSpacingProperties(ch0, &x_bearing, &x_advance)) {
|
||||
spacing_map[ch0] = SpacingProperties(
|
||||
x_bearing, x_advance - x_bearing - boxes[b]->box()->w);
|
||||
spacing_map_it0 = spacing_map.find(ch0);
|
||||
++ok_count;
|
||||
}
|
||||
const string &ch1 = boxes[b+1]->ch();
|
||||
tlog(3, "%s%s\n", ch0.c_str(), ch1.c_str());
|
||||
spacing_map_it1 = spacing_map.find(ch1);
|
||||
if (spacing_map_it1 == spacing_map.end() &&
|
||||
render->font().GetSpacingProperties(ch1, &x_bearing, &x_advance)) {
|
||||
spacing_map[ch1] = SpacingProperties(
|
||||
x_bearing, x_advance - x_bearing - boxes[b+1]->box()->w);
|
||||
spacing_map_it1 = spacing_map.find(ch1);
|
||||
++ok_count;
|
||||
}
|
||||
if (ok_count == 2 && xgap != (spacing_map_it0->second.x_gap_after +
|
||||
spacing_map_it1->second.x_gap_before)) {
|
||||
spacing_map_it0->second.kerned_x_gaps[ch1] = xgap;
|
||||
}
|
||||
}
|
||||
render->ClearBoxes();
|
||||
}
|
||||
string output_string;
|
||||
const int kBufSize = 1024;
|
||||
char buf[kBufSize];
|
||||
snprintf(buf, kBufSize, "%d\n", static_cast<int>(spacing_map.size()));
|
||||
output_string.append(buf);
|
||||
map<string, SpacingProperties>::const_iterator spacing_map_it;
|
||||
for (spacing_map_it = spacing_map.begin();
|
||||
spacing_map_it != spacing_map.end(); ++spacing_map_it) {
|
||||
snprintf(buf, kBufSize,
|
||||
"%s %d %d %d", spacing_map_it->first.c_str(),
|
||||
spacing_map_it->second.x_gap_before,
|
||||
spacing_map_it->second.x_gap_after,
|
||||
static_cast<int>(spacing_map_it->second.kerned_x_gaps.size()));
|
||||
output_string.append(buf);
|
||||
map<string, int>::const_iterator kern_it;
|
||||
for (kern_it = spacing_map_it->second.kerned_x_gaps.begin();
|
||||
kern_it != spacing_map_it->second.kerned_x_gaps.end(); ++kern_it) {
|
||||
snprintf(buf, kBufSize,
|
||||
" %s %d", kern_it->first.c_str(), kern_it->second);
|
||||
output_string.append(buf);
|
||||
}
|
||||
output_string.append("\n");
|
||||
}
|
||||
File::WriteStringToFileOrDie(output_string, output_base + ".fontinfo");
|
||||
}
|
||||
|
||||
bool MakeIndividualGlyphs(Pix* pix,
|
||||
const vector<BoxChar*>& vbox,
|
||||
const int input_tiff_page) {
|
||||
// If checks fail, return false without exiting text2image
|
||||
if (!pix) {
|
||||
tprintf("ERROR: MakeIndividualGlyphs(): Input Pix* is NULL\n");
|
||||
return false;
|
||||
} else if (FLAGS_glyph_resized_size <= 0) {
|
||||
tprintf("ERROR: --glyph_resized_size must be positive\n");
|
||||
return false;
|
||||
} else if (FLAGS_glyph_num_border_pixels_to_pad < 0) {
|
||||
tprintf("ERROR: --glyph_num_border_pixels_to_pad must be 0 or positive\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
const int n_boxes = vbox.size();
|
||||
int n_boxes_saved = 0;
|
||||
int current_tiff_page = 0;
|
||||
int y_previous = 0;
|
||||
static int glyph_count = 0;
|
||||
for (int i = 0; i < n_boxes; i++) {
|
||||
// Get one bounding box
|
||||
Box* b = vbox[i]->mutable_box();
|
||||
if (!b) continue;
|
||||
const int x = b->x;
|
||||
const int y = b->y;
|
||||
const int w = b->w;
|
||||
const int h = b->h;
|
||||
// Check present tiff page (for multipage tiff)
|
||||
if (y < y_previous-pixGetHeight(pix)/10) {
|
||||
tprintf("ERROR: Wrap-around encountered, at i=%d\n", i);
|
||||
current_tiff_page++;
|
||||
}
|
||||
if (current_tiff_page < input_tiff_page) continue;
|
||||
else if (current_tiff_page > input_tiff_page) break;
|
||||
// Check box validity
|
||||
if (x < 0 || y < 0 ||
|
||||
(x+w-1) >= pixGetWidth(pix) ||
|
||||
(y+h-1) >= pixGetHeight(pix)) {
|
||||
tprintf("ERROR: MakeIndividualGlyphs(): Index out of range, at i=%d"
|
||||
" (x=%d, y=%d, w=%d, h=%d\n)", i, x, y, w, h);
|
||||
continue;
|
||||
} else if (w < FLAGS_glyph_num_border_pixels_to_pad &&
|
||||
h < FLAGS_glyph_num_border_pixels_to_pad) {
|
||||
tprintf("ERROR: Input image too small to be a character, at i=%d\n", i);
|
||||
continue;
|
||||
}
|
||||
// Crop the boxed character
|
||||
Pix* pix_glyph = pixClipRectangle(pix, b, NULL);
|
||||
if (!pix_glyph) {
|
||||
tprintf("ERROR: MakeIndividualGlyphs(): Failed to clip, at i=%d\n", i);
|
||||
continue;
|
||||
}
|
||||
// Resize to square
|
||||
Pix* pix_glyph_sq = pixScaleToSize(pix_glyph,
|
||||
FLAGS_glyph_resized_size,
|
||||
FLAGS_glyph_resized_size);
|
||||
if (!pix_glyph_sq) {
|
||||
tprintf("ERROR: MakeIndividualGlyphs(): Failed to resize, at i=%d\n", i);
|
||||
continue;
|
||||
}
|
||||
// Zero-pad
|
||||
Pix* pix_glyph_sq_pad = pixAddBorder(pix_glyph_sq,
|
||||
FLAGS_glyph_num_border_pixels_to_pad,
|
||||
0);
|
||||
if (!pix_glyph_sq_pad) {
|
||||
tprintf("ERROR: MakeIndividualGlyphs(): Failed to zero-pad, at i=%d\n", i);
|
||||
continue;
|
||||
}
|
||||
// Write out
|
||||
Pix* pix_glyph_sq_pad_8 = pixConvertTo8(pix_glyph_sq_pad, false);
|
||||
char filename[1024];
|
||||
snprintf(filename, 1024, "%s_%d.jpg", FLAGS_outputbase.c_str(),
|
||||
glyph_count++);
|
||||
if (pixWriteJpeg(filename, pix_glyph_sq_pad_8, 100, 0)) {
|
||||
tprintf("ERROR: MakeIndividualGlyphs(): Failed to write JPEG to %s,"
|
||||
" at i=%d\n", filename, i);
|
||||
continue;
|
||||
}
|
||||
|
||||
pixDestroy(&pix_glyph);
|
||||
pixDestroy(&pix_glyph_sq);
|
||||
pixDestroy(&pix_glyph_sq_pad);
|
||||
pixDestroy(&pix_glyph_sq_pad_8);
|
||||
n_boxes_saved++;
|
||||
y_previous = y;
|
||||
}
|
||||
if (n_boxes_saved == 0) {
|
||||
return false;
|
||||
} else {
|
||||
tprintf("Total number of characters saved = %d\n", n_boxes_saved);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
} // namespace tesseract
|
||||
|
||||
using tesseract::DegradeImage;
|
||||
using tesseract::ExtractFontProperties;
|
||||
using tesseract::File;
|
||||
using tesseract::FontUtils;
|
||||
using tesseract::SpanUTF8NotWhitespace;
|
||||
using tesseract::SpanUTF8Whitespace;
|
||||
using tesseract::StringRenderer;
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
|
||||
|
||||
if (FLAGS_list_available_fonts) {
|
||||
const vector<string>& all_fonts = FontUtils::ListAvailableFonts();
|
||||
for (int i = 0; i < all_fonts.size(); ++i) {
|
||||
tprintf("%3d: %s\n", i, all_fonts[i].c_str());
|
||||
ASSERT_HOST_MSG(FontUtils::IsAvailableFont(all_fonts[i].c_str()),
|
||||
"Font %s is unrecognized.\n", all_fonts[i].c_str());
|
||||
}
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
// Check validity of input flags.
|
||||
ASSERT_HOST_MSG(!FLAGS_text.empty(), "Text file missing!\n");
|
||||
ASSERT_HOST_MSG(!FLAGS_outputbase.empty(), "Output file missing!\n");
|
||||
ASSERT_HOST_MSG(FLAGS_render_ngrams || FLAGS_unicharset_file.empty(),
|
||||
"Use --unicharset_file only if --render_ngrams is set.\n");
|
||||
|
||||
ASSERT_HOST_MSG(FLAGS_find_fonts ||
|
||||
FontUtils::IsAvailableFont(FLAGS_font.c_str()),
|
||||
"Could not find font named %s\n", FLAGS_font.c_str());
|
||||
|
||||
if (FLAGS_render_ngrams)
|
||||
FLAGS_output_word_boxes = true;
|
||||
|
||||
char font_desc_name[1024];
|
||||
snprintf(font_desc_name, 1024, "%s %d", FLAGS_font.c_str(),
|
||||
static_cast<int>(FLAGS_ptsize));
|
||||
StringRenderer render(font_desc_name, FLAGS_xsize, FLAGS_ysize);
|
||||
render.set_add_ligatures(FLAGS_ligatures);
|
||||
render.set_leading(FLAGS_leading);
|
||||
render.set_resolution(FLAGS_resolution);
|
||||
render.set_char_spacing(FLAGS_char_spacing * FLAGS_ptsize);
|
||||
render.set_h_margin(FLAGS_margin);
|
||||
render.set_v_margin(FLAGS_margin);
|
||||
render.set_output_word_boxes(FLAGS_output_word_boxes);
|
||||
render.set_box_padding(FLAGS_box_padding);
|
||||
render.set_strip_unrenderable_words(FLAGS_strip_unrenderable_words);
|
||||
|
||||
// Set text rendering orientation and their forms.
|
||||
if (FLAGS_writing_mode == "horizontal") {
|
||||
// Render regular horizontal text (default).
|
||||
render.set_vertical_text(false);
|
||||
render.set_gravity_hint_strong(false);
|
||||
render.set_render_fullwidth_latin(false);
|
||||
} else if (FLAGS_writing_mode == "vertical") {
|
||||
// Render vertical text. Glyph orientation is selected by Pango.
|
||||
render.set_vertical_text(true);
|
||||
render.set_gravity_hint_strong(false);
|
||||
render.set_render_fullwidth_latin(false);
|
||||
} else if (FLAGS_writing_mode == "vertical-upright") {
|
||||
// Render vertical text. Glyph orientation is set to be upright.
|
||||
// Also Basic Latin characters are converted to their fullwidth forms
|
||||
// on rendering, since fullwidth Latin characters are well designed to fit
|
||||
// vertical text lines, while .box files store halfwidth Basic Latin
|
||||
// unichars.
|
||||
render.set_vertical_text(true);
|
||||
render.set_gravity_hint_strong(true);
|
||||
render.set_render_fullwidth_latin(true);
|
||||
} else {
|
||||
TLOG_FATAL("Invalid writing mode : %s\n", FLAGS_writing_mode.c_str());
|
||||
}
|
||||
|
||||
string src_utf8;
|
||||
// This c_str is NOT redundant!
|
||||
File::ReadFileToStringOrDie(FLAGS_text.c_str(), &src_utf8);
|
||||
|
||||
// Remove the unicode mark if present.
|
||||
if (strncmp(src_utf8.c_str(), "\xef\xbb\xbf", 3) == 0) {
|
||||
src_utf8.erase(0, 3);
|
||||
}
|
||||
tlog(1, "Render string of size %d\n", src_utf8.length());
|
||||
|
||||
if (FLAGS_render_ngrams || FLAGS_only_extract_font_properties) {
|
||||
// Try to preserve behavior of old text2image by expanding inter-word
|
||||
// spaces by a factor of 4.
|
||||
const string kSeparator = FLAGS_render_ngrams ? " " : " ";
|
||||
// Also restrict the number of charactes per line to try and avoid
|
||||
// line-breaking in the middle of words like "-A", "R$" etc. which are
|
||||
// otherwise allowed by the standard unicode line-breaking rules.
|
||||
const int kCharsPerLine = (FLAGS_ptsize > 20) ? 50 : 100;
|
||||
string rand_utf8;
|
||||
UNICHARSET unicharset;
|
||||
if (FLAGS_render_ngrams && !FLAGS_unicharset_file.empty() &&
|
||||
!unicharset.load_from_file(FLAGS_unicharset_file.c_str())) {
|
||||
TLOG_FATAL("Failed to load unicharset from file %s\n",
|
||||
FLAGS_unicharset_file.c_str());
|
||||
}
|
||||
|
||||
// If we are rendering ngrams that will be OCRed later, shuffle them so that
|
||||
// tesseract does not have difficulties finding correct baseline, word
|
||||
// spaces, etc.
|
||||
const char *str8 = src_utf8.c_str();
|
||||
int len = src_utf8.length();
|
||||
int step;
|
||||
vector<pair<int, int> > offsets;
|
||||
int offset = SpanUTF8Whitespace(str8);
|
||||
while (offset < len) {
|
||||
step = SpanUTF8NotWhitespace(str8 + offset);
|
||||
offsets.push_back(make_pair(offset, step));
|
||||
offset += step;
|
||||
offset += SpanUTF8Whitespace(str8 + offset);
|
||||
}
|
||||
if (FLAGS_render_ngrams)
|
||||
std::random_shuffle(offsets.begin(), offsets.end());
|
||||
|
||||
for (int i = 0, line = 1; i < offsets.size(); ++i) {
|
||||
const char *curr_pos = str8 + offsets[i].first;
|
||||
int ngram_len = offsets[i].second;
|
||||
// Skip words that contain characters not in found in unicharset.
|
||||
if (!FLAGS_unicharset_file.empty() &&
|
||||
!unicharset.encodable_string(curr_pos, NULL)) {
|
||||
continue;
|
||||
}
|
||||
rand_utf8.append(curr_pos, ngram_len);
|
||||
if (rand_utf8.length() > line * kCharsPerLine) {
|
||||
rand_utf8.append(" \n");
|
||||
++line;
|
||||
if (line & 0x1) rand_utf8.append(kSeparator);
|
||||
} else {
|
||||
rand_utf8.append(kSeparator);
|
||||
}
|
||||
}
|
||||
tlog(1, "Rendered ngram string of size %d\n", rand_utf8.length());
|
||||
src_utf8.swap(rand_utf8);
|
||||
}
|
||||
if (FLAGS_only_extract_font_properties) {
|
||||
tprintf("Extracting font properties only\n");
|
||||
ExtractFontProperties(src_utf8, &render, FLAGS_outputbase.c_str());
|
||||
tprintf("Done!\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
int im = 0;
|
||||
vector<float> page_rotation;
|
||||
const char* to_render_utf8 = src_utf8.c_str();
|
||||
|
||||
// We use a two pass mechanism to rotate images in both direction.
|
||||
// The first pass(0) will rotate the images in random directions and
|
||||
// the second pass(1) will mirror those rotations.
|
||||
int num_pass = FLAGS_bidirectional_rotation ? 2 : 1;
|
||||
for (int pass = 0; pass < num_pass; ++pass) {
|
||||
int page_num = 0;
|
||||
string font_used;
|
||||
for (int offset = 0; offset < strlen(to_render_utf8); ++im, ++page_num) {
|
||||
tlog(1, "Starting page %d\n", im);
|
||||
Pix* pix = NULL;
|
||||
if (FLAGS_find_fonts) {
|
||||
offset += render.RenderAllFontsToImage(to_render_utf8 + offset,
|
||||
strlen(to_render_utf8 + offset),
|
||||
&font_used, &pix);
|
||||
} else {
|
||||
offset += render.RenderToImage(to_render_utf8 + offset,
|
||||
strlen(to_render_utf8 + offset), &pix);
|
||||
}
|
||||
if (pix != NULL) {
|
||||
float rotation = 0;
|
||||
if (pass == 1) {
|
||||
// Pass 2, do mirror rotation.
|
||||
rotation = -1 * page_rotation[page_num];
|
||||
}
|
||||
if (FLAGS_degrade_image) {
|
||||
pix = DegradeImage(pix, FLAGS_exposure, &rotation);
|
||||
}
|
||||
render.RotatePageBoxes(rotation);
|
||||
|
||||
if (pass == 0) {
|
||||
// Pass 1, rotate randomly and store the rotation.
|
||||
page_rotation.push_back(rotation);
|
||||
}
|
||||
|
||||
Pix* gray_pix = pixConvertTo8(pix, false);
|
||||
pixDestroy(&pix);
|
||||
Pix* binary = pixThresholdToBinary(gray_pix, 128);
|
||||
pixDestroy(&gray_pix);
|
||||
char tiff_name[1024];
|
||||
if (FLAGS_find_fonts && FLAGS_render_per_font) {
|
||||
string fontname_for_file = tesseract::StringReplace(
|
||||
font_used, " ", "_");
|
||||
snprintf(tiff_name, 1024, "%s.%s.tif", FLAGS_outputbase.c_str(),
|
||||
fontname_for_file.c_str());
|
||||
pixWriteTiff(tiff_name, binary, IFF_TIFF_G4, "w");
|
||||
} else {
|
||||
snprintf(tiff_name, 1024, "%s.tif", FLAGS_outputbase.c_str());
|
||||
pixWriteTiff(tiff_name, binary, IFF_TIFF_G4, im == 0 ? "w" : "a");
|
||||
}
|
||||
tprintf("Rendered page %d to file %s\n", im, tiff_name);
|
||||
// Make individual glyphs
|
||||
if (FLAGS_output_individual_glyph_images) {
|
||||
if (!MakeIndividualGlyphs(binary, render.GetBoxes(), im)) {
|
||||
tprintf("ERROR: Individual glyphs not saved\n");
|
||||
}
|
||||
}
|
||||
pixDestroy(&binary);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!FLAGS_find_fonts) {
|
||||
string box_name = FLAGS_outputbase.c_str();
|
||||
box_name += ".box";
|
||||
render.WriteAllBoxes(box_name);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
23
training/tlog.cpp
Normal file
23
training/tlog.cpp
Normal file
@ -0,0 +1,23 @@
|
||||
/**********************************************************************
|
||||
* File: tlog.cpp
|
||||
* Description: Variant of printf with logging level controllable by a
|
||||
* commandline flag.
|
||||
* Author: Ranjith Unnikrishnan
|
||||
* Created: Wed Nov 20 2013
|
||||
*
|
||||
* (C) Copyright 2013, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include "tlog.h"
|
||||
|
||||
// Instantiates the tlog_level flag that tlog.h declares and that tlog() /
// TLOG_IS_ON() read as FLAGS_tlog_level. The second argument (0) is presumably
// the default value, matching the "default 0" noted in tlog.h.
// A tlog(level, ...) message is emitted when FLAGS_tlog_level >= level, so
// raising this flag enables more verbose messages.
INT_PARAM_FLAG(tlog_level, 0, "Minimum logging level for tlog() output");
|
49
training/tlog.h
Normal file
49
training/tlog.h
Normal file
@ -0,0 +1,49 @@
|
||||
/**********************************************************************
|
||||
* File: tlog.h
|
||||
* Description: Variant of printf with logging level controllable by a
|
||||
* commandline flag.
|
||||
* Author: Ranjith Unnikrishnan
|
||||
* Created: Wed Nov 20 2013
|
||||
*
|
||||
* (C) Copyright 2013, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
#ifndef TESSERACT_TRAINING_TLOG_H_
|
||||
#define TESSERACT_TRAINING_TLOG_H_
|
||||
|
||||
#include "commandlineflags.h"
|
||||
#include "errcode.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
// Presumably declares FLAGS_tlog_level (the variable read by the tlog() and
// TLOG_IS_ON() macros in this header) so that every file including this header
// can use those macros.
DECLARE_INT_PARAM_FLAG(tlog_level);
|
||||
|
||||
// Variant of tprintf guarded by the numeric logging level parameter
// FLAGS_tlog_level (default 0): the printf-style message is forwarded to
// tprintf_internal() only when FLAGS_tlog_level >= level.
// Code using ParseCommandLineFlags() can control its value using
// the --tlog_level commandline argument. Otherwise it must be specified in a
// config file like other params.
//
// The level argument is parenthesized so that expressions such as
// tlog(verbose ? 1 : 3, ...) compare the whole expression against the flag.
// NOTE: this expands to a braced block, so do not use it as the unbraced body
// of an if that is followed by an else.
#define tlog(level, ...) {                        \
  if (FLAGS_tlog_level >= (level)) {              \
    tprintf_internal(__VA_ARGS__);                \
  }                                               \
}
|
||||
|
||||
// True when a tlog() call at the given level would produce output, i.e. when
// FLAGS_tlog_level >= level. The argument is parenthesized so that compound
// expressions (e.g. a ?: expression) are compared as a whole.
#define TLOG_IS_ON(level) (FLAGS_tlog_level >= (level))
|
||||
|
||||
// Prints the printf-style message with tprintf() and then raises
// ASSERT_FAILED with the ABORT action, tagged with the current file and line.
// Uses the standard variadic-macro form (... / __VA_ARGS__), as tlog() does,
// instead of the GNU-only named form (msg...), so that it also builds with
// compilers that lack that extension.
#define TLOG_FATAL(...)                                                 \
  {                                                                     \
    tprintf(__VA_ARGS__);                                               \
    ASSERT_FAILED.error("", ABORT, "in file %s, line %d",               \
                        __FILE__, __LINE__);                            \
  }
|
||||
|
||||
|
||||
#endif // TESSERACT_TRAINING_TLOG_H_
|
67
training/util.h
Normal file
67
training/util.h
Normal file
@ -0,0 +1,67 @@
|
||||
/**********************************************************************
|
||||
* File: util.h
|
||||
* Description: Misc STL string utility functions.
|
||||
* Author: Samuel Charron
|
||||
* Created: Mon Nov 18 2013
|
||||
*
|
||||
* (C) Copyright 2013, Google Inc.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef TESSERACT_TRAINING_UTIL_H_
|
||||
#define TESSERACT_TRAINING_UTIL_H_
|
||||
|
||||
#include <stddef.h>
#include <stdlib.h>
#include <string>
#include <vector>  // Needed by the using-declaration for std::vector below.

// When USE_STD_NAMESPACE is defined, make string and vector usable without
// the std:: prefix in code that includes this header.
#ifdef USE_STD_NAMESPACE
using std::string;
using std::vector;
#endif
|
||||
|
||||
// StringHash is the hashing functor needed by the stl hash map.
|
||||
#ifndef COMPILER_MSVC
|
||||
// Hash functor over strings: sums every byte of the string, each shifted left
// by its position modulo 24. Scanning stops at the first NUL byte because the
// string is walked through c_str().
struct StringHash {
  size_t operator()(const string& s) const {
    size_t hash_code = 0;
    const char* str = s.c_str();
    for (size_t ch = 0; str[ch] != 0; ++ch) {
      // Hash the unsigned byte value. Plain char may be signed, in which case
      // bytes >= 0x80 (e.g. UTF-8 text) are negative, and left-shifting a
      // negative int is undefined behavior before C++20.
      const size_t byte = static_cast<unsigned char>(str[ch]);
      hash_code += byte << (ch % 24);
    }
    return hash_code;
  }
};
|
||||
#else // COMPILER_MSVC
|
||||
// MSVC variant of the string hash functor, derived from
// stdext::hash_compare so it can serve as the traits class of the stdext hash
// containers. It supplies both the hash function and the key ordering.
struct StringHash : public stdext::hash_compare <string> {
  // Hash: sums every byte of the string, each shifted left by its position
  // modulo 24. Scanning stops at the first NUL byte (c_str() walk).
  size_t operator()(const string& s) const {
    size_t hash_code = 0;
    const char* str = s.c_str();
    for (size_t ch = 0; str[ch] != 0; ++ch) {
      // Hash the unsigned byte value. Plain char may be signed, in which case
      // bytes >= 0x80 (e.g. UTF-8 text) are negative, and left-shifting a
      // negative int is undefined behavior before C++20.
      const size_t byte = static_cast<unsigned char>(str[ch]);
      hash_code += byte << (ch % 24);
    }
    return hash_code;
  }
  // Key ordering: stdext::hash_compare requires this to be an ordering
  // predicate ("s1 precedes s2"), not an equality test. The container treats
  // two keys as equal only when neither precedes the other, so returning
  // s1 == s2 would make distinct colliding keys look identical.
  bool operator()(const string& s1, const string& s2) const {
    return s1 < s2;
  }
};
|
||||
#endif // !COMPILER_MSVC
|
||||
|
||||
// DISABLE_HEAP_LEAK_CHECK is written as a statement at the top of a scope.
// With USE_STD_NAMESPACE defined it expands to an empty block; otherwise it
// declares a local HeapLeakChecker::Disabler named disabler (presumably to
// suppress heap-leak checking for the enclosing scope -- confirm against
// base/heap-checker.h).
#ifdef USE_STD_NAMESPACE
#define DISABLE_HEAP_LEAK_CHECK {}
#else
#include "base/heap-checker.h"
#define DISABLE_HEAP_LEAK_CHECK HeapLeakChecker::Disabler disabler
#endif
|
||||
|
||||
#endif // TESSERACT_TRAINING_UTIL_H_
|
Loading…
Reference in New Issue
Block a user