mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-30 23:49:05 +08:00
023e1b340e
* api: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccmain: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccstruct: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * classify: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * cutil: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * dict: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * textord: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * training: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * wordrec: Replace Tesseract data types by POSIX data types Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccutil: Replace Tesseract data types by POSIX data types Now all Tesseract data types which are no longer needed can be removed from ccutil/host.h. 
Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccmain: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccstruct: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * classify: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * dict: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * lstm: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * textord: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * wordrec: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * ccutil: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX Remove the macros which are now unused from ccutil/host.h. Remove also the obsolete history comments. 
Signed-off-by: Stefan Weil <sw@weilnetz.de> * Fix build error caused by ambiguous ClipToRange Error message vom Appveyor CI: C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2672: 'ClipToRange': no matching overloaded function found [C:\projects\tesseract\build\libtesseract.vcxproj] C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2782: 'T ClipToRange(const T &,const T &,const T &)': template parameter 'T' is ambiguous [C:\projects\tesseract\build\libtesseract.vcxproj] c:\projects\tesseract\ccutil\helpers.h(122): note: see declaration of 'ClipToRange' C:\projects\tesseract\ccstruct\coutln.cpp(818): note: could be 'char' C:\projects\tesseract\ccstruct\coutln.cpp(818): note: or 'int' Signed-off-by: Stefan Weil <sw@weilnetz.de> * unittest: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de> * arch: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX Signed-off-by: Stefan Weil <sw@weilnetz.de>
217 lines
9.3 KiB
C++
217 lines
9.3 KiB
C++
/**********************************************************************
|
|
* File: pango_font_info.h
|
|
* Description: Font-related objects and helper functions
|
|
* Author: Ranjith Unnikrishnan
|
|
* Created: Mon Nov 18 2013
|
|
*
|
|
* (C) Copyright 2013, Google Inc.
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*
|
|
**********************************************************************/
|
|
|
|
#ifndef TESSERACT_TRAINING_PANGO_FONT_INFO_H_
|
|
#define TESSERACT_TRAINING_PANGO_FONT_INFO_H_
|
|
|
|
#include <string>
|
|
#include <unordered_map>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
#include "commandlineflags.h"
|
|
#include "host.h"
|
|
#include "pango/pango-font.h"
|
|
#include "pango/pango.h"
|
|
#include "pango/pangocairo.h"
|
|
#include "util.h"
|
|
|
|
// String flag (--fonts_dir) naming the directory that holds the font files.
// PangoFontInfo::SoftInitFontConfig() points the generated fonts.conf at it.
DECLARE_STRING_PARAM_FLAG(fonts_dir);
|
|
// String flag naming the directory used for the FontConfig cache by
// PangoFontInfo::SoftInitFontConfig().
DECLARE_STRING_PARAM_FLAG(fontconfig_tmpdir);
|
|
|
|
// Character type used as the key of the ch_map tables accepted by
// FontUtils::BestFonts() and FontUtils::FontScore().
// NOTE(review): presumably holds a Unicode code point - confirm with callers.
typedef signed int char32;
|
|
|
|
namespace tesseract {
|
|
|
|
// Data holder class for a font, intended to avoid having to work with Pango or
|
|
// FontConfig-specific objects directly.
|
|
class PangoFontInfo {
|
|
public:
|
|
enum FontTypeEnum {
|
|
UNKNOWN,
|
|
SERIF,
|
|
SANS_SERIF,
|
|
DECORATIVE,
|
|
};
|
|
PangoFontInfo();
|
|
~PangoFontInfo();
|
|
// Initialize from parsing a font description name, defined as a string of the
|
|
// format:
|
|
// "FamilyName [FaceName] [PointSize]"
|
|
// where a missing FaceName implies the default regular face.
|
|
// eg. "Arial Italic 12", "Verdana"
|
|
//
|
|
// FaceName is a combination of:
|
|
// [StyleName] [Variant] [Weight] [Stretch]
|
|
// with (all optional) Pango-defined values of:
|
|
// StyleName: Oblique, Italic
|
|
// Variant : Small-Caps
|
|
// Weight : Ultra-Light, Light, Medium, Semi-Bold, Bold, Ultra-Bold, Heavy
|
|
// Stretch : Ultra-Condensed, Extra-Condensed, Condensed, Semi-Condensed,
|
|
// Semi-Expanded, Expanded, Extra-Expanded, Ultra-Expanded.
|
|
explicit PangoFontInfo(const std::string& name);
|
|
bool ParseFontDescriptionName(const std::string& name);
|
|
|
|
// Returns true if the font have codepoint coverage for the specified text.
|
|
bool CoversUTF8Text(const char* utf8_text, int byte_length) const;
|
|
// Modifies string to remove unicode points that are not covered by the
|
|
// font. Returns the number of characters dropped.
|
|
int DropUncoveredChars(std::string* utf8_text) const;
|
|
|
|
// Returns true if the entire string can be rendered by the font with full
|
|
// character coverage and no unknown glyph or dotted-circle glyph
|
|
// substitutions on encountering a badly formed unicode sequence.
|
|
// If true, returns individual graphemes. Any whitespace characters in the
|
|
// original string are also included in the list.
|
|
bool CanRenderString(const char* utf8_word, int len,
|
|
std::vector<std::string>* graphemes) const;
|
|
bool CanRenderString(const char* utf8_word, int len) const;
|
|
|
|
// Retrieves the x_bearing and x_advance for the given utf8 character in the
|
|
// font. Returns false if the glyph for the character could not be found in
|
|
// the font.
|
|
// Ref: http://freetype.sourceforge.net/freetype2/docs/glyphs/glyphs-3.html
|
|
bool GetSpacingProperties(const std::string& utf8_char,
|
|
int* x_bearing, int* x_advance) const;
|
|
|
|
// If not already initialized, initializes FontConfig by setting its
|
|
// environment variable and creating a fonts.conf file that points to the
|
|
// FLAGS_fonts_dir and the cache to FLAGS_fontconfig_tmpdir.
|
|
static void SoftInitFontConfig();
|
|
// Re-initializes font config, whether or not already initialized.
|
|
// If already initialized, any existing cache is deleted, just to be sure.
|
|
static void HardInitFontConfig(const std::string& fonts_dir,
|
|
const std::string& cache_dir);
|
|
|
|
// Accessors
|
|
std::string DescriptionName() const;
|
|
// Font Family name eg. "Arial"
|
|
const std::string& family_name() const { return family_name_; }
|
|
// Size in points (1/72"), rounded to the nearest integer.
|
|
int font_size() const { return font_size_; }
|
|
FontTypeEnum font_type() const { return font_type_; }
|
|
|
|
int resolution() const { return resolution_; }
|
|
void set_resolution(const int resolution) {
|
|
resolution_ = resolution;
|
|
}
|
|
|
|
private:
|
|
friend class FontUtils;
|
|
void Clear();
|
|
bool ParseFontDescription(const PangoFontDescription* desc);
|
|
// Returns the PangoFont structure corresponding to the closest available font
|
|
// in the font map.
|
|
PangoFont* ToPangoFont() const;
|
|
|
|
// Font properties set automatically from parsing the font description name.
|
|
std::string family_name_;
|
|
int font_size_;
|
|
FontTypeEnum font_type_;
|
|
// The Pango description that was used to initialize the instance.
|
|
PangoFontDescription* desc_;
|
|
// Default output resolution to assume for GetSpacingProperties() and any
|
|
// other methods that returns pixel values.
|
|
int resolution_;
|
|
// Fontconfig operates through an environment variable, so it intrinsically
|
|
// cannot be thread-friendly, but you can serialize multiple independent
|
|
// font configurations by calling HardInitFontConfig(fonts_dir, cache_dir).
|
|
// These hold the last initialized values set by HardInitFontConfig or
|
|
// the first call to SoftInitFontConfig.
|
|
// Directory to be scanned for font files.
|
|
static std::string fonts_dir_;
|
|
// Directory to store the cache of font information. (Can be the same as
|
|
// fonts_dir_)
|
|
static std::string cache_dir_;
|
|
|
|
private:
|
|
PangoFontInfo(const PangoFontInfo&);
|
|
void operator=(const PangoFontInfo&);
|
|
};
|
|
|
|
// Static utility methods for querying font availability and font-selection
|
|
// based on codepoint coverage.
|
|
class FontUtils {
|
|
public:
|
|
// Returns true if the font of the given description name is available in the
|
|
// target directory specified by --fonts_dir
|
|
static bool IsAvailableFont(const char* font_desc) {
|
|
return IsAvailableFont(font_desc, nullptr);
|
|
}
|
|
// Returns true if the font of the given description name is available in the
|
|
// target directory specified by --fonts_dir. If false is returned, and
|
|
// best_match is not nullptr, the closest matching font is returned there.
|
|
static bool IsAvailableFont(const char* font_desc, std::string* best_match);
|
|
// Outputs description names of available fonts.
|
|
static const std::vector<std::string>& ListAvailableFonts();
|
|
|
|
// Picks font among available fonts that covers and can render the given word,
|
|
// and returns the font description name and the decomposition of the word to
|
|
// graphemes. Returns false if no suitable font was found.
|
|
static bool SelectFont(const char* utf8_word, const int utf8_len,
|
|
std::string* font_name, std::vector<std::string>* graphemes);
|
|
|
|
// Picks font among all_fonts that covers and can render the given word,
|
|
// and returns the font description name and the decomposition of the word to
|
|
// graphemes. Returns false if no suitable font was found.
|
|
static bool SelectFont(const char* utf8_word, const int utf8_len,
|
|
const std::vector<std::string>& all_fonts,
|
|
std::string* font_name, std::vector<std::string>* graphemes);
|
|
|
|
// Returns a bitmask where the value of true at index 'n' implies that unicode
|
|
// value 'n' is renderable by at least one available font.
|
|
static void GetAllRenderableCharacters(std::vector<bool>* unichar_bitmap);
|
|
// Variant of the above function that inspects only the provided font names.
|
|
static void GetAllRenderableCharacters(const std::vector<std::string>& font_names,
|
|
std::vector<bool>* unichar_bitmap);
|
|
static void GetAllRenderableCharacters(const std::string& font_name,
|
|
std::vector<bool>* unichar_bitmap);
|
|
|
|
// NOTE: The following utilities were written to be backward compatible with
|
|
// StringRender.
|
|
|
|
// BestFonts returns a font name and a bit vector of the characters it
|
|
// can render for the fonts that score within some fraction of the best
|
|
// font on the characters in the given hash map.
|
|
// In the flags vector, each flag is set according to whether the
|
|
// corresponding character (in order of iterating ch_map) can be rendered.
|
|
// The return string is a list of the acceptable fonts that were used.
|
|
static std::string BestFonts(
|
|
const std::unordered_map<char32, int64_t>& ch_map,
|
|
std::vector<std::pair<const char*, std::vector<bool> > >* font_flag);
|
|
|
|
// FontScore returns the weighted renderability score of the given
|
|
// hash map character table in the given font. The unweighted score
|
|
// is also returned in raw_score.
|
|
// The values in the bool vector ch_flags correspond to whether the
|
|
// corresponding character (in order of iterating ch_map) can be rendered.
|
|
static int FontScore(const std::unordered_map<char32, int64_t>& ch_map,
|
|
const std::string& fontname, int* raw_score,
|
|
std::vector<bool>* ch_flags);
|
|
|
|
// PangoFontInfo is reinitialized, so clear the static list of fonts.
|
|
static void ReInit();
|
|
|
|
private:
|
|
static std::vector<std::string> available_fonts_; // cache list
|
|
};
|
|
} // namespace tesseract
|
|
|
|
#endif // TESSERACT_TRAINING_PANGO_FONT_INFO_H_
|