2007-05-16 09:25:41 +08:00
|
|
|
|
///////////////////////////////////////////////////////////////////////
|
|
|
|
|
// File: unicharset.cpp
|
|
|
|
|
// Description: Unicode character/ligature set class.
|
|
|
|
|
// Author: Thomas Kielbus
|
|
|
|
|
//
|
|
|
|
|
// (C) Copyright 2006, Google Inc.
|
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
|
// limitations under the License.
|
|
|
|
|
//
|
|
|
|
|
///////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
2014-08-12 07:23:06 +08:00
|
|
|
|
#include "unicharset.h"
|
|
|
|
|
|
2018-05-20 21:18:07 +08:00
|
|
|
|
#include <algorithm>
|
2018-05-20 05:52:04 +08:00
|
|
|
|
#include <cassert>
|
|
|
|
|
#include <cstdio>
|
|
|
|
|
#include <cstring>
|
2007-05-16 09:25:41 +08:00
|
|
|
|
|
2014-08-12 07:23:06 +08:00
|
|
|
|
#include "params.h"
|
|
|
|
|
#include "serialis.h"
|
2012-02-02 11:14:43 +08:00
|
|
|
|
#include "tesscallback.h"
|
2007-05-16 09:25:41 +08:00
|
|
|
|
#include "unichar.h"
|
|
|
|
|
|
2017-07-15 00:30:14 +08:00
|
|
|
|
// TODO(rays) Move UNICHARSET to tesseract namespace.
|
|
|
|
|
using tesseract::char32;
|
|
|
|
|
using tesseract::UNICHAR;
|
|
|
|
|
|
2012-02-02 11:14:43 +08:00
|
|
|
|
// Delimiter character used in the textual form of character fragments.
static constexpr char kSeparator = '|';
// Marker character used in the textual form of 'natural' character fragments.
static constexpr char kNaturalFlag = 'n';
|
|
|
|
|
|
2007-07-18 09:15:07 +08:00
|
|
|
|
// Single-bit masks, one per character-class property.
static constexpr int ISALPHA_MASK = 0x1;
static constexpr int ISLOWER_MASK = 0x2;
static constexpr int ISUPPER_MASK = 0x4;
static constexpr int ISDIGIT_MASK = 0x8;
static constexpr int ISPUNCTUATION_MASK = 0x10;
|
2012-02-02 11:14:43 +08:00
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
|
// Y coordinate that separates cap-height tops from x-height tops.
// TODO(rays) Bring the global definition down to the ccutil library level,
// so this constant is relative to some other constants.
static constexpr int kMeanlineThreshold = 220;
// Definition of "has x-height":
//   C = number of alpha chars whose tops all exceed kMeanlineThreshold,
//   X = number of alpha chars whose tops are all below kMeanlineThreshold.
// The unicharset "has x-height" if
//   X > C * kMinXHeightFraction and C > X * kMinCapHeightFraction,
// or if more than half the alpha characters have upper or lower case.
const double kMinXHeightFraction = 0.25;
const double kMinCapHeightFraction = 0.05;
|
|
|
|
|
|
|
|
|
|
/*static */
|
|
|
|
|
const char* UNICHARSET::kCustomLigatures[][2] = {
|
|
|
|
|
{"ct", "\uE003"}, // c + t -> U+E003
|
|
|
|
|
{"ſh", "\uE006"}, // long-s + h -> U+E006
|
|
|
|
|
{"ſi", "\uE007"}, // long-s + i -> U+E007
|
|
|
|
|
{"ſl", "\uE008"}, // long-s + l -> U+E008
|
|
|
|
|
{"ſſ", "\uE009"}, // long-s + long-s -> U+E009
|
2016-12-13 00:20:28 +08:00
|
|
|
|
{nullptr, nullptr}
|
2012-02-02 11:14:43 +08:00
|
|
|
|
};
|
2010-11-24 02:34:14 +08:00
|
|
|
|
|
2017-07-25 02:45:57 +08:00
|
|
|
|
// List of mappings to make when ingesting strings from the outside.
|
|
|
|
|
// The substitutions clean up text that should exist for rendering of
|
|
|
|
|
// synthetic data, but not in the recognition set.
|
|
|
|
|
const char* UNICHARSET::kCleanupMaps[][2] = {
|
|
|
|
|
{"\u0640", ""}, // TATWEEL is deleted.
|
|
|
|
|
{"\ufb01", "fi"}, // fi ligature->fi pair.
|
|
|
|
|
{"\ufb02", "fl"}, // fl ligature->fl pair.
|
|
|
|
|
{nullptr, nullptr}};
|
|
|
|
|
|
2013-09-23 23:16:01 +08:00
|
|
|
|
// List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
|
|
|
|
|
const char* UNICHARSET::kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT] = {
|
|
|
|
|
" ",
|
|
|
|
|
"Joined",
|
|
|
|
|
"|Broken|0|1"
|
|
|
|
|
};
|
|
|
|
|
|
2018-04-23 14:32:23 +08:00
|
|
|
|
// Name of the placeholder script. reserve() gives the id of this script to
// every newly allocated slot that is not yet in use.
const char* UNICHARSET::null_script = "NULL";
|
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
|
// Constructs a property record with every field at its Init() default.
UNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() { Init(); }
|
2012-02-02 11:14:43 +08:00
|
|
|
|
|
|
|
|
|
// Initialize all properties to sensible default values.
|
2010-11-24 02:34:14 +08:00
|
|
|
|
void UNICHARSET::UNICHAR_PROPERTIES::Init() {
|
|
|
|
|
isalpha = false;
|
|
|
|
|
islower = false;
|
|
|
|
|
isupper = false;
|
|
|
|
|
isdigit = false;
|
|
|
|
|
ispunctuation = false;
|
|
|
|
|
isngram = false;
|
|
|
|
|
enabled = false;
|
2012-02-02 11:14:43 +08:00
|
|
|
|
SetRangesOpen();
|
|
|
|
|
script_id = 0;
|
|
|
|
|
other_case = 0;
|
|
|
|
|
mirror = 0;
|
|
|
|
|
normed = "";
|
|
|
|
|
direction = UNICHARSET::U_LEFT_TO_RIGHT;
|
2016-12-13 00:20:28 +08:00
|
|
|
|
fragment = nullptr;
|
2012-02-02 11:14:43 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Sets all ranges wide open. Initialization default in case there are
|
|
|
|
|
// no useful values available.
|
|
|
|
|
void UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() {
|
2010-11-24 02:34:14 +08:00
|
|
|
|
min_bottom = 0;
|
Use POSIX data types and macros (#878)
* api: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccmain: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccstruct: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* classify: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* cutil: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* dict: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* textord: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* training: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* wordrec: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccutil: Replace Tesseract data types by POSIX data types
Now all Tesseract data types which are no longer needed can be removed
from ccutil/host.h.
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccmain: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccstruct: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* classify: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* dict: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* lstm: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* textord: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* wordrec: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccutil: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Remove the macros which are now unused from ccutil/host.h.
Remove also the obsolete history comments.
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* Fix build error caused by ambiguous ClipToRange
Error message vom Appveyor CI:
C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2672: 'ClipToRange': no matching overloaded function found [C:\projects\tesseract\build\libtesseract.vcxproj]
C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2782: 'T ClipToRange(const T &,const T &,const T &)': template parameter 'T' is ambiguous [C:\projects\tesseract\build\libtesseract.vcxproj]
c:\projects\tesseract\ccutil\helpers.h(122): note: see declaration of 'ClipToRange'
C:\projects\tesseract\ccstruct\coutln.cpp(818): note: could be 'char'
C:\projects\tesseract\ccstruct\coutln.cpp(818): note: or 'int'
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* unittest: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* arch: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
2018-03-14 04:36:30 +08:00
|
|
|
|
max_bottom = UINT8_MAX;
|
2010-11-24 02:34:14 +08:00
|
|
|
|
min_top = 0;
|
Use POSIX data types and macros (#878)
* api: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccmain: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccstruct: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* classify: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* cutil: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* dict: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* textord: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* training: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* wordrec: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccutil: Replace Tesseract data types by POSIX data types
Now all Tesseract data types which are no longer needed can be removed
from ccutil/host.h.
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccmain: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccstruct: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* classify: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* dict: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* lstm: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* textord: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* wordrec: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccutil: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Remove the macros which are now unused from ccutil/host.h.
Remove also the obsolete history comments.
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* Fix build error caused by ambiguous ClipToRange
Error message vom Appveyor CI:
C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2672: 'ClipToRange': no matching overloaded function found [C:\projects\tesseract\build\libtesseract.vcxproj]
C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2782: 'T ClipToRange(const T &,const T &,const T &)': template parameter 'T' is ambiguous [C:\projects\tesseract\build\libtesseract.vcxproj]
c:\projects\tesseract\ccutil\helpers.h(122): note: see declaration of 'ClipToRange'
C:\projects\tesseract\ccstruct\coutln.cpp(818): note: could be 'char'
C:\projects\tesseract\ccstruct\coutln.cpp(818): note: or 'int'
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* unittest: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* arch: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
2018-03-14 04:36:30 +08:00
|
|
|
|
max_top = UINT8_MAX;
|
2015-07-10 05:28:20 +08:00
|
|
|
|
width = 0.0f;
|
|
|
|
|
width_sd = 0.0f;
|
|
|
|
|
bearing = 0.0f;
|
|
|
|
|
bearing_sd = 0.0f;
|
|
|
|
|
advance = 0.0f;
|
|
|
|
|
advance_sd = 0.0f;
|
2012-02-02 11:14:43 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Sets all ranges to empty. Used before expanding with font-based data.
|
|
|
|
|
void UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() {
|
Use POSIX data types and macros (#878)
* api: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccmain: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccstruct: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* classify: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* cutil: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* dict: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* textord: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* training: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* wordrec: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccutil: Replace Tesseract data types by POSIX data types
Now all Tesseract data types which are no longer needed can be removed
from ccutil/host.h.
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccmain: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccstruct: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* classify: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* dict: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* lstm: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* textord: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* wordrec: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccutil: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Remove the macros which are now unused from ccutil/host.h.
Remove also the obsolete history comments.
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* Fix build error caused by ambiguous ClipToRange
Error message vom Appveyor CI:
C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2672: 'ClipToRange': no matching overloaded function found [C:\projects\tesseract\build\libtesseract.vcxproj]
C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2782: 'T ClipToRange(const T &,const T &,const T &)': template parameter 'T' is ambiguous [C:\projects\tesseract\build\libtesseract.vcxproj]
c:\projects\tesseract\ccutil\helpers.h(122): note: see declaration of 'ClipToRange'
C:\projects\tesseract\ccstruct\coutln.cpp(818): note: could be 'char'
C:\projects\tesseract\ccstruct\coutln.cpp(818): note: or 'int'
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* unittest: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* arch: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
2018-03-14 04:36:30 +08:00
|
|
|
|
min_bottom = UINT8_MAX;
|
2012-02-02 11:14:43 +08:00
|
|
|
|
max_bottom = 0;
|
Use POSIX data types and macros (#878)
* api: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccmain: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccstruct: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* classify: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* cutil: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* dict: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* textord: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* training: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* wordrec: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccutil: Replace Tesseract data types by POSIX data types
Now all Tesseract data types which are no longer needed can be removed
from ccutil/host.h.
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccmain: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccstruct: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* classify: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* dict: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* lstm: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* textord: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* wordrec: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccutil: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Remove the macros which are now unused from ccutil/host.h.
Remove also the obsolete history comments.
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* Fix build error caused by ambiguous ClipToRange
Error message vom Appveyor CI:
C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2672: 'ClipToRange': no matching overloaded function found [C:\projects\tesseract\build\libtesseract.vcxproj]
C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2782: 'T ClipToRange(const T &,const T &,const T &)': template parameter 'T' is ambiguous [C:\projects\tesseract\build\libtesseract.vcxproj]
c:\projects\tesseract\ccutil\helpers.h(122): note: see declaration of 'ClipToRange'
C:\projects\tesseract\ccstruct\coutln.cpp(818): note: could be 'char'
C:\projects\tesseract\ccstruct\coutln.cpp(818): note: or 'int'
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* unittest: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* arch: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
2018-03-14 04:36:30 +08:00
|
|
|
|
min_top = UINT8_MAX;
|
2012-02-02 11:14:43 +08:00
|
|
|
|
max_top = 0;
|
2015-07-10 05:28:20 +08:00
|
|
|
|
width = 0.0f;
|
|
|
|
|
width_sd = 0.0f;
|
|
|
|
|
bearing = 0.0f;
|
|
|
|
|
bearing_sd = 0.0f;
|
|
|
|
|
advance = 0.0f;
|
|
|
|
|
advance_sd = 0.0f;
|
2012-02-02 11:14:43 +08:00
|
|
|
|
}
|
|
|
|
|
|
2015-07-10 05:28:20 +08:00
|
|
|
|
// Returns true if any of the top/bottom/width/bearing/advance ranges/stats
|
2018-05-28 00:40:13 +08:00
|
|
|
|
// is empty.
|
2012-02-02 11:14:43 +08:00
|
|
|
|
bool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty() const {
|
2015-07-10 05:28:20 +08:00
|
|
|
|
return width == 0.0f || advance == 0.0f;
|
2012-02-02 11:14:43 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Expands the ranges with the ranges from the src properties.
|
|
|
|
|
void UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom(
|
|
|
|
|
const UNICHAR_PROPERTIES& src) {
|
|
|
|
|
UpdateRange(src.min_bottom, &min_bottom, &max_bottom);
|
|
|
|
|
UpdateRange(src.max_bottom, &min_bottom, &max_bottom);
|
|
|
|
|
UpdateRange(src.min_top, &min_top, &max_top);
|
|
|
|
|
UpdateRange(src.max_top, &min_top, &max_top);
|
2015-07-10 05:28:20 +08:00
|
|
|
|
if (src.width_sd > width_sd) {
|
|
|
|
|
width = src.width;
|
|
|
|
|
width_sd = src.width_sd;
|
|
|
|
|
}
|
|
|
|
|
if (src.bearing_sd > bearing_sd) {
|
|
|
|
|
bearing = src.bearing;
|
|
|
|
|
bearing_sd = src.bearing_sd;
|
|
|
|
|
}
|
|
|
|
|
if (src.advance_sd > advance_sd) {
|
|
|
|
|
advance = src.advance;
|
|
|
|
|
advance_sd = src.advance_sd;
|
|
|
|
|
}
|
2012-02-02 11:14:43 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Copies the properties from src into this.
|
|
|
|
|
void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES& src) {
|
|
|
|
|
// Apart from the fragment, everything else can be done with a default copy.
|
|
|
|
|
CHAR_FRAGMENT* saved_fragment = fragment;
|
|
|
|
|
*this = src; // Bitwise copy.
|
|
|
|
|
fragment = saved_fragment;
|
2010-11-24 02:34:14 +08:00
|
|
|
|
}
|
2007-07-18 09:15:07 +08:00
|
|
|
|
|
2007-05-16 09:25:41 +08:00
|
|
|
|
// Builds an empty unicharset and then inserts the special unichar codes in
// enum order. The UNICHAR_JOINED entry is additionally flagged as an ngram.
UNICHARSET::UNICHARSET()
    : unichars(nullptr),
      ids(),
      size_used(0),
      size_reserved(0),
      script_table(nullptr),
      script_table_size_used(0) {
  clear();
  for (int code = 0; code < SPECIAL_UNICHAR_CODES_COUNT; ++code) {
    unichar_insert(kSpecialUnicharCodes[code]);
    if (code != UNICHAR_JOINED) continue;
    set_isngram(code, true);
  }
}
|
2007-05-16 09:25:41 +08:00
|
|
|
|
|
|
|
|
|
// Destroys the unicharset; all cleanup is delegated to clear().
UNICHARSET::~UNICHARSET() { clear(); }
|
|
|
|
|
|
|
|
|
|
void UNICHARSET::reserve(int unichars_number) {
|
2008-02-01 08:21:49 +08:00
|
|
|
|
if (unichars_number > size_reserved) {
|
2019-03-26 14:55:08 +08:00
|
|
|
|
auto* unichars_new = new UNICHAR_SLOT[unichars_number];
|
2007-05-16 09:25:41 +08:00
|
|
|
|
for (int i = 0; i < size_used; ++i)
|
2012-02-02 11:14:43 +08:00
|
|
|
|
unichars_new[i] = unichars[i];
|
2009-07-11 10:50:24 +08:00
|
|
|
|
for (int j = size_used; j < unichars_number; ++j) {
|
|
|
|
|
unichars_new[j].properties.script_id = add_script(null_script);
|
|
|
|
|
}
|
2007-05-16 09:25:41 +08:00
|
|
|
|
delete[] unichars;
|
|
|
|
|
unichars = unichars_new;
|
|
|
|
|
size_reserved = unichars_number;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2015-11-05 05:34:22 +08:00
|
|
|
|
// Returns the id of the nul-terminated unichar_repr, or INVALID_UNICHAR_ID
// if it is not in the set. Unless old_style_included_ is set, the string is
// passed through CleanupString before the lookup.
UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
  std::string key;
  if (old_style_included_) {
    key = unichar_repr;
  } else {
    key = CleanupString(unichar_repr);
  }
  if (!ids.contains(key.data(), key.size())) return INVALID_UNICHAR_ID;
  return ids.unichar_to_id(key.data(), key.size());
}
|
|
|
|
|
|
2015-11-05 05:34:22 +08:00
|
|
|
|
// Returns the id of the first length bytes of unichar_repr, or
// INVALID_UNICHAR_ID if they are not in the set. length must lie in
// [1, UNICHAR_LEN]. Unless old_style_included_ is set, the bytes are passed
// through CleanupString before the lookup.
UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
                                     int length) const {
  assert(length > 0 && length <= UNICHAR_LEN);
  std::string key;
  if (old_style_included_) {
    key.assign(unichar_repr, length);
  } else {
    key = CleanupString(unichar_repr, length);
  }
  if (!ids.contains(key.data(), key.size())) return INVALID_UNICHAR_ID;
  return ids.unichar_to_id(key.data(), key.size());
}
|
|
|
|
|
|
2008-02-01 08:21:49 +08:00
|
|
|
|
// Return the minimum number of bytes that matches a legal UNICHAR_ID,
|
2013-09-23 23:16:01 +08:00
|
|
|
|
// while leaving the rest of the string encodable. Returns 0 if the
|
|
|
|
|
// beginning of the string is not encodable.
|
|
|
|
|
// WARNING: this function now encodes the whole string for precision.
|
|
|
|
|
// Use encode_string in preference to repeatedly calling step.
|
2008-02-01 08:21:49 +08:00
|
|
|
|
int UNICHARSET::step(const char* str) const {
|
2013-09-23 23:16:01 +08:00
|
|
|
|
GenericVector<UNICHAR_ID> encoding;
|
|
|
|
|
GenericVector<char> lengths;
|
2016-12-13 00:20:28 +08:00
|
|
|
|
encode_string(str, true, &encoding, &lengths, nullptr);
|
2013-09-23 23:16:01 +08:00
|
|
|
|
if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0;
|
|
|
|
|
return lengths[0];
|
|
|
|
|
}
|
2008-02-01 08:21:49 +08:00
|
|
|
|
|
2012-02-02 11:14:43 +08:00
|
|
|
|
// Return whether the given UTF-8 string is encodable with this UNICHARSET.
|
|
|
|
|
// If not encodable, write the first byte offset which cannot be converted
|
|
|
|
|
// into the second (return) argument.
|
|
|
|
|
bool UNICHARSET::encodable_string(const char *str,
|
|
|
|
|
int *first_bad_position) const {
|
2013-09-23 23:16:01 +08:00
|
|
|
|
GenericVector<UNICHAR_ID> encoding;
|
2016-12-13 00:20:28 +08:00
|
|
|
|
return encode_string(str, true, &encoding, nullptr, first_bad_position);
|
2013-09-23 23:16:01 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Encodes the given UTF-8 string with this UNICHARSET.
|
|
|
|
|
// Returns true if the encoding succeeds completely, false if there is at
|
|
|
|
|
// least one INVALID_UNICHAR_ID in the returned encoding, but in this case
|
|
|
|
|
// the rest of the string is still encoded.
|
2016-12-13 00:20:28 +08:00
|
|
|
|
// If lengths is not nullptr, then it is filled with the corresponding
|
2013-09-23 23:16:01 +08:00
|
|
|
|
// byte length of each encoded UNICHAR_ID.
|
2017-07-25 02:45:57 +08:00
|
|
|
|
// WARNING: Caller must guarantee that str has already been cleaned of codes
|
|
|
|
|
// that do not belong in the unicharset, or encoding may fail.
|
|
|
|
|
// Use CleanupString to perform the cleaning.
|
2013-09-23 23:16:01 +08:00
|
|
|
|
bool UNICHARSET::encode_string(const char* str, bool give_up_on_failure,
|
|
|
|
|
GenericVector<UNICHAR_ID>* encoding,
|
|
|
|
|
GenericVector<char>* lengths,
|
|
|
|
|
int* encoded_length) const {
|
|
|
|
|
GenericVector<UNICHAR_ID> working_encoding;
|
|
|
|
|
GenericVector<char> working_lengths;
|
|
|
|
|
GenericVector<char> best_lengths;
|
|
|
|
|
encoding->truncate(0); // Just in case str is empty.
|
|
|
|
|
int str_length = strlen(str);
|
|
|
|
|
int str_pos = 0;
|
|
|
|
|
bool perfect = true;
|
|
|
|
|
while (str_pos < str_length) {
|
|
|
|
|
encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
|
|
|
|
|
&str_pos, encoding, &best_lengths);
|
|
|
|
|
if (str_pos < str_length) {
|
|
|
|
|
// This is a non-match. Skip one utf-8 character.
|
|
|
|
|
perfect = false;
|
|
|
|
|
if (give_up_on_failure) break;
|
|
|
|
|
int step = UNICHAR::utf8_step(str + str_pos);
|
|
|
|
|
if (step == 0) step = 1;
|
|
|
|
|
encoding->push_back(INVALID_UNICHAR_ID);
|
|
|
|
|
best_lengths.push_back(step);
|
|
|
|
|
str_pos += step;
|
|
|
|
|
working_encoding = *encoding;
|
|
|
|
|
working_lengths = best_lengths;
|
2012-02-02 11:14:43 +08:00
|
|
|
|
}
|
|
|
|
|
}
|
2016-12-13 00:20:28 +08:00
|
|
|
|
if (lengths != nullptr) *lengths = best_lengths;
|
|
|
|
|
if (encoded_length != nullptr) *encoded_length = str_pos;
|
2013-09-23 23:16:01 +08:00
|
|
|
|
return perfect;
|
2012-02-02 11:14:43 +08:00
|
|
|
|
}
|
|
|
|
|
|
2015-11-05 05:34:22 +08:00
|
|
|
|
// Returns the stored representation of id, or INVALID_UNICHAR when id is
// INVALID_UNICHAR_ID. Any other id must be below size().
const char* UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
  if (id != INVALID_UNICHAR_ID) {
    ASSERT_HOST(id < this->size());
    return unichars[id].representation;
  }
  return INVALID_UNICHAR;
}
|
|
|
|
|
|
2015-11-05 05:34:22 +08:00
|
|
|
|
// Like id_to_unichar, except that a private-use encoding listed in
// kCustomLigatures is returned as the plain text of the ligature.
// Returns INVALID_UNICHAR when id is INVALID_UNICHAR_ID. Any other id must
// be below size().
const char* UNICHARSET::id_to_unichar_ext(UNICHAR_ID id) const {
  if (id == INVALID_UNICHAR_ID) return INVALID_UNICHAR;
  ASSERT_HOST(id < this->size());

  const char* const stored = unichars[id].representation;
  // Only private encodings can have an entry in the ligature table.
  if (!get_isprivate(id)) return stored;

  for (int row = 0; kCustomLigatures[row][0] != nullptr; ++row) {
    if (strcmp(stored, kCustomLigatures[row][1]) == 0) {
      return kCustomLigatures[row][0];
    }
  }
  // Private code without a table entry: return it as stored.
  return stored;
}
|
|
|
|
|
|
2009-07-11 10:50:24 +08:00
|
|
|
|
// Return a STRING that reformats the utf8 str into the str followed
|
|
|
|
|
// by its hex unicodes.
|
|
|
|
|
STRING UNICHARSET::debug_utf8_str(const char* str) {
|
2008-04-22 08:23:41 +08:00
|
|
|
|
STRING result = str;
|
|
|
|
|
result += " [";
|
|
|
|
|
int step = 1;
|
|
|
|
|
// Chop into unicodes and code each as hex.
|
|
|
|
|
for (int i = 0; str[i] != '\0'; i += step) {
|
|
|
|
|
char hex[sizeof(int) * 2 + 1];
|
|
|
|
|
step = UNICHAR::utf8_step(str + i);
|
|
|
|
|
if (step == 0) {
|
|
|
|
|
step = 1;
|
|
|
|
|
sprintf(hex, "%x", str[i]);
|
|
|
|
|
} else {
|
|
|
|
|
UNICHAR ch(str + i, step);
|
|
|
|
|
sprintf(hex, "%x", ch.first_uni());
|
|
|
|
|
}
|
|
|
|
|
result += hex;
|
|
|
|
|
result += " ";
|
|
|
|
|
}
|
|
|
|
|
result += "]";
|
2009-07-11 10:50:24 +08:00
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Return a STRING containing debug information on the unichar, including
|
|
|
|
|
// the id_to_unichar, its hex unicodes and the properties.
|
|
|
|
|
STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
|
2010-11-24 02:34:14 +08:00
|
|
|
|
if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id));
|
2009-07-11 10:50:24 +08:00
|
|
|
|
const CHAR_FRAGMENT *fragment = this->get_fragment(id);
|
|
|
|
|
if (fragment) {
|
2012-02-02 11:14:43 +08:00
|
|
|
|
return fragment->to_string();
|
2009-07-11 10:50:24 +08:00
|
|
|
|
}
|
|
|
|
|
const char* str = id_to_unichar(id);
|
|
|
|
|
STRING result = debug_utf8_str(str);
|
2008-04-22 08:23:41 +08:00
|
|
|
|
// Append a for lower alpha, A for upper alpha, and x if alpha but neither.
|
|
|
|
|
if (get_isalpha(id)) {
|
|
|
|
|
if (get_islower(id))
|
|
|
|
|
result += "a";
|
|
|
|
|
else if (get_isupper(id))
|
|
|
|
|
result += "A";
|
|
|
|
|
else
|
|
|
|
|
result += "x";
|
|
|
|
|
}
|
|
|
|
|
// Append 0 if a digit.
|
|
|
|
|
if (get_isdigit(id)) {
|
|
|
|
|
result += "0";
|
|
|
|
|
}
|
2009-07-11 10:50:24 +08:00
|
|
|
|
// Append p is a punctuation symbol.
|
|
|
|
|
if (get_ispunctuation(id)) {
|
|
|
|
|
result += "p";
|
|
|
|
|
}
|
2008-04-22 08:23:41 +08:00
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
2013-09-23 23:16:01 +08:00
|
|
|
|
// Sets the normed_ids vector from the normed string. normed_ids is not
|
|
|
|
|
// stored in the file, and needs to be set when the UNICHARSET is loaded.
|
|
|
|
|
void UNICHARSET::set_normed_ids(UNICHAR_ID unichar_id) {
|
|
|
|
|
unichars[unichar_id].properties.normed_ids.truncate(0);
|
2015-07-10 05:50:25 +08:00
|
|
|
|
if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
|
|
|
|
|
unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
|
|
|
|
|
} else if (!encode_string(unichars[unichar_id].properties.normed.string(),
|
|
|
|
|
true, &unichars[unichar_id].properties.normed_ids,
|
2016-12-13 00:20:28 +08:00
|
|
|
|
nullptr, nullptr)) {
|
2015-07-10 05:50:25 +08:00
|
|
|
|
unichars[unichar_id].properties.normed_ids.truncate(0);
|
|
|
|
|
unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
|
2013-09-23 23:16:01 +08:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2012-02-02 11:14:43 +08:00
|
|
|
|
// Returns whether the unichar id represents a unicode value in the private use
|
|
|
|
|
// area. We use this range only internally to represent uncommon ligatures
|
|
|
|
|
// (eg. 'ct') that do not have regular unicode values.
|
|
|
|
|
bool UNICHARSET::get_isprivate(UNICHAR_ID unichar_id) const {
|
|
|
|
|
UNICHAR uc(id_to_unichar(unichar_id), -1);
|
|
|
|
|
int uni = uc.first_uni();
|
|
|
|
|
return (uni >= 0xE000 && uni <= 0xF8FF);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Sets all ranges to empty, so they can be expanded to set the values.
|
|
|
|
|
void UNICHARSET::set_ranges_empty() {
|
|
|
|
|
for (int id = 0; id < size_used; ++id) {
|
|
|
|
|
unichars[id].properties.SetRangesEmpty();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Sets all the properties for this unicharset given a src unicharset with
|
|
|
|
|
// everything set. The unicharsets don't have to be the same, and graphemes
|
|
|
|
|
// are correctly accounted for.
|
2013-09-23 23:16:01 +08:00
|
|
|
|
void UNICHARSET::PartialSetPropertiesFromOther(int start_index,
|
|
|
|
|
const UNICHARSET& src) {
|
|
|
|
|
for (int ch = start_index; ch < size_used; ++ch) {
|
2012-02-02 11:14:43 +08:00
|
|
|
|
const char* utf8 = id_to_unichar(ch);
|
|
|
|
|
UNICHAR_PROPERTIES properties;
|
|
|
|
|
if (src.GetStrProperties(utf8, &properties)) {
|
|
|
|
|
// Setup the script_id, other_case, and mirror properly.
|
|
|
|
|
const char* script = src.get_script_from_script_id(properties.script_id);
|
|
|
|
|
properties.script_id = add_script(script);
|
|
|
|
|
const char* other_case = src.id_to_unichar(properties.other_case);
|
|
|
|
|
if (contains_unichar(other_case)) {
|
|
|
|
|
properties.other_case = unichar_to_id(other_case);
|
|
|
|
|
} else {
|
|
|
|
|
properties.other_case = ch;
|
|
|
|
|
}
|
|
|
|
|
const char* mirror_str = src.id_to_unichar(properties.mirror);
|
|
|
|
|
if (contains_unichar(mirror_str)) {
|
|
|
|
|
properties.mirror = unichar_to_id(mirror_str);
|
|
|
|
|
} else {
|
|
|
|
|
properties.mirror = ch;
|
|
|
|
|
}
|
|
|
|
|
unichars[ch].properties.CopyFrom(properties);
|
2013-09-23 23:16:01 +08:00
|
|
|
|
set_normed_ids(ch);
|
2012-02-02 11:14:43 +08:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Expands the tops and bottoms and widths for this unicharset given a
|
|
|
|
|
// src unicharset with ranges in it. The unicharsets don't have to be the
|
|
|
|
|
// same, and graphemes are correctly accounted for.
|
|
|
|
|
void UNICHARSET::ExpandRangesFromOther(const UNICHARSET& src) {
|
|
|
|
|
for (int ch = 0; ch < size_used; ++ch) {
|
|
|
|
|
const char* utf8 = id_to_unichar(ch);
|
|
|
|
|
UNICHAR_PROPERTIES properties;
|
|
|
|
|
if (src.GetStrProperties(utf8, &properties)) {
|
|
|
|
|
// Expand just the ranges from properties.
|
|
|
|
|
unichars[ch].properties.ExpandRangesFrom(properties);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2014-08-12 07:23:06 +08:00
|
|
|
|
// Makes this a copy of src. Clears this completely first, so the automatic
|
|
|
|
|
// ids will not be present in this if not in src. Does NOT reorder the set!
|
2013-09-23 23:16:01 +08:00
|
|
|
|
void UNICHARSET::CopyFrom(const UNICHARSET& src) {
|
|
|
|
|
clear();
|
2014-08-12 07:23:06 +08:00
|
|
|
|
for (int ch = 0; ch < src.size_used; ++ch) {
|
|
|
|
|
const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
|
|
|
|
|
const char* utf8 = src.id_to_unichar(ch);
|
2017-07-25 02:45:57 +08:00
|
|
|
|
unichar_insert_backwards_compatible(utf8);
|
2014-08-12 07:23:06 +08:00
|
|
|
|
unichars[ch].properties.ExpandRangesFrom(src_props);
|
|
|
|
|
}
|
|
|
|
|
// Set properties, including mirror and other_case, WITHOUT reordering
|
|
|
|
|
// the unicharset.
|
|
|
|
|
PartialSetPropertiesFromOther(0, src);
|
2013-09-23 23:16:01 +08:00
|
|
|
|
}
|
|
|
|
|
|
2012-02-02 11:14:43 +08:00
|
|
|
|
// For each id in src, if it does not occur in this, add it, as in
|
|
|
|
|
// SetPropertiesFromOther, otherwise expand the ranges, as in
|
|
|
|
|
// ExpandRangesFromOther.
|
|
|
|
|
void UNICHARSET::AppendOtherUnicharset(const UNICHARSET& src) {
|
2013-09-23 23:16:01 +08:00
|
|
|
|
int initial_used = size_used;
|
2012-02-02 11:14:43 +08:00
|
|
|
|
for (int ch = 0; ch < src.size_used; ++ch) {
|
|
|
|
|
const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
|
|
|
|
|
const char* utf8 = src.id_to_unichar(ch);
|
|
|
|
|
int id = size_used;
|
|
|
|
|
if (contains_unichar(utf8)) {
|
|
|
|
|
id = unichar_to_id(utf8);
|
|
|
|
|
// Just expand current ranges.
|
|
|
|
|
unichars[id].properties.ExpandRangesFrom(src_props);
|
|
|
|
|
} else {
|
2017-07-25 02:45:57 +08:00
|
|
|
|
unichar_insert_backwards_compatible(utf8);
|
2013-09-23 23:16:01 +08:00
|
|
|
|
unichars[id].properties.SetRangesEmpty();
|
2012-02-02 11:14:43 +08:00
|
|
|
|
}
|
|
|
|
|
}
|
2013-09-23 23:16:01 +08:00
|
|
|
|
// Set properties, including mirror and other_case, WITHOUT reordering
|
|
|
|
|
// the unicharset.
|
|
|
|
|
PartialSetPropertiesFromOther(initial_used, src);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Returns true if the acceptable ranges of the tops of the characters do
|
|
|
|
|
// not overlap, making their x-height calculations distinct.
|
|
|
|
|
bool UNICHARSET::SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const {
|
2018-05-20 21:18:07 +08:00
|
|
|
|
int overlap = std::min(unichars[id1].properties.max_top,
|
2013-09-23 23:16:01 +08:00
|
|
|
|
unichars[id2].properties.max_top) -
|
2018-05-20 21:18:07 +08:00
|
|
|
|
std::max(unichars[id1].properties.min_top,
|
2013-09-23 23:16:01 +08:00
|
|
|
|
unichars[id2].properties.min_top);
|
|
|
|
|
return overlap <= 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Internal recursive version of encode_string above.
|
|
|
|
|
// Seeks to encode the given string as a sequence of UNICHAR_IDs such that
|
|
|
|
|
// each UNICHAR_ID uses the least possible part of the utf8 str.
|
|
|
|
|
// It does this by depth-first tail recursion on increasing length matches
|
|
|
|
|
// to the UNICHARSET, saving the first encountered result that encodes the
|
|
|
|
|
// maximum total length of str. It stops on a failure to encode to make
|
|
|
|
|
// the overall process of encoding a partially failed string more efficient.
|
|
|
|
|
// See unicharset.h for definition of the args.
|
|
|
|
|
void UNICHARSET::encode_string(const char* str, int str_index, int str_length,
|
|
|
|
|
GenericVector<UNICHAR_ID>* encoding,
|
|
|
|
|
GenericVector<char>* lengths,
|
|
|
|
|
int* best_total_length,
|
|
|
|
|
GenericVector<UNICHAR_ID>* best_encoding,
|
|
|
|
|
GenericVector<char>* best_lengths) const {
|
|
|
|
|
if (str_index > *best_total_length) {
|
|
|
|
|
// This is the best result so far.
|
|
|
|
|
*best_total_length = str_index;
|
|
|
|
|
*best_encoding = *encoding;
|
2016-12-13 00:20:28 +08:00
|
|
|
|
if (best_lengths != nullptr)
|
2013-09-23 23:16:01 +08:00
|
|
|
|
*best_lengths = *lengths;
|
|
|
|
|
}
|
|
|
|
|
if (str_index == str_length) return;
|
|
|
|
|
int encoding_index = encoding->size();
|
|
|
|
|
// Find the length of the first matching unicharset member.
|
|
|
|
|
int length = ids.minmatch(str + str_index);
|
|
|
|
|
if (length == 0 || str_index + length > str_length) return;
|
|
|
|
|
do {
|
|
|
|
|
if (ids.contains(str + str_index, length)) {
|
|
|
|
|
// Successful encoding so far.
|
|
|
|
|
UNICHAR_ID id = ids.unichar_to_id(str + str_index, length);
|
|
|
|
|
encoding->push_back(id);
|
|
|
|
|
lengths->push_back(length);
|
|
|
|
|
encode_string(str, str_index + length, str_length, encoding, lengths,
|
|
|
|
|
best_total_length, best_encoding, best_lengths);
|
|
|
|
|
if (*best_total_length == str_length)
|
|
|
|
|
return; // Tail recursion success!
|
|
|
|
|
// Failed with that length, truncate back and try again.
|
|
|
|
|
encoding->truncate(encoding_index);
|
|
|
|
|
lengths->truncate(encoding_index);
|
|
|
|
|
}
|
|
|
|
|
int step = UNICHAR::utf8_step(str + str_index + length);
|
|
|
|
|
if (step == 0) step = 1;
|
|
|
|
|
length += step;
|
|
|
|
|
} while (length <= UNICHAR_LEN && str_index + length <= str_length);
|
2012-02-02 11:14:43 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Gets the properties for a grapheme string, combining properties for
|
|
|
|
|
// multiple characters in a meaningful way where possible.
|
|
|
|
|
// Returns false if no valid match was found in the unicharset.
|
|
|
|
|
// NOTE that script_id, mirror, and other_case refer to this unicharset on
|
|
|
|
|
// return and will need translation if the target unicharset is different.
|
|
|
|
|
bool UNICHARSET::GetStrProperties(const char* utf8_str,
|
|
|
|
|
UNICHAR_PROPERTIES* props) const {
|
|
|
|
|
props->Init();
|
|
|
|
|
props->SetRangesEmpty();
|
|
|
|
|
int total_unicodes = 0;
|
2013-09-23 23:16:01 +08:00
|
|
|
|
GenericVector<UNICHAR_ID> encoding;
|
2016-12-13 00:20:28 +08:00
|
|
|
|
if (!encode_string(utf8_str, true, &encoding, nullptr, nullptr))
|
2013-09-23 23:16:01 +08:00
|
|
|
|
return false; // Some part was invalid.
|
|
|
|
|
for (int i = 0; i < encoding.size(); ++i) {
|
|
|
|
|
int id = encoding[i];
|
2012-02-02 11:14:43 +08:00
|
|
|
|
const UNICHAR_PROPERTIES& src_props = unichars[id].properties;
|
|
|
|
|
// Logical OR all the bools.
|
|
|
|
|
if (src_props.isalpha) props->isalpha = true;
|
|
|
|
|
if (src_props.islower) props->islower = true;
|
|
|
|
|
if (src_props.isupper) props->isupper = true;
|
|
|
|
|
if (src_props.isdigit) props->isdigit = true;
|
|
|
|
|
if (src_props.ispunctuation) props->ispunctuation = true;
|
|
|
|
|
if (src_props.isngram) props->isngram = true;
|
|
|
|
|
if (src_props.enabled) props->enabled = true;
|
|
|
|
|
// Min/max the tops/bottoms.
|
|
|
|
|
UpdateRange(src_props.min_bottom, &props->min_bottom, &props->max_bottom);
|
|
|
|
|
UpdateRange(src_props.max_bottom, &props->min_bottom, &props->max_bottom);
|
|
|
|
|
UpdateRange(src_props.min_top, &props->min_top, &props->max_top);
|
|
|
|
|
UpdateRange(src_props.max_top, &props->min_top, &props->max_top);
|
2015-07-10 05:28:20 +08:00
|
|
|
|
float bearing = props->advance + src_props.bearing;
|
|
|
|
|
if (total_unicodes == 0 || bearing < props->bearing) {
|
|
|
|
|
props->bearing = bearing;
|
|
|
|
|
props->bearing_sd = props->advance_sd + src_props.bearing_sd;
|
|
|
|
|
}
|
|
|
|
|
props->advance += src_props.advance;
|
|
|
|
|
props->advance_sd += src_props.advance_sd;
|
2012-02-02 11:14:43 +08:00
|
|
|
|
// With a single width, just use the widths stored in the unicharset.
|
2015-07-10 05:28:20 +08:00
|
|
|
|
props->width = src_props.width;
|
|
|
|
|
props->width_sd = src_props.width_sd;
|
2012-02-02 11:14:43 +08:00
|
|
|
|
// Use the first script id, other_case, mirror, direction.
|
|
|
|
|
// Note that these will need translation, except direction.
|
|
|
|
|
if (total_unicodes == 0) {
|
|
|
|
|
props->script_id = src_props.script_id;
|
|
|
|
|
props->other_case = src_props.other_case;
|
|
|
|
|
props->mirror = src_props.mirror;
|
|
|
|
|
props->direction = src_props.direction;
|
|
|
|
|
}
|
|
|
|
|
// The normed string for the compound character is the concatenation of
|
|
|
|
|
// the normed versions of the individual characters.
|
|
|
|
|
props->normed += src_props.normed;
|
|
|
|
|
++total_unicodes;
|
|
|
|
|
}
|
|
|
|
|
if (total_unicodes > 1) {
|
|
|
|
|
// Estimate the total widths from the advance - bearing.
|
2015-07-10 05:28:20 +08:00
|
|
|
|
props->width = props->advance - props->bearing;
|
|
|
|
|
props->width_sd = props->advance_sd + props->bearing_sd;
|
2012-02-02 11:14:43 +08:00
|
|
|
|
}
|
|
|
|
|
return total_unicodes > 0;
|
|
|
|
|
}
|
|
|
|
|
|
2013-09-23 23:16:01 +08:00
|
|
|
|
// TODO(rays) clean-up the order of functions to match unicharset.h.
|
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
|
// Packs the alpha/lower/upper/digit/punctuation flags of the given unichar
// into a single bit mask built from the IS*_MASK constants.
unsigned int UNICHARSET::get_properties(UNICHAR_ID id) const {
  unsigned int mask = 0;
  mask |= this->get_isalpha(id) ? ISALPHA_MASK : 0;
  mask |= this->get_islower(id) ? ISLOWER_MASK : 0;
  mask |= this->get_isupper(id) ? ISUPPER_MASK : 0;
  mask |= this->get_isdigit(id) ? ISDIGIT_MASK : 0;
  mask |= this->get_ispunctuation(id) ? ISPUNCTUATION_MASK : 0;
  return mask;
}
|
2009-07-11 10:50:24 +08:00
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
|
// Returns a one-character summary of the type of the given unichar:
// 'A' for upper case, 'a' for lower case, 'x' for other alpha, '0' for a
// digit, 'p' for punctuation, and 0 if none apply. The first matching
// category, in that order, wins.
char UNICHARSET::get_chartype(UNICHAR_ID id) const {
  char type = 0;
  if (this->get_isupper(id)) {
    type = 'A';
  } else if (this->get_islower(id)) {
    type = 'a';
  } else if (this->get_isalpha(id)) {
    type = 'x';
  } else if (this->get_isdigit(id)) {
    type = '0';
  } else if (this->get_ispunctuation(id)) {
    type = 'p';
  }
  return type;
}
|
2009-07-11 10:50:24 +08:00
|
|
|
|
|
2017-07-25 02:45:57 +08:00
|
|
|
|
void UNICHARSET::unichar_insert(const char* const unichar_repr,
|
|
|
|
|
OldUncleanUnichars old_style) {
|
|
|
|
|
if (old_style == OldUncleanUnichars::kTrue) old_style_included_ = true;
|
2018-03-03 21:36:28 +08:00
|
|
|
|
std::string cleaned =
|
2017-07-25 02:45:57 +08:00
|
|
|
|
old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
|
|
|
|
|
if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
|
|
|
|
|
const char* str = cleaned.c_str();
|
|
|
|
|
GenericVector<int> encoding;
|
|
|
|
|
if (!old_style_included_ &&
|
|
|
|
|
encode_string(str, true, &encoding, nullptr, nullptr))
|
2009-07-11 10:50:24 +08:00
|
|
|
|
return;
|
2008-02-01 08:21:49 +08:00
|
|
|
|
if (size_used == size_reserved) {
|
2007-05-16 09:25:41 +08:00
|
|
|
|
if (size_used == 0)
|
|
|
|
|
reserve(8);
|
|
|
|
|
else
|
|
|
|
|
reserve(2 * size_used);
|
|
|
|
|
}
|
2017-07-25 02:45:57 +08:00
|
|
|
|
int index = 0;
|
|
|
|
|
do {
|
2018-04-17 22:30:03 +08:00
|
|
|
|
if (index >= UNICHAR_LEN) {
|
2017-07-25 02:45:57 +08:00
|
|
|
|
fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN,
|
|
|
|
|
unichar_repr);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
unichars[size_used].representation[index++] = *str++;
|
|
|
|
|
} while (*str != '\0');
|
|
|
|
|
unichars[size_used].representation[index] = '\0';
|
2009-07-11 10:50:24 +08:00
|
|
|
|
this->set_script(size_used, null_script);
|
|
|
|
|
// If the given unichar_repr represents a fragmented character, set
|
|
|
|
|
// fragment property to a pointer to CHAR_FRAGMENT class instance with
|
|
|
|
|
// information parsed from the unichar representation. Use the script
|
|
|
|
|
// of the base unichar for the fragmented character if possible.
|
2017-07-25 02:45:57 +08:00
|
|
|
|
CHAR_FRAGMENT* frag =
|
|
|
|
|
CHAR_FRAGMENT::parse_from_string(unichars[size_used].representation);
|
2009-07-11 10:50:24 +08:00
|
|
|
|
this->unichars[size_used].properties.fragment = frag;
|
2016-12-13 00:20:28 +08:00
|
|
|
|
if (frag != nullptr && this->contains_unichar(frag->get_unichar())) {
|
2009-07-11 10:50:24 +08:00
|
|
|
|
this->unichars[size_used].properties.script_id =
|
|
|
|
|
this->get_script(frag->get_unichar());
|
|
|
|
|
}
|
2008-02-01 08:21:49 +08:00
|
|
|
|
this->unichars[size_used].properties.enabled = true;
|
2017-07-25 02:45:57 +08:00
|
|
|
|
ids.insert(unichars[size_used].representation, size_used);
|
2007-05-16 09:25:41 +08:00
|
|
|
|
++size_used;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2009-07-11 10:50:24 +08:00
|
|
|
|
bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
|
2018-03-03 21:36:28 +08:00
|
|
|
|
std::string cleaned =
|
2017-07-25 02:45:57 +08:00
|
|
|
|
old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
|
|
|
|
|
return ids.contains(cleaned.data(), cleaned.size());
|
2007-05-16 09:25:41 +08:00
|
|
|
|
}
|
|
|
|
|
|
2009-07-11 10:50:24 +08:00
|
|
|
|
bool UNICHARSET::contains_unichar(const char* const unichar_repr,
|
|
|
|
|
int length) const {
|
|
|
|
|
if (length == 0) {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
2018-03-03 21:36:28 +08:00
|
|
|
|
std::string cleaned(unichar_repr, length);
|
2017-07-25 02:45:57 +08:00
|
|
|
|
if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
|
|
|
|
|
return ids.contains(cleaned.data(), cleaned.size());
|
2008-02-01 08:21:49 +08:00
|
|
|
|
}
|
|
|
|
|
|
2009-07-11 10:50:24 +08:00
|
|
|
|
// Returns true if the string stored for unichar_id is byte-for-byte equal to
// unichar_repr.
bool UNICHARSET::eq(UNICHAR_ID unichar_id,
                    const char* const unichar_repr) const {
  const char* const stored = this->id_to_unichar(unichar_id);
  return 0 == strcmp(stored, unichar_repr);
}
|
|
|
|
|
|
2014-08-12 07:23:06 +08:00
|
|
|
|
// Serializes the unicharset as text into *str, replacing its contents.
// The first line is the number of unichars; one line per unichar follows.
// A unichar that is exactly " " is written with the placeholder "NULL" and a
// short record (properties, script, other_case); every other unichar gets the
// full record: properties, top/bottom ranges, width/bearing/advance stats,
// script, other_case, direction, mirror, normed form and a trailing debug
// comment. Always returns true.
bool UNICHARSET::save_to_string(STRING *str) const {
  const int kFileBufSize = 1024;
  char line[kFileBufSize + 1];
  snprintf(line, kFileBufSize, "%d\n", this->size());
  *str = line;

  const int num_unichars = this->size();
  for (UNICHAR_ID id = 0; id < num_unichars; ++id) {
    int min_bottom, max_bottom, min_top, max_top;
    get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
    float width, width_sd;
    get_width_stats(id, &width, &width_sd);
    float bearing, bearing_sd;
    get_bearing_stats(id, &bearing, &bearing_sd);
    float advance, advance_sd;
    get_advance_stats(id, &advance, &advance_sd);
    const unsigned int flags = this->get_properties(id);
    const char* const unichar = this->id_to_unichar(id);
    const char* const script_name =
        this->get_script_from_script_id(this->get_script(id));

    const bool is_space = strcmp(unichar, " ") == 0;
    if (!is_space) {
      snprintf(line, kFileBufSize,
               "%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %s %d %d %d %s\t# %s\n",
               unichar, flags,
               min_bottom, max_bottom, min_top, max_top, width, width_sd,
               bearing, bearing_sd, advance, advance_sd,
               script_name,
               this->get_other_case(id), this->get_direction(id),
               this->get_mirror(id), this->get_normed_unichar(id),
               this->debug_str(id).string());
    } else {
      // A bare space cannot be a whitespace-delimited field, so it is
      // written as the literal token NULL.
      snprintf(line, kFileBufSize, "%s %x %s %d\n", "NULL", flags,
               script_name,
               this->get_other_case(id));
    }
    *str += line;
  }
  return true;
}
|
|
|
|
|
|
2014-08-12 07:23:06 +08:00
|
|
|
|
// TODO(rays) Replace with TFile everywhere.
|
2012-02-02 11:14:43 +08:00
|
|
|
|
// Sequential line reader over a caller-owned memory buffer, offering an
// fgets-like interface. The buffer is not copied, so it must outlive the
// reader.
class InMemoryFilePointer {
 public:
  // memory points at mem_size bytes of data to read from.
  InMemoryFilePointer(const char *memory, int mem_size)
      : memory_(memory), fgets_ptr_(memory), mem_size_(mem_size) { }

  // Copies bytes from the current position into orig_dst until a newline has
  // been copied, size - 1 bytes have been copied, or the data runs out, then
  // nul-terminates the result.
  // Returns orig_dst if at least one byte was copied, otherwise nullptr.
  // With size < 1 nothing is written; the return value then only reports
  // whether unread data remains.
  char *fgets(char *orig_dst, int size) {
    const char *src_end = memory_ + mem_size_;
    if (size < 1) {
      return fgets_ptr_ < src_end ? orig_dst : nullptr;
    }
    // Computed only after the guard above: for size < 1 this would be a
    // pointer before the start of the destination buffer.
    char *dst_end = orig_dst + size - 1;

    char *dst = orig_dst;
    char ch = '^';  // Any value other than '\n' will do to enter the loop.
    while (fgets_ptr_ < src_end && dst < dst_end && ch != '\n') {
      ch = *dst++ = *fgets_ptr_++;
    }
    *dst = 0;
    return (dst == orig_dst) ? nullptr : orig_dst;
  }

 private:
  const char *memory_;     // Start of the data.
  const char *fgets_ptr_;  // Next byte to be read.
  const int mem_size_;     // Number of bytes available at memory_.
};
|
|
|
|
|
|
|
|
|
|
bool UNICHARSET::load_from_inmemory_file(
|
|
|
|
|
const char *memory, int mem_size, bool skip_fragments) {
|
|
|
|
|
InMemoryFilePointer mem_fp(memory, mem_size);
|
|
|
|
|
TessResultCallback2<char *, char *, int> *fgets_cb =
|
|
|
|
|
NewPermanentTessCallback(&mem_fp, &InMemoryFilePointer::fgets);
|
|
|
|
|
bool success = load_via_fgets(fgets_cb, skip_fragments);
|
|
|
|
|
delete fgets_cb;
|
|
|
|
|
return success;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Thin adapter giving a FILE stream the two-argument fgets signature used by
// the loading callbacks. The stream is neither opened nor closed here.
class LocalFilePointer {
 public:
  // explicit: a FILE* should never silently convert into this wrapper.
  explicit LocalFilePointer(FILE *stream) : fp_(stream) {}

  // Forwards to the C library fgets on the wrapped stream and returns its
  // result.
  char *fgets(char *dst, int size) {
    return ::fgets(dst, size, fp_);
  }

 private:
  FILE *fp_;  // Borrowed stream.
};
|
|
|
|
|
|
2010-11-30 08:53:31 +08:00
|
|
|
|
bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) {
|
2012-02-02 11:14:43 +08:00
|
|
|
|
LocalFilePointer lfp(file);
|
|
|
|
|
TessResultCallback2<char *, char *, int> *fgets_cb =
|
|
|
|
|
NewPermanentTessCallback(&lfp, &LocalFilePointer::fgets);
|
|
|
|
|
bool success = load_via_fgets(fgets_cb, skip_fragments);
|
|
|
|
|
delete fgets_cb;
|
|
|
|
|
return success;
|
|
|
|
|
}
|
|
|
|
|
|
2014-08-12 07:23:06 +08:00
|
|
|
|
// Loads the unicharset from a TFile, using its FGets member as the line
// reader for load_via_fgets. skip_fragments is forwarded unchanged.
// Returns whatever load_via_fgets returns.
bool UNICHARSET::load_from_file(tesseract::TFile *file, bool skip_fragments) {
  TessResultCallback2<char *, char *, int> *line_reader_cb =
      NewPermanentTessCallback(file, &tesseract::TFile::FGets);
  const bool loaded = load_via_fgets(line_reader_cb, skip_fragments);
  // The callback was created here, so it is released here.
  delete line_reader_cb;
  return loaded;
}
|
|
|
|
|
|
2012-02-02 11:14:43 +08:00
|
|
|
|
bool UNICHARSET::load_via_fgets(
|
|
|
|
|
TessResultCallback2<char *, char *, int> *fgets_cb,
|
|
|
|
|
bool skip_fragments) {
|
2007-05-16 09:25:41 +08:00
|
|
|
|
int unicharset_size;
|
|
|
|
|
char buffer[256];
|
|
|
|
|
|
|
|
|
|
this->clear();
|
2016-12-13 00:20:28 +08:00
|
|
|
|
if (fgets_cb->Run(buffer, sizeof(buffer)) == nullptr ||
|
2008-02-01 08:21:49 +08:00
|
|
|
|
sscanf(buffer, "%d", &unicharset_size) != 1) {
|
2007-05-16 09:25:41 +08:00
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
this->reserve(unicharset_size);
|
|
|
|
|
for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {
|
|
|
|
|
char unichar[256];
|
2007-07-18 09:15:07 +08:00
|
|
|
|
unsigned int properties;
|
2008-04-22 08:23:41 +08:00
|
|
|
|
char script[64];
|
2007-05-16 09:25:41 +08:00
|
|
|
|
|
2019-02-04 05:20:08 +08:00
|
|
|
|
strncpy(script, null_script, sizeof(script) - 1);
|
2010-11-24 02:34:14 +08:00
|
|
|
|
int min_bottom = 0;
|
Use POSIX data types and macros (#878)
* api: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccmain: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccstruct: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* classify: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* cutil: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* dict: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* textord: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* training: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* wordrec: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccutil: Replace Tesseract data types by POSIX data types
Now all Tesseract data types which are no longer needed can be removed
from ccutil/host.h.
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccmain: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccstruct: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* classify: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* dict: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* lstm: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* textord: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* wordrec: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccutil: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Remove the macros which are now unused from ccutil/host.h.
Remove also the obsolete history comments.
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* Fix build error caused by ambiguous ClipToRange
Error message vom Appveyor CI:
C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2672: 'ClipToRange': no matching overloaded function found [C:\projects\tesseract\build\libtesseract.vcxproj]
C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2782: 'T ClipToRange(const T &,const T &,const T &)': template parameter 'T' is ambiguous [C:\projects\tesseract\build\libtesseract.vcxproj]
c:\projects\tesseract\ccutil\helpers.h(122): note: see declaration of 'ClipToRange'
C:\projects\tesseract\ccstruct\coutln.cpp(818): note: could be 'char'
C:\projects\tesseract\ccstruct\coutln.cpp(818): note: or 'int'
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* unittest: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* arch: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
2018-03-14 04:36:30 +08:00
|
|
|
|
int max_bottom = UINT8_MAX;
|
2010-11-24 02:34:14 +08:00
|
|
|
|
int min_top = 0;
|
Use POSIX data types and macros (#878)
* api: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccmain: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccstruct: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* classify: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* cutil: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* dict: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* textord: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* training: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* wordrec: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccutil: Replace Tesseract data types by POSIX data types
Now all Tesseract data types which are no longer needed can be removed
from ccutil/host.h.
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccmain: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccstruct: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* classify: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* dict: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* lstm: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* textord: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* wordrec: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccutil: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Remove the macros which are now unused from ccutil/host.h.
Remove also the obsolete history comments.
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* Fix build error caused by ambiguous ClipToRange
Error message vom Appveyor CI:
C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2672: 'ClipToRange': no matching overloaded function found [C:\projects\tesseract\build\libtesseract.vcxproj]
C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2782: 'T ClipToRange(const T &,const T &,const T &)': template parameter 'T' is ambiguous [C:\projects\tesseract\build\libtesseract.vcxproj]
c:\projects\tesseract\ccutil\helpers.h(122): note: see declaration of 'ClipToRange'
C:\projects\tesseract\ccstruct\coutln.cpp(818): note: could be 'char'
C:\projects\tesseract\ccstruct\coutln.cpp(818): note: or 'int'
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* unittest: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* arch: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
2018-03-14 04:36:30 +08:00
|
|
|
|
int max_top = UINT8_MAX;
|
2015-07-10 05:28:20 +08:00
|
|
|
|
float width = 0.0f;
|
|
|
|
|
float width_sd = 0.0f;
|
|
|
|
|
float bearing = 0.0f;
|
|
|
|
|
float bearing_sd = 0.0f;
|
|
|
|
|
float advance = 0.0f;
|
|
|
|
|
float advance_sd = 0.0f;
|
2012-02-02 11:14:43 +08:00
|
|
|
|
// TODO(eger): check that this default it ok
|
|
|
|
|
// after enabling BiDi iterator for Arabic+Cube.
|
|
|
|
|
int direction = UNICHARSET::U_LEFT_TO_RIGHT;
|
|
|
|
|
UNICHAR_ID other_case = id;
|
|
|
|
|
UNICHAR_ID mirror = id;
|
|
|
|
|
char normed[64];
|
|
|
|
|
int v = -1;
|
2016-12-13 00:20:28 +08:00
|
|
|
|
if (fgets_cb->Run(buffer, sizeof (buffer)) == nullptr ||
|
2012-02-02 11:14:43 +08:00
|
|
|
|
((v = sscanf(buffer,
|
2015-07-10 05:28:20 +08:00
|
|
|
|
"%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %63s %d %d %d %63s",
|
2012-02-02 11:14:43 +08:00
|
|
|
|
unichar, &properties,
|
|
|
|
|
&min_bottom, &max_bottom, &min_top, &max_top,
|
2015-07-10 05:28:20 +08:00
|
|
|
|
&width, &width_sd, &bearing, &bearing_sd,
|
|
|
|
|
&advance, &advance_sd, script, &other_case,
|
2012-02-02 11:14:43 +08:00
|
|
|
|
&direction, &mirror, normed)) != 17 &&
|
|
|
|
|
(v = sscanf(buffer,
|
2015-07-10 05:28:20 +08:00
|
|
|
|
"%s %x %d,%d,%d,%d,%g,%g,%g,%g,%g,%g %63s %d %d %d",
|
2012-02-02 11:14:43 +08:00
|
|
|
|
unichar, &properties,
|
|
|
|
|
&min_bottom, &max_bottom, &min_top, &max_top,
|
2015-07-10 05:28:20 +08:00
|
|
|
|
&width, &width_sd, &bearing, &bearing_sd,
|
|
|
|
|
&advance, &advance_sd, script, &other_case,
|
|
|
|
|
&direction, &mirror)) != 16 &&
|
2012-02-02 11:14:43 +08:00
|
|
|
|
(v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d %d %d",
|
|
|
|
|
unichar, &properties,
|
|
|
|
|
&min_bottom, &max_bottom, &min_top, &max_top,
|
|
|
|
|
script, &other_case, &direction, &mirror)) != 10 &&
|
|
|
|
|
(v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d", unichar, &properties,
|
|
|
|
|
&min_bottom, &max_bottom, &min_top, &max_top,
|
|
|
|
|
script, &other_case)) != 8 &&
|
|
|
|
|
(v = sscanf(buffer, "%s %x %63s %d", unichar, &properties,
|
|
|
|
|
script, &other_case)) != 4 &&
|
|
|
|
|
(v = sscanf(buffer, "%s %x %63s",
|
|
|
|
|
unichar, &properties, script)) != 3 &&
|
2014-01-10 01:27:40 +08:00
|
|
|
|
(v = sscanf(buffer, "%s %x", unichar, &properties)) != 2)) {
|
2007-05-16 09:25:41 +08:00
|
|
|
|
return false;
|
|
|
|
|
}
|
2012-02-02 11:14:43 +08:00
|
|
|
|
|
2010-11-30 08:53:31 +08:00
|
|
|
|
// Skip fragments if needed.
|
2016-12-13 00:20:28 +08:00
|
|
|
|
CHAR_FRAGMENT *frag = nullptr;
|
2010-11-30 08:53:31 +08:00
|
|
|
|
if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) {
|
2014-09-18 09:29:32 +08:00
|
|
|
|
int num_pieces = frag->get_total();
|
2010-11-30 08:53:31 +08:00
|
|
|
|
delete frag;
|
2014-09-18 09:29:32 +08:00
|
|
|
|
// Skip multi-element fragments, but keep singles like UNICHAR_BROKEN in.
|
|
|
|
|
if (num_pieces > 1)
|
|
|
|
|
continue;
|
2010-11-30 08:53:31 +08:00
|
|
|
|
}
|
|
|
|
|
// Insert unichar into unicharset and set its properties.
|
2007-05-16 09:25:41 +08:00
|
|
|
|
if (strcmp(unichar, "NULL") == 0)
|
|
|
|
|
this->unichar_insert(" ");
|
|
|
|
|
else
|
2017-07-25 02:45:57 +08:00
|
|
|
|
this->unichar_insert_backwards_compatible(unichar);
|
2007-07-18 09:15:07 +08:00
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
|
this->set_isalpha(id, properties & ISALPHA_MASK);
|
|
|
|
|
this->set_islower(id, properties & ISLOWER_MASK);
|
|
|
|
|
this->set_isupper(id, properties & ISUPPER_MASK);
|
|
|
|
|
this->set_isdigit(id, properties & ISDIGIT_MASK);
|
|
|
|
|
this->set_ispunctuation(id, properties & ISPUNCTUATION_MASK);
|
2009-07-11 10:50:24 +08:00
|
|
|
|
this->set_isngram(id, false);
|
|
|
|
|
this->set_script(id, script);
|
2008-02-01 08:21:49 +08:00
|
|
|
|
this->unichars[id].properties.enabled = true;
|
2010-11-24 02:34:14 +08:00
|
|
|
|
this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top);
|
2015-07-10 05:28:20 +08:00
|
|
|
|
this->set_width_stats(id, width, width_sd);
|
|
|
|
|
this->set_bearing_stats(id, bearing, bearing_sd);
|
|
|
|
|
this->set_advance_stats(id, advance, advance_sd);
|
2012-02-02 11:14:43 +08:00
|
|
|
|
this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
|
2017-09-08 18:49:57 +08:00
|
|
|
|
this->set_other_case(
|
|
|
|
|
id, (v > 3 && other_case < unicharset_size) ? other_case : id);
|
|
|
|
|
this->set_mirror(id, (v > 8 && mirror < unicharset_size) ? mirror : id);
|
2012-02-02 11:14:43 +08:00
|
|
|
|
this->set_normed(id, (v>16) ? normed : unichar);
|
2010-11-24 02:34:14 +08:00
|
|
|
|
}
|
|
|
|
|
post_load_setup();
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Sets up internal data after loading the file, based on the char
|
|
|
|
|
// properties. Called from load_from_file, but also needs to be run
|
|
|
|
|
// during set_unicharset_properties.
|
|
|
|
|
void UNICHARSET::post_load_setup() {
|
|
|
|
|
// Number of alpha chars with the case property minus those without,
|
|
|
|
|
// in order to determine that half the alpha chars have case.
|
|
|
|
|
int net_case_alphas = 0;
|
|
|
|
|
int x_height_alphas = 0;
|
|
|
|
|
int cap_height_alphas = 0;
|
|
|
|
|
top_bottom_set_ = false;
|
|
|
|
|
for (UNICHAR_ID id = 0; id < size_used; ++id) {
|
|
|
|
|
int min_bottom = 0;
|
Use POSIX data types and macros (#878)
* api: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccmain: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccstruct: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* classify: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* cutil: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* dict: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* textord: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* training: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* wordrec: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccutil: Replace Tesseract data types by POSIX data types
Now all Tesseract data types which are no longer needed can be removed
from ccutil/host.h.
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccmain: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccstruct: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* classify: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* dict: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* lstm: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* textord: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* wordrec: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccutil: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Remove the macros which are now unused from ccutil/host.h.
Remove also the obsolete history comments.
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* Fix build error caused by ambiguous ClipToRange
Error message vom Appveyor CI:
C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2672: 'ClipToRange': no matching overloaded function found [C:\projects\tesseract\build\libtesseract.vcxproj]
C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2782: 'T ClipToRange(const T &,const T &,const T &)': template parameter 'T' is ambiguous [C:\projects\tesseract\build\libtesseract.vcxproj]
c:\projects\tesseract\ccutil\helpers.h(122): note: see declaration of 'ClipToRange'
C:\projects\tesseract\ccstruct\coutln.cpp(818): note: could be 'char'
C:\projects\tesseract\ccstruct\coutln.cpp(818): note: or 'int'
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* unittest: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* arch: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
2018-03-14 04:36:30 +08:00
|
|
|
|
int max_bottom = UINT8_MAX;
|
2010-11-24 02:34:14 +08:00
|
|
|
|
int min_top = 0;
|
Use POSIX data types and macros (#878)
* api: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccmain: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccstruct: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* classify: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* cutil: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* dict: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* textord: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* training: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* wordrec: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccutil: Replace Tesseract data types by POSIX data types
Now all Tesseract data types which are no longer needed can be removed
from ccutil/host.h.
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccmain: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccstruct: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* classify: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* dict: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* lstm: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* textord: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* wordrec: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccutil: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Remove the macros which are now unused from ccutil/host.h.
Remove also the obsolete history comments.
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* Fix build error caused by ambiguous ClipToRange
Error message vom Appveyor CI:
C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2672: 'ClipToRange': no matching overloaded function found [C:\projects\tesseract\build\libtesseract.vcxproj]
C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2782: 'T ClipToRange(const T &,const T &,const T &)': template parameter 'T' is ambiguous [C:\projects\tesseract\build\libtesseract.vcxproj]
c:\projects\tesseract\ccutil\helpers.h(122): note: see declaration of 'ClipToRange'
C:\projects\tesseract\ccstruct\coutln.cpp(818): note: could be 'char'
C:\projects\tesseract\ccstruct\coutln.cpp(818): note: or 'int'
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* unittest: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* arch: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
2018-03-14 04:36:30 +08:00
|
|
|
|
int max_top = UINT8_MAX;
|
2010-11-24 02:34:14 +08:00
|
|
|
|
get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
|
|
|
|
|
if (min_top > 0)
|
|
|
|
|
top_bottom_set_ = true;
|
|
|
|
|
if (get_isalpha(id)) {
|
|
|
|
|
if (get_islower(id) || get_isupper(id))
|
|
|
|
|
++net_case_alphas;
|
|
|
|
|
else
|
|
|
|
|
--net_case_alphas;
|
|
|
|
|
if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold)
|
|
|
|
|
++x_height_alphas;
|
|
|
|
|
else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold)
|
|
|
|
|
++cap_height_alphas;
|
|
|
|
|
}
|
2013-09-23 23:16:01 +08:00
|
|
|
|
set_normed_ids(id);
|
2007-05-16 09:25:41 +08:00
|
|
|
|
}
|
2012-02-02 11:14:43 +08:00
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
|
script_has_upper_lower_ = net_case_alphas > 0;
|
|
|
|
|
script_has_xheight_ = script_has_upper_lower_ ||
|
2012-02-02 11:14:43 +08:00
|
|
|
|
(x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
|
|
|
|
|
cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
|
2009-07-11 10:50:24 +08:00
|
|
|
|
|
|
|
|
|
null_sid_ = get_script_id_from_name(null_script);
|
|
|
|
|
ASSERT_HOST(null_sid_ == 0);
|
|
|
|
|
common_sid_ = get_script_id_from_name("Common");
|
|
|
|
|
latin_sid_ = get_script_id_from_name("Latin");
|
|
|
|
|
cyrillic_sid_ = get_script_id_from_name("Cyrillic");
|
|
|
|
|
greek_sid_ = get_script_id_from_name("Greek");
|
|
|
|
|
han_sid_ = get_script_id_from_name("Han");
|
2010-11-24 02:34:14 +08:00
|
|
|
|
hiragana_sid_ = get_script_id_from_name("Hiragana");
|
|
|
|
|
katakana_sid_ = get_script_id_from_name("Katakana");
|
2016-11-08 07:38:07 +08:00
|
|
|
|
thai_sid_ = get_script_id_from_name("Thai");
|
|
|
|
|
hangul_sid_ = get_script_id_from_name("Hangul");
|
2010-11-24 02:34:14 +08:00
|
|
|
|
|
2012-02-02 11:14:43 +08:00
|
|
|
|
// Compute default script. Use the highest-counting alpha script, that is
|
|
|
|
|
// not the common script, as that still contains some "alphas".
|
2010-11-24 02:34:14 +08:00
|
|
|
|
int* script_counts = new int[script_table_size_used];
|
|
|
|
|
memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
|
2012-02-02 11:14:43 +08:00
|
|
|
|
for (int id = 0; id < size_used; ++id) {
|
|
|
|
|
if (get_isalpha(id)) {
|
|
|
|
|
++script_counts[get_script(id)];
|
|
|
|
|
}
|
|
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
|
default_sid_ = 0;
|
|
|
|
|
for (int s = 1; s < script_table_size_used; ++s) {
|
|
|
|
|
if (script_counts[s] > script_counts[default_sid_] && s != common_sid_)
|
|
|
|
|
default_sid_ = s;
|
|
|
|
|
}
|
|
|
|
|
delete [] script_counts;
|
|
|
|
|
}
|
|
|
|
|
|
2012-02-02 11:14:43 +08:00
|
|
|
|
// Returns true if right_to_left scripts are significant in the unicharset,
|
|
|
|
|
// but without being so sensitive that "universal" unicharsets containing
|
|
|
|
|
// characters from many scripts, like orientation and script detection,
|
|
|
|
|
// look like they are right_to_left.
|
|
|
|
|
bool UNICHARSET::major_right_to_left() const {
|
|
|
|
|
int ltr_count = 0;
|
|
|
|
|
int rtl_count = 0;
|
|
|
|
|
for (int id = 0; id < size_used; ++id) {
|
|
|
|
|
int dir = get_direction(id);
|
|
|
|
|
if (dir == UNICHARSET::U_LEFT_TO_RIGHT) ltr_count++;
|
|
|
|
|
if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
|
|
|
|
|
dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC ||
|
|
|
|
|
dir == UNICHARSET::U_ARABIC_NUMBER) rtl_count++;
|
2010-11-24 02:34:14 +08:00
|
|
|
|
}
|
2012-02-02 11:14:43 +08:00
|
|
|
|
return rtl_count > ltr_count;
|
2007-05-16 09:25:41 +08:00
|
|
|
|
}
|
2008-02-01 08:21:49 +08:00
|
|
|
|
|
|
|
|
|
// Set a whitelist and/or blacklist of characters to recognize.
|
2016-12-13 00:20:28 +08:00
|
|
|
|
// An empty or nullptr whitelist enables everything (minus any blacklist).
|
|
|
|
|
// An empty or nullptr blacklist disables nothing.
|
|
|
|
|
// An empty or nullptr blacklist has no effect.
|
2008-02-01 08:21:49 +08:00
|
|
|
|
void UNICHARSET::set_black_and_whitelist(const char* blacklist,
|
2014-10-10 04:28:03 +08:00
|
|
|
|
const char* whitelist,
|
|
|
|
|
const char* unblacklist) {
|
2016-12-13 00:20:28 +08:00
|
|
|
|
bool def_enabled = whitelist == nullptr || whitelist[0] == '\0';
|
2008-02-01 08:21:49 +08:00
|
|
|
|
// Set everything to default
|
|
|
|
|
for (int ch = 0; ch < size_used; ++ch)
|
|
|
|
|
unichars[ch].properties.enabled = def_enabled;
|
|
|
|
|
if (!def_enabled) {
|
|
|
|
|
// Enable the whitelist.
|
2013-09-23 23:16:01 +08:00
|
|
|
|
GenericVector<UNICHAR_ID> encoding;
|
2016-12-13 00:20:28 +08:00
|
|
|
|
encode_string(whitelist, false, &encoding, nullptr, nullptr);
|
2013-09-23 23:16:01 +08:00
|
|
|
|
for (int i = 0; i < encoding.size(); ++i) {
|
|
|
|
|
if (encoding[i] != INVALID_UNICHAR_ID)
|
|
|
|
|
unichars[encoding[i]].properties.enabled = true;
|
2008-02-01 08:21:49 +08:00
|
|
|
|
}
|
|
|
|
|
}
|
2016-12-13 00:20:28 +08:00
|
|
|
|
if (blacklist != nullptr && blacklist[0] != '\0') {
|
2008-02-01 08:21:49 +08:00
|
|
|
|
// Disable the blacklist.
|
2013-09-23 23:16:01 +08:00
|
|
|
|
GenericVector<UNICHAR_ID> encoding;
|
2016-12-13 00:20:28 +08:00
|
|
|
|
encode_string(blacklist, false, &encoding, nullptr, nullptr);
|
2013-09-23 23:16:01 +08:00
|
|
|
|
for (int i = 0; i < encoding.size(); ++i) {
|
|
|
|
|
if (encoding[i] != INVALID_UNICHAR_ID)
|
|
|
|
|
unichars[encoding[i]].properties.enabled = false;
|
2008-02-01 08:21:49 +08:00
|
|
|
|
}
|
|
|
|
|
}
|
2016-12-13 00:20:28 +08:00
|
|
|
|
if (unblacklist != nullptr && unblacklist[0] != '\0') {
|
2014-10-10 04:28:03 +08:00
|
|
|
|
// Re-enable the unblacklist.
|
|
|
|
|
GenericVector<UNICHAR_ID> encoding;
|
2016-12-13 00:20:28 +08:00
|
|
|
|
encode_string(unblacklist, false, &encoding, nullptr, nullptr);
|
2014-10-10 04:28:03 +08:00
|
|
|
|
for (int i = 0; i < encoding.size(); ++i) {
|
|
|
|
|
if (encoding[i] != INVALID_UNICHAR_ID)
|
|
|
|
|
unichars[encoding[i]].properties.enabled = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
2008-02-01 08:21:49 +08:00
|
|
|
|
}
|
|
|
|
|
|
2015-07-10 05:50:25 +08:00
|
|
|
|
// Returns true if there are any repeated unicodes in the normalized
|
|
|
|
|
// text of any unichar-id in the unicharset.
|
|
|
|
|
bool UNICHARSET::AnyRepeatedUnicodes() const {
|
|
|
|
|
int start_id = 0;
|
|
|
|
|
if (has_special_codes()) start_id = SPECIAL_UNICHAR_CODES_COUNT;
|
|
|
|
|
for (int id = start_id; id < size_used; ++id) {
|
|
|
|
|
// Convert to unicodes.
|
2017-07-15 00:30:14 +08:00
|
|
|
|
std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(get_normed_unichar(id));
|
2019-03-25 04:18:21 +08:00
|
|
|
|
for (size_t u = 1; u < unicodes.size(); ++u) {
|
2017-07-15 00:30:14 +08:00
|
|
|
|
if (unicodes[u - 1] == unicodes[u]) return true;
|
2015-07-10 05:50:25 +08:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2009-07-11 10:50:24 +08:00
|
|
|
|
int UNICHARSET::add_script(const char* script) {
|
2008-04-22 08:23:41 +08:00
|
|
|
|
for (int i = 0; i < script_table_size_used; ++i) {
|
|
|
|
|
if (strcmp(script, script_table[i]) == 0)
|
2009-07-11 10:50:24 +08:00
|
|
|
|
return i;
|
2008-04-22 08:23:41 +08:00
|
|
|
|
}
|
|
|
|
|
if (script_table_size_reserved == 0) {
|
|
|
|
|
script_table_size_reserved = 8;
|
|
|
|
|
script_table = new char*[script_table_size_reserved];
|
2017-05-13 01:58:41 +08:00
|
|
|
|
} else if (script_table_size_used >= script_table_size_reserved) {
|
|
|
|
|
assert(script_table_size_used == script_table_size_reserved);
|
|
|
|
|
script_table_size_reserved += script_table_size_reserved;
|
|
|
|
|
char** new_script_table = new char*[script_table_size_reserved];
|
2017-07-15 00:30:14 +08:00
|
|
|
|
memcpy(new_script_table, script_table,
|
|
|
|
|
script_table_size_used * sizeof(char*));
|
2008-04-22 08:23:41 +08:00
|
|
|
|
delete[] script_table;
|
|
|
|
|
script_table = new_script_table;
|
|
|
|
|
}
|
|
|
|
|
script_table[script_table_size_used] = new char[strlen(script) + 1];
|
|
|
|
|
strcpy(script_table[script_table_size_used], script);
|
2009-07-11 10:50:24 +08:00
|
|
|
|
return script_table_size_used++;
|
|
|
|
|
}
|
|
|
|
|
|
2012-02-02 11:14:43 +08:00
|
|
|
|
// Returns the string that represents a fragment
|
|
|
|
|
// with the given unichar, pos and total.
|
|
|
|
|
STRING CHAR_FRAGMENT::to_string(const char *unichar, int pos, int total,
|
|
|
|
|
bool natural) {
|
|
|
|
|
if (total == 1) return STRING(unichar);
|
|
|
|
|
STRING result = "";
|
|
|
|
|
result += kSeparator;
|
|
|
|
|
result += unichar;
|
|
|
|
|
char buffer[kMaxLen];
|
|
|
|
|
snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos,
|
|
|
|
|
natural ? kNaturalFlag : kSeparator, total);
|
|
|
|
|
result += buffer;
|
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
2009-07-11 10:50:24 +08:00
|
|
|
|
// Parses a fragment representation of the form
//   kSeparator unichar kSeparator pos X total
// where X is kSeparator, or kNaturalFlag to mark a natural fragment.
// Returns a newly allocated CHAR_FRAGMENT that the caller must delete, or
// nullptr if |string| does not have that form.
CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {
  const char *ptr = string;
  const int len = static_cast<int>(strlen(string));
  if (len < kMinLen || *ptr != kSeparator) {
    return nullptr;  // this string can not represent a fragment
  }
  ptr++;  // move to the next character

  // Measure the unichar: everything up to the next separator, advancing one
  // UTF-8 character at a time.
  int step = 0;
  while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
    const int char_step = UNICHAR::utf8_step(ptr + step);
    if (char_step <= 0) {
      // A non-advancing step would keep this loop spinning forever; treat
      // the input as something that is not a fragment.
      return nullptr;
    }
    step += char_step;
    if (step > UNICHAR_LEN) {
      return nullptr;  // the character is too long; no need to scan further
    }
  }
  if (step == 0 || step > UNICHAR_LEN) {
    return nullptr;  // no character for unichar or the character is too long
  }
  char unichar[UNICHAR_LEN + 1];
  // step may overshoot the terminator when the last character is truncated;
  // strncpy stops copying at the NUL, so this never reads past the string.
  strncpy(unichar, ptr, step);
  unichar[step] = '\0';  // null terminate unichar
  ptr += step;           // move to the next fragment separator

  int pos = 0;
  int total = 0;
  bool natural = false;
  char *end_ptr = nullptr;
  // Two numeric fields follow: pos (i == 0) and total (i == 1), each
  // introduced by a one-character delimiter.
  for (int i = 0; i < 2; i++) {
    // The range test comes first so that ptr is never dereferenced when the
    // unichar scan overshot the end of the string (only possible at i == 0).
    if (ptr > string + len || *ptr != kSeparator) {
      if (i == 1 && *ptr == kNaturalFlag) {
        natural = true;
      } else {
        return nullptr;  // Failed to parse fragment representation.
      }
    }
    ptr++;  // move to the next character
    // NOTE(review): strtol yields 0 and leaves end_ptr == ptr when no digits
    // are present; that case is not rejected here, matching prior behavior.
    const int value = static_cast<int>(strtol(ptr, &end_ptr, 10));
    if (i == 0) {
      pos = value;
    } else {
      total = value;
    }
    ptr = end_ptr;
  }
  if (ptr != string + len) {
    return nullptr;  // malformed fragment representation
  }
  auto *fragment = new CHAR_FRAGMENT();
  fragment->set_all(unichar, pos, total, natural);
  return fragment;
}
|
|
|
|
|
|
|
|
|
|
int UNICHARSET::get_script_id_from_name(const char* script_name) const {
|
|
|
|
|
for (int i = 0; i < script_table_size_used; ++i) {
|
|
|
|
|
if (strcmp(script_name, script_table[i]) == 0)
|
|
|
|
|
return i;
|
|
|
|
|
}
|
|
|
|
|
return 0; // 0 is always the null_script
|
2008-04-22 08:23:41 +08:00
|
|
|
|
}
|
2017-07-25 02:45:57 +08:00
|
|
|
|
|
|
|
|
|
// Removes/replaces content that belongs in rendered text, but not in the
|
|
|
|
|
// unicharset.
|
|
|
|
|
/* static */
|
2018-03-11 03:51:52 +08:00
|
|
|
|
std::string UNICHARSET::CleanupString(const char* utf8_str, size_t length) {
|
2018-03-03 21:36:28 +08:00
|
|
|
|
std::string result;
|
2017-07-25 02:45:57 +08:00
|
|
|
|
result.reserve(length);
|
|
|
|
|
char ch;
|
2018-03-11 03:51:52 +08:00
|
|
|
|
while ((ch = *utf8_str) != '\0' && length-- > 0) {
|
2017-07-25 02:45:57 +08:00
|
|
|
|
int key_index = 0;
|
|
|
|
|
const char* key;
|
|
|
|
|
while ((key = kCleanupMaps[key_index][0]) != nullptr) {
|
|
|
|
|
int match = 0;
|
|
|
|
|
while (key[match] != '\0' && key[match] == utf8_str[match]) ++match;
|
|
|
|
|
if (key[match] == '\0') {
|
|
|
|
|
utf8_str += match;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
++key_index;
|
|
|
|
|
}
|
|
|
|
|
if (key == nullptr) {
|
|
|
|
|
result.push_back(ch);
|
|
|
|
|
++utf8_str;
|
|
|
|
|
} else {
|
|
|
|
|
result.append(kCleanupMaps[key_index][1]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return result;
|
|
|
|
|
}
|