mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-27 12:49:35 +08:00
normstrngs: add more hyphens and quotes
This commit is contained in:
parent
f7d4b39125
commit
2fbb81070f
@ -38,13 +38,22 @@ namespace tesseract {
|
||||
static bool is_hyphen_punc(const char32 ch) {
|
||||
static const int kNumHyphenPuncUnicodes = 13;
|
||||
static const char32 kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = {
|
||||
'-', 0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, // hyphen..horizontal bar
|
||||
0x207b, // superscript minus
|
||||
0x208b, // subscript minus
|
||||
0x2212, // minus sign
|
||||
0xfe58, // small em dash
|
||||
0xfe63, // small hyphen-minus
|
||||
0xff0d, // fullwidth hyphen-minus
|
||||
'-',
|
||||
0x2010, // hyphen
|
||||
0x2011, // non-breaking hyphen
|
||||
0x2012, // figure dash
|
||||
0x2013, // en dash
|
||||
0x2014, // em dash
|
||||
0x2015, // horizontal bar
|
||||
// Open question: should 0x2043 (hyphen bullet) also be treated as a hyphen?
|
||||
// Open question: should 0x2500 (box drawings light horizontal) also be treated as a hyphen?
|
||||
0x207b, // superscript minus
|
||||
0x208b, // subscript minus
|
||||
0x2212, // minus sign
|
||||
0xfe58, // small em dash
|
||||
0xfe63, // small hyphen-minus
|
||||
0xff0d, // fullwidth hyphen-minus
|
||||
0x2e17, // double oblique hyphen (Fraktur)
|
||||
};
|
||||
for (int kHyphenPuncUnicode : kHyphenPuncUnicodes) {
|
||||
if (kHyphenPuncUnicode == ch) {
|
||||
@ -61,6 +70,7 @@ static bool is_single_quote(const char32 ch) {
|
||||
0x2018, // left single quotation mark (English, others)
|
||||
0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.)
|
||||
// We may have to introduce a separate comma set that includes 0x201A.
|
||||
0x201A, // single low-9 quotation mark (German)
|
||||
0x201B, // single high-reversed-9 quotation mark (PropList.txt)
|
||||
0x2032, // prime
|
||||
0x300C, // left corner bracket (East Asian languages)
|
||||
@ -82,6 +92,7 @@ static bool is_double_quote(const char32 ch) {
|
||||
0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.)
|
||||
0x201F, // double high-reversed-9 quotation mark (PropList.txt)
|
||||
0x2033, // double prime
|
||||
0x201E, // double low-9 quotation mark (German)
|
||||
0x301D, // reversed double prime quotation mark (East Asian languages
|
||||
// written horizontally)
|
||||
0x301E, // close double prime (East Asian languages written horizontally)
|
||||
|
Loading…
Reference in New Issue
Block a user