normstrngs: add more hyphens and quotes

This commit is contained in:
Robert Sachunsky 2024-03-05 12:53:00 +01:00 committed by Stefan Weil
parent f7d4b39125
commit 2fbb81070f

View File

@ -38,13 +38,22 @@ namespace tesseract {
static bool is_hyphen_punc(const char32 ch) {
static const int kNumHyphenPuncUnicodes = 13;
static const char32 kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = {
'-', 0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, // hyphen..horizontal bar
0x207b, // superscript minus
0x208b, // subscript minus
0x2212, // minus sign
0xfe58, // small em dash
0xfe63, // small hyphen-minus
0xff0d, // fullwidth hyphen-minus
'-',
0x2010, // hyphen
0x2011, // non-breaking hyphen
0x2012, // figure dash
0x2013, // en dash
0x2014, // em dash
0x2015, // horizontal bar
// how about 0x2043 hyphen bullet?
// how about 0x2500 box drawings light horizontal?
0x207b, // superscript minus
0x208b, // subscript minus
0x2212, // minus sign
0xfe58, // small em dash
0xfe63, // small hyphen-minus
0xff0d, // fullwidth hyphen-minus
0x2e17, // double oblique hyphen (Fraktur)
};
for (int kHyphenPuncUnicode : kHyphenPuncUnicodes) {
if (kHyphenPuncUnicode == ch) {
@ -61,6 +70,7 @@ static bool is_single_quote(const char32 ch) {
0x2018, // left single quotation mark (English, others)
0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.)
// We may have to introduce a comma set with 0x201a
0x201A, // single low-9 quotation mark (German)
0x201B, // single high-reversed-9 quotation mark (PropList.txt)
0x2032, // prime
0x300C, // left corner bracket (East Asian languages)
@ -82,6 +92,7 @@ static bool is_double_quote(const char32 ch) {
0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.)
0x201F, // double high-reversed-9 quotation mark (PropList.txt)
0x2033, // double prime
0x201E, // double low-9 quotation mark (German)
0x301D, // reversed double prime quotation mark (East Asian langs,
// horiz.)
0x301E, // close double prime (East Asian languages written horizontally)