Trim unicodes.{h,cpp} down to the four bidirectional control constants and
make them immutable.

Removes kUTF8LineSeparator, kUTF8ParagraphSeparator, kHyphenLikeUTF8 and
kApostropheLikeUTF8. kLRM, kRLM, kRLE and kPDF become constexpr definitions
that the header declares as `extern const char *const`.

Review fix: kRLE was "\u202A", which Unicode assigns to LEFT-TO-RIGHT
EMBEDDING (LRE). RIGHT-TO-LEFT EMBEDDING (RLE) is U+202B, so the literal now
matches the constant's name and its documentation.

NOTE(review): this patch text was recovered from a whitespace-collapsed copy.
Blank context lines and intra-line spacing are reconstructed, so regenerate
the patch against the tree before applying it.

diff --git a/src/ccutil/unicodes.cpp b/src/ccutil/unicodes.cpp
index e57a311c..41cc0ee9 100644
--- a/src/ccutil/unicodes.cpp
+++ b/src/ccutil/unicodes.cpp
@@ -20,36 +20,9 @@
 
 namespace tesseract {
 
-const char *kUTF8LineSeparator = "\u2028"; // "\xe2\x80\xa8";
-const char *kUTF8ParagraphSeparator = "\u2029"; // "\xe2\x80\xa9";
-const char *kLRM = "\u200E"; // Left-to-Right Mark
-const char *kRLM = "\u200F"; // Right-to-Left Mark
-const char *kRLE = "\u202A"; // Right-to-Left Embedding
-const char *kPDF = "\u202C"; // Pop Directional Formatting
-
-const char *kHyphenLikeUTF8[] = {
-  "-", // ASCII hyphen-minus
-  "\u05BE", // word hyphen in hybrew
-  "\u2010", // hyphen
-  "\u2011", // non-breaking hyphen
-  "\u2012", // a hyphen the same width as digits
-  "\u2013", // en dash
-  "\u2014", // em dash
-  "\u2015", // horizontal bar
-  "\u2212", // arithmetic minus sign
-  "\uFE58", // small em dash
-  "\uFE63", // small hyphen-minus
-  "\uFF0D", // fullwidth hyphen-minus
-  nullptr, // end of our list
-};
-
-const char *kApostropheLikeUTF8[] = {
-  "'", // ASCII apostrophe
-  "`", // ASCII backtick
-  "\u2018", // opening single quote
-  "\u2019", // closing single quote
-  "\u2032", // mathematical prime mark
-  nullptr, // end of our list.
-};
+constexpr const char *kLRM = "\u200E"; // Left-to-Right Mark (U+200E)
+constexpr const char *kRLM = "\u200F"; // Right-to-Left Mark (U+200F)
+constexpr const char *kRLE = "\u202B"; // Right-to-Left Embedding (U+202B; U+202A is LRE)
+constexpr const char *kPDF = "\u202C"; // Pop Directional Formatting (U+202C)
 
 } // namespace
diff --git a/src/ccutil/unicodes.h b/src/ccutil/unicodes.h
index 7c0223a8..de00544d 100644
--- a/src/ccutil/unicodes.h
+++ b/src/ccutil/unicodes.h
@@ -2,7 +2,6 @@
  * File: unicodes.h
  * Description: Unicode related machinery
  * Author: David Eger
- * Created: Wed Jun 15 16:37:50 PST 2011
  *
  * (C) Copyright 2011, Google, Inc.
  ** Licensed under the Apache License, Version 2.0 (the "License");
@@ -22,17 +21,10 @@
 
 namespace tesseract {
 
-extern const char *kUTF8LineSeparator;
-extern const char *kUTF8ParagraphSeparator;
-extern const char *kLRM; ///< Left-to-Right Mark
-extern const char *kRLM; ///< Right-to-Left Mark
-extern const char *kRLE; ///< Right-to-Left Embedding
-extern const char *kPDF; ///< Pop Directional Formatting
-
-/// The following are confusable internal word punctuation symbols
-/// which we normalize to the first variant when matching in dawgs.
-extern const char *kHyphenLikeUTF8[];
-extern const char *kApostropheLikeUTF8[];
+extern const char *const kLRM; ///< Left-to-Right Mark (U+200E)
+extern const char *const kRLM; ///< Right-to-Left Mark (U+200F)
+extern const char *const kRLE; ///< Right-to-Left Embedding (U+202B)
+extern const char *const kPDF; ///< Pop Directional Formatting (U+202C)
 
 } // namespace
 