tesseract/training/normstrngs.h

/**********************************************************************
 * File:        normstrngs.h
 * Description: Utilities to normalize and manipulate UTF-32 and
 *              UTF-8 strings.
 * Author:      Ranjith Unnikrishnan
 * Created:     Thu July 4 2013
 *
 * (C) Copyright 2013, Google Inc.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **********************************************************************/

#ifndef TESSERACT_CCUTIL_NORMSTRNGS_H_
#define TESSERACT_CCUTIL_NORMSTRNGS_H_

#include <string>
#include <vector>

#include "validator.h"

namespace tesseract {

// Selects which of the four standard Unicode normalization forms is applied.
enum class UnicodeNormMode {
  kNFD = 0,   // Normalization Form D: canonical decomposition.
  kNFC = 1,   // Normalization Form C: canonical decomposition + composition.
  kNFKD = 2,  // Normalization Form KD: compatibility decomposition.
  kNFKC = 3,  // Normalization Form KC: compatibility decomposition +
              // canonical composition.
};

// Controls OCR-specific normalization, which removes differences between
// punctuation marks that are ambiguous, such as curly quotes and the various
// widths of dash.
enum class OCRNorm {
  kNone = 0,       // Leave ambiguous punctuation untouched.
  kNormalize = 1,  // Normalize away ambiguous punctuation differences.
};

// Controls grapheme-level validation and normalization, which removes some
// subtle differences that can occur in Indic scripts, e.g. by ensuring that an
// explicit virama is always followed by a zero-width non-joiner.
enum class GraphemeNorm {
  kNone = 0,       // Skip grapheme validation/normalization.
  kNormalize = 1,  // Validate and normalize graphemes.
};

// Normalizes the UTF-8 string str8 according to the given modes.
//   u_mode:             the standard Unicode normalization form to apply.
//   ocr_normalize:      whether to normalize away ambiguous punctuation.
//   grapheme_normalize: whether to validate/normalize graphemes (see
//                       GraphemeNorm).
//   str8:               the UTF-8 input text.
//                       NOTE(review): presumably null-terminated - confirm.
//   normalized:         receives the result string.
// Returns true on success. If false is returned, some failure or invalidity
// was present, and the result string is produced on a "best effort" basis.
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
                         GraphemeNorm grapheme_normalize, const char* str8,
                         std::string* normalized);
// Normalizes the UTF-8 string str8 according to the given modes and splits it
// into graphemes according to g_mode, storing them in *graphemes.
//   u_mode:        the standard Unicode normalization form to apply.
//   ocr_normalize: whether to normalize away ambiguous punctuation.
//   g_mode:        how the text is segmented into graphemes. GraphemeNormMode
//                  is not declared in this header.
//                  NOTE(review): presumably it comes from validator.h -
//                  confirm.
//   report_errors: NOTE(review): presumably enables reporting of invalid
//                  input - confirm against the implementation.
//   str8:          the UTF-8 input text.
//                  NOTE(review): presumably null-terminated - confirm.
//   graphemes:     receives the resulting graphemes, one string each.
// Returns true on success. If false is returned, some failure or invalidity
// was present, and the result is produced on a "best effort" basis.
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
                                  GraphemeNormMode g_mode, bool report_errors,
                                  const char* str8,
                                  std::vector<std::string>* graphemes);

// Applies just the OCR-specific normalizations to ch and returns the
// normalized char.
char32 OCRNormalize(char32 ch);

// Returns true if ch1 and ch2 are the same once each has been OCR-normalized.
bool IsOCREquivalent(char32 ch1, char32 ch2);

// Returns true if ch lies in the range of valid Unicode code points.
bool IsValidCodepoint(const char32 ch);

// Returns true if the code point ch has the White_Space Unicode property.
bool IsWhitespace(const char32 ch);
// Returns true if every character in the given null-terminated UTF-8 string
// has the White_Space Unicode property.
bool IsUTF8Whitespace(const char* text);

// Returns the length in bytes of the prefix of 'text' whose characters have
// the White_Space Unicode property.
unsigned int SpanUTF8Whitespace(const char* text);

// Returns the length in bytes of the prefix of 'text' whose characters DO NOT
// have the White_Space Unicode property.
unsigned int SpanUTF8NotWhitespace(const char* text);

// Returns true if ch is interchange valid, i.e. it is neither a C0 or C1
// control code (other than CR, LF, HT, FF) nor a non-character.
bool IsInterchangeValid(const char32 ch);
// Same as IsInterchangeValid, but restricted to 7-bit ASCII.
bool IsInterchangeValid7BitAscii(const char32 ch);

// Converts a full-width code point to its half-width equivalent. Operates on
// a single char32 code point, not on a UTF-8 string.
// NOTE(review): presumably returns ch unchanged when it has no half-width
// form - confirm against the implementation.
char32 FullwidthToHalfwidth(const char32 ch);

}  // namespace tesseract

#endif  // TESSERACT_CCUTIL_NORMSTRNGS_H_
Major refactor of beam search, elimination of dead code, misc bug fixes, updates to Makefile.am, Changelog etc. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@878 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2013-09-23 23:26:50 +08:00			`/**********************************************************************`
			`* File: normstrngs.h`
			`* Description: Utilities to normalize and manipulate UTF-32 and`
			`* UTF-8 strings.`
			`* Author: Ranjith Unnikrishnan`
			`* Created: Thu July 4 2013`
			`*`
			`* (C) Copyright 2013, Google Inc.`
			`* Licensed under the Apache License, Version 2.0 (the "License");`
			`* you may not use this file except in compliance with the License.`
			`* You may obtain a copy of the License at`
			`* http://www.apache.org/licenses/LICENSE-2.0`
			`* Unless required by applicable law or agreed to in writing, software`
			`* distributed under the License is distributed on an "AS IS" BASIS,`
			`* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`* See the License for the specific language governing permissions and`
			`* limitations under the License.`
			`*`
			`**********************************************************************/`

			`#ifndef TESSERACT_CCUTIL_NORMSTRNGS_H_`
			`#define TESSERACT_CCUTIL_NORMSTRNGS_H_`

Added script-specific validation and normalization for virama-using scripts and updated normalization for others 2017-07-15 01:05:05 +08:00			`#include <string>`
			`#include <vector>`
Major refactor of beam search, elimination of dead code, misc bug fixes, updates to Makefile.am, Changelog etc. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@878 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2013-09-23 23:26:50 +08:00
Added script-specific validation and normalization for virama-using scripts and updated normalization for others 2017-07-15 01:05:05 +08:00			`#include "validator.h"`
Major refactor of beam search, elimination of dead code, misc bug fixes, updates to Makefile.am, Changelog etc. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@878 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2013-09-23 23:26:50 +08:00
			`namespace tesseract {`

Added script-specific validation and normalization for virama-using scripts and updated normalization for others 2017-07-15 01:05:05 +08:00			`// The standard unicode normalizations.`
			`enum class UnicodeNormMode {`
			`kNFD,`
			`kNFC,`
			`kNFKD,`
			`kNFKC,`
			`};`

			`// To normalize away differences in punctuation that are ambiguous, like`
			`// curly quotes and different widths of dash.`
			`enum class OCRNorm {`
			`kNone,`
			`kNormalize,`
			`};`

			`// To validate and normalize away some subtle differences that can occur in`
			`// Indic scripts, eg ensuring that an explicit virama is always followed by`
			`// a zero-width non-joiner.`
			`enum class GraphemeNorm {`
			`kNone,`
			`kNormalize,`
			`};`

			`// Normalizes a UTF8 string according to the given modes. Returns true on`
			`// success. If false is returned, some failure or invalidity was present, and`
			`// the result string is produced on a "best effort" basis.`
			`bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize,`
			`GraphemeNorm grapheme_normalize, const char* str8,`
Remove old code for string class (no longer needed) (#1354) * Remove old code for string class (no longer needed) Signed-off-by: Stefan Weil <sw@weilnetz.de> * Add std namespace to string class Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-03-03 21:36:28 +08:00			`std::string* normalized);`
Added script-specific validation and normalization for virama-using scripts and updated normalization for others 2017-07-15 01:05:05 +08:00			`// Normalizes a UTF8 string according to the given modes and splits into`
			`// graphemes according to g_mode. Returns true on success. If false is returned,`
			`// some failure or invalidity was present, and the result string is produced on`
			`// a "best effort" basis.`
			`bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize,`
			`GraphemeNormMode g_mode, bool report_errors,`
			`const char* str8,`
Remove old code for string class (no longer needed) (#1354) * Remove old code for string class (no longer needed) Signed-off-by: Stefan Weil <sw@weilnetz.de> * Add std namespace to string class Signed-off-by: Stefan Weil <sw@weilnetz.de> 2018-03-03 21:36:28 +08:00			`std::vector<std::string>* graphemes);`
Major refactor of beam search, elimination of dead code, misc bug fixes, updates to Makefile.am, Changelog etc. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@878 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2013-09-23 23:26:50 +08:00
Fixes from pull of cleanups: clang tidied, reviewed, fixed new bugs, undeleted needed code. Probably breaks the build, due to some inclusion of changes in utf8/32 conversion 2017-07-15 00:30:14 +08:00			`// Applies just the OCR-specific normalizations and return the normalized char.`
Major refactor of beam search, elimination of dead code, misc bug fixes, updates to Makefile.am, Changelog etc. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@878 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2013-09-23 23:26:50 +08:00			`char32 OCRNormalize(char32 ch);`

			`// Returns true if the OCRNormalized ch1 and ch2 are the same.`
			`bool IsOCREquivalent(char32 ch1, char32 ch2);`

New training tool text2image git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@964 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2014-01-10 02:01:34 +08:00			`// Returns true if the value lies in the range of valid unicodes.`
			`bool IsValidCodepoint(const char32 ch);`

			`// Returns true a code point has the White_Space Unicode property.`
			`bool IsWhitespace(const char32 ch);`
			`// Returns true if every char in the given (null-terminated) string has the`
			`// White_Space Unicode property.`
			`bool IsUTF8Whitespace(const char* text);`

			`// Returns the length of bytes of the prefix of 'text' that have the White_Space`
			`// unicode property.`
Fixes from pull of cleanups: clang tidied, reviewed, fixed new bugs, undeleted needed code. Probably breaks the build, due to some inclusion of changes in utf8/32 conversion 2017-07-15 00:30:14 +08:00			`unsigned int SpanUTF8Whitespace(const char* text);`
New training tool text2image git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@964 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2014-01-10 02:01:34 +08:00
			`// Returns the length of bytes of the prefix of 'text' that DO NOT have the`
			`// White_Space unicode property.`
Fixes from pull of cleanups: clang tidied, reviewed, fixed new bugs, undeleted needed code. Probably breaks the build, due to some inclusion of changes in utf8/32 conversion 2017-07-15 00:30:14 +08:00			`unsigned int SpanUTF8NotWhitespace(const char* text);`
New training tool text2image git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@964 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2014-01-10 02:01:34 +08:00
			`// Returns true if the char is interchange valid i.e. no C0 or C1 control codes`
			`// (other than CR LF HT FF) and no non-characters.`
			`bool IsInterchangeValid(const char32 ch);`
			`// Same as above but restricted to 7-bit ASCII.`
			`bool IsInterchangeValid7BitAscii(const char32 ch);`

			`// Convert a full-width UTF-8 string to half-width.`
			`char32 FullwidthToHalfwidth(const char32 ch);`

Major refactor of beam search, elimination of dead code, misc bug fixes, updates to Makefile.am, Changelog etc. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@878 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2013-09-23 23:26:50 +08:00			`} // namespace tesseract`

			`#endif // TESSERACT_CCUTIL_NORMSTRNGS_H_`