mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-23 18:49:08 +08:00
Remaining misc changes for 3.02
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@658 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
23dfabcab1
commit
e0d735b122
63
ChangeLog
63
ChangeLog
@ -1,29 +1,43 @@
|
||||
2012-02-01 - v3.02
|
||||
* Moved ResultIterator/PageIterator to ccmain.
|
||||
* Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic.
|
||||
* Added paragraph detection in layout analysis/post OCR.
|
||||
* Fixed inconsistent xheight during training and over-chopping.
|
||||
* Added simultaneous multi-language capability.
|
||||
* Refactored top-level word recognition module.
|
||||
* Added experimental equation detector.
|
||||
* Improved handling of resolution from input images.
|
||||
* Blamer module added for error analysis.
|
||||
* Cleaned up externally used namespace by removing includes from baseapi.h.
|
||||
* Removed dead memory management code.
|
||||
* Tidied up constraints on control parameters.
|
||||
* Added support for ShapeTable in classifier and training.
|
||||
* Refactored class pruner.
|
||||
* Fixed training leaks and randomness.
|
||||
* Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding.
|
||||
* Improved line detection and removal.
|
||||
* Added fixed pitch chopper for CJK.
|
||||
* Added UNICHARSET to WERD_CHOICE to make multi-language handling easier.
|
||||
* Fixed problems with internally scaled images.
|
||||
* Added page and bbox to string in tr files to identify source of training data better.
|
||||
* Fixes to Hindi Shirorekha splitter.
|
||||
* Added word bigram correction.
|
||||
* Reduced stack memory consumption and eliminated some ugly typedefs.
|
||||
* Added new uniform classifier API.
|
||||
* Added new training error counter.
|
||||
* Fixed endian bug in dawg reader.
|
||||
* Many other fixes, including the way in which the chopper finds chops and messes with the outline while it does so.
|
||||
|
||||
2010-11-29 - V3.01
|
||||
* Removed old/dead serialise/deserialise methods on *LISTIZED classes.
|
||||
* Total rewrite of DENORM to better encapsulate operation and make
|
||||
for potential to extract features from images.
|
||||
* Thread-safety! Moved all critical globals and statics to
|
||||
members of the appropriate class. Tesseract is now
|
||||
thread-safe (multiple instances can be used in parallel
|
||||
in multiple threads.) with the minor exception that some
|
||||
control parameters are still global and affect all threads.
|
||||
* Added Cube, a new recognizer for Arabic. Cube can also be
|
||||
used in combination with normal Tesseract for other languages
|
||||
with an improvement in accuracy at the cost of (much) lower speed.
|
||||
There is no training module for Cube yet.
|
||||
* OcrEngineMode in Init replaces AccuracyVSpeed to control cube.
|
||||
* Greatly improved segmentation search with consequent accuracy and
|
||||
speed improvements, especially for Chinese.
|
||||
* Added PageIterator and ResultIterator as cleaner ways to get the
|
||||
full results out of Tesseract, that are not currently provided
|
||||
by any of the TessBaseAPI::Get* methods.
|
||||
All other methods, such as the ETEXT_STRUCT in particular are
|
||||
deprecated and will be deleted in the future.
|
||||
* ApplyBoxes totally rewritten to make training easier.
|
||||
It can now cope with touching/overlapping training characters,
|
||||
and a new boxfile format allows word boxes instead of character
|
||||
boxes, BUT to use that you have to have already bootstrapped the
|
||||
language with character boxes. "Cyclic dependency" on traineddata.
|
||||
* Thread-safety! Moved all critical globals and statics to members of the appropriate class. Tesseract is now thread-safe (multiple instances can be used in parallel in multiple threads.) with the minor exception that some control parameters are still global and affect all threads.
|
||||
* Added Cube, a new recognizer for Arabic. Cube can also be used in combination with normal Tesseract for other languages with an improvement in accuracy at the cost of (much) lower speed. *There is no training module for Cube yet.*
|
||||
* `OcrEngineMode` in `Init` replaces `AccuracyVSpeed` to control cube.
|
||||
* Greatly improved segmentation search with consequent accuracy and speed improvements, especially for Chinese.
|
||||
* Added `PageIterator` and `ResultIterator` as cleaner ways to get the full results out of Tesseract, that are not currently provided by any of the `TessBaseAPI::Get*` methods. All other methods, such as the `ETEXT_STRUCT` in particular are deprecated and will be deleted in the future.
|
||||
* ApplyBoxes totally rewritten to make training easier. It can now cope with touching/overlapping training characters, and a new boxfile format allows word boxes instead of character boxes, BUT to use that you have to have already bootstrapped the language with character boxes. "Cyclic dependency" on traineddata.
|
||||
* Auto orientation and script detection added to page layout analysis.
|
||||
* Deleted *lots* of dead code.
|
||||
* Fixxht module replaced with scalable data-driven module.
|
||||
@ -35,6 +49,11 @@
|
||||
* Handling of vertical text improved.
|
||||
* Handling of leader dots improved.
|
||||
* Table detection greatly improved.
|
||||
* Fixed a couple of memory leaks.
|
||||
* Fixed font labels on output text. (Not perfect, but a lot better than before.)
|
||||
* Cleanup and more bug fixes
|
||||
* Special treatments for Hindi.
|
||||
* Support for build in VS2010 with Microsoft Windows SDK for Windows 7 (thanks to Michael Lutz)
|
||||
|
||||
2010-09-21 - V3.00
|
||||
* Preparations for thread safety:
|
||||
|
45
ReleaseNotes
45
ReleaseNotes
@ -1,26 +1,24 @@
|
||||
= Tesseract release notes Feb 01 2012 - V3.02 =
|
||||
* Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic.
|
||||
* Added paragraph detection in layout analysis/post OCR.
|
||||
* Added simultaneous multi-language capability.
|
||||
* Added experimental equation detector.
|
||||
* Major improvements to layout analysis for better image detection, diacritic detection, better textline finding, better tabstop finding.
|
||||
* Improved line detection and removal.
|
||||
* Added fixed pitch chopper for CJK.
|
||||
* Added word bigram correction.
|
||||
* Added new uniform classifier API.
|
||||
* Added new training error counter.
|
||||
* More detailed changes recorded in ChangeLog.
|
||||
|
||||
|
||||
= Tesseract release notes Oct 21 2011 - V3.01 =
|
||||
* Thread-safety! Moved all critical globals and statics to
|
||||
members of the appropriate class. Tesseract is now
|
||||
thread-safe (multiple instances can be used in parallel
|
||||
in multiple threads.) with the minor exception that some
|
||||
control parameters are still global and affect all threads.
|
||||
* Added Cube, a new recognizer for Arabic. Cube can also be
|
||||
used in combination with normal Tesseract for other languages
|
||||
with an improvement in accuracy at the cost of (much) lower speed.
|
||||
There is no training module for Cube yet.
|
||||
* OcrEngineMode in Init replaces AccuracyVSpeed to control cube.
|
||||
* Greatly improved segmentation search with consequent accuracy and
|
||||
speed improvements, especially for Chinese.
|
||||
* Added PageIterator and ResultIterator as cleaner ways to get the
|
||||
full results out of Tesseract, that are not currently provided
|
||||
by any of the TessBaseAPI::Get* methods.
|
||||
All other methods, such as the ETEXT_STRUCT in particular are
|
||||
deprecated and will be deleted in the future.
|
||||
* ApplyBoxes totally rewritten to make training easier.
|
||||
It can now cope with touching/overlapping training characters,
|
||||
and a new boxfile format allows word boxes instead of character
|
||||
boxes, BUT to use that you have to have already bootstrapped the
|
||||
language with character boxes. "Cyclic dependency" on traineddata.
|
||||
* Thread-safety! Moved all critical globals and statics to members of the appropriate class. Tesseract is now thread-safe (multiple instances can be used in parallel in multiple threads.) with the minor exception that some control parameters are still global and affect all threads.
|
||||
* Added Cube, a new recognizer for Arabic. Cube can also be used in combination with normal Tesseract for other languages with an improvement in accuracy at the cost of (much) lower speed. *There is no training module for Cube yet.*
|
||||
* `OcrEngineMode` in `Init` replaces `AccuracyVSpeed` to control cube.
|
||||
* Greatly improved segmentation search with consequent accuracy and speed improvements, especially for Chinese.
|
||||
* Added `PageIterator` and `ResultIterator` as cleaner ways to get the full results out of Tesseract, that are not currently provided by any of the `TessBaseAPI::Get*` methods. All other methods, such as the `ETEXT_STRUCT` in particular are deprecated and will be deleted in the future.
|
||||
* ApplyBoxes totally rewritten to make training easier. It can now cope with touching/overlapping training characters, and a new boxfile format allows word boxes instead of character boxes, BUT to use that you have to have already bootstrapped the language with character boxes. "Cyclic dependency" on traineddata.
|
||||
* Auto orientation and script detection added to page layout analysis.
|
||||
* Deleted *lots* of dead code.
|
||||
* Fixxht module replaced with scalable data-driven module.
|
||||
@ -33,8 +31,7 @@
|
||||
* Handling of leader dots improved.
|
||||
* Table detection greatly improved.
|
||||
* Fixed a couple of memory leaks.
|
||||
* Fixed font labels on output text. (Not perfect, but a lot better than
|
||||
before.)
|
||||
* Fixed font labels on output text. (Not perfect, but a lot better than before.)
|
||||
* Cleanup and more bug fixes
|
||||
* Special treatments for Hindi.
|
||||
* Support for build in VS2010 with Microsoft Windows SDK for Windows 7 (thanks to Michael Lutz)
|
||||
|
@ -4,17 +4,17 @@ AM_CXXFLAGS = -DTESSDATA_PREFIX=@datadir@/
|
||||
EXTRA_DIST = mfcpch.cpp scanutils.cpp scanutils.h
|
||||
|
||||
include_HEADERS = \
|
||||
ambigs.h basedir.h bits16.h boxread.h \
|
||||
ambigs.h basedir.h bits16.h bitvector.h \
|
||||
tesscallback.h ccutil.h clst.h \
|
||||
elst2.h elst.h errcode.h \
|
||||
fileerr.h genericvector.h globaloc.h \
|
||||
hashfn.h helpers.h host.h hosthplb.h lsterr.h \
|
||||
memblk.h memry.h memryerr.h mfcpch.h \
|
||||
hashfn.h helpers.h host.h hosthplb.h indexmapbidi.h lsterr.h \
|
||||
memry.h mfcpch.h \
|
||||
ndminx.h notdll.h nwmain.h \
|
||||
ocrclass.h platform.h qrsequence.h \
|
||||
secname.h serialis.h sorthelper.h stderr.h strngs.h \
|
||||
tessdatamanager.h tprintf.h \
|
||||
unichar.h unicharmap.h unicharset.h unicity_table.h \
|
||||
unichar.h unicharmap.h unicharset.h unicity_table.h unicodes.h \
|
||||
params.h
|
||||
|
||||
if !USING_MULTIPLELIBS
|
||||
@ -25,14 +25,14 @@ libtesseract_ccutil_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION)
|
||||
endif
|
||||
|
||||
libtesseract_ccutil_la_SOURCES = \
|
||||
ambigs.cpp basedir.cpp bits16.cpp boxread.cpp \
|
||||
ambigs.cpp basedir.cpp bits16.cpp bitvector.cpp \
|
||||
ccutil.cpp clst.cpp \
|
||||
elst2.cpp elst.cpp errcode.cpp \
|
||||
globaloc.cpp hashfn.cpp \
|
||||
mainblk.cpp memblk.cpp memry.cpp \
|
||||
globaloc.cpp hashfn.cpp indexmapbidi.cpp \
|
||||
mainblk.cpp memry.cpp \
|
||||
serialis.cpp strngs.cpp \
|
||||
tessdatamanager.cpp tprintf.cpp \
|
||||
unichar.cpp unicharmap.cpp unicharset.cpp \
|
||||
unichar.cpp unicharmap.cpp unicharset.cpp unicodes.cpp \
|
||||
params.cpp
|
||||
|
||||
|
||||
|
@ -18,9 +18,6 @@
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
|
||||
#include "ambigs.h"
|
||||
#include "helpers.h"
|
||||
|
||||
@ -314,7 +311,7 @@ void UnicharAmbigs::InsertIntoTable(
|
||||
unichar_id = ambig_spec->correct_ngram_id;
|
||||
} else {
|
||||
STRING frag_str = CHAR_FRAGMENT::to_string(
|
||||
ReplacementString, i, TestAmbigPartSize);
|
||||
ReplacementString, i, TestAmbigPartSize, false);
|
||||
unicharset->unichar_insert(frag_str.string());
|
||||
unichar_id = unicharset->unichar_to_id(frag_str.string());
|
||||
}
|
||||
|
104
ccutil/bitvector.cpp
Normal file
104
ccutil/bitvector.cpp
Normal file
@ -0,0 +1,104 @@
|
||||
// Copyright 2011 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: bitvector.cpp
|
||||
// Description: Class replacement for BITVECTOR.
|
||||
// Author: Ray Smith
|
||||
// Created: Mon Jan 10 17:45:01 PST 2011
|
||||
//
|
||||
// (C) Copyright 2011, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "bitvector.h"
|
||||
#include <string.h>
|
||||
#include "helpers.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
BitVector::BitVector() : bit_size_(0), array_(NULL) {}
|
||||
|
||||
BitVector::BitVector(int length) : bit_size_(length) {
|
||||
array_ = new uinT32[WordLength()];
|
||||
SetAllFalse();
|
||||
}
|
||||
|
||||
BitVector::BitVector(const BitVector& src) : bit_size_(src.bit_size_) {
|
||||
array_ = new uinT32[WordLength()];
|
||||
memcpy(array_, src.array_, ByteLength());
|
||||
}
|
||||
|
||||
BitVector& BitVector::operator=(const BitVector& src) {
|
||||
Alloc(src.bit_size_);
|
||||
memcpy(array_, src.array_, ByteLength());
|
||||
return *this;
|
||||
}
|
||||
|
||||
BitVector::~BitVector() {
|
||||
delete [] array_;
|
||||
}
|
||||
|
||||
// Initializes the array to length * false.
|
||||
void BitVector::Init(int length) {
|
||||
Alloc(length);
|
||||
SetAllFalse();
|
||||
}
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
bool BitVector::Serialize(FILE* fp) const {
|
||||
if (fwrite(&bit_size_, sizeof(bit_size_), 1, fp) != 1) return false;
|
||||
int wordlen = WordLength();
|
||||
if (fwrite(array_, sizeof(*array_), wordlen, fp) != wordlen) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool BitVector::DeSerialize(bool swap, FILE* fp) {
|
||||
uinT32 new_bit_size;
|
||||
if (fread(&new_bit_size, sizeof(new_bit_size), 1, fp) != 1) return false;
|
||||
if (swap) {
|
||||
ReverseN(&new_bit_size, sizeof(new_bit_size));
|
||||
}
|
||||
Alloc(new_bit_size);
|
||||
int wordlen = WordLength();
|
||||
if (fread(array_, sizeof(*array_), wordlen, fp) != wordlen) return false;
|
||||
if (swap) {
|
||||
for (int i = 0; i < wordlen; ++i)
|
||||
ReverseN(&array_[i], sizeof(array_[i]));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void BitVector::SetAllFalse() {
|
||||
memset(array_, 0, ByteLength());
|
||||
}
|
||||
void BitVector::SetAllTrue() {
|
||||
memset(array_, ~0, ByteLength());
|
||||
}
|
||||
|
||||
// Allocates memory for a vector of the given length.
|
||||
// Reallocates if the array is a different size, larger or smaller.
|
||||
void BitVector::Alloc(int length) {
|
||||
int initial_wordlength = WordLength();
|
||||
bit_size_ = length;
|
||||
int new_wordlength = WordLength();
|
||||
if (new_wordlength != initial_wordlength) {
|
||||
delete [] array_;
|
||||
array_ = new uinT32[new_wordlength];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
|
116
ccutil/bitvector.h
Normal file
116
ccutil/bitvector.h
Normal file
@ -0,0 +1,116 @@
|
||||
// Copyright 2011 Google Inc. All Rights Reserved.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: bitvector.h
|
||||
// Description: Class replacement for BITVECTOR.
|
||||
// Author: Ray Smith
|
||||
// Created: Mon Jan 10 17:44:01 PST 2011
|
||||
//
|
||||
// (C) Copyright 2011, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
||||
#ifndef TESSERACT_CCUTIL_BITVECTOR_H__
|
||||
#define TESSERACT_CCUTIL_BITVECTOR_H__
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include "host.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// Trivial class to encapsulate a fixed-length array of bits, with
|
||||
// Serialize/DeSerialize. Replaces the old macros.
|
||||
class BitVector {
|
||||
public:
|
||||
BitVector();
|
||||
// Initializes the array to length * false.
|
||||
explicit BitVector(int length);
|
||||
BitVector(const BitVector& src);
|
||||
BitVector& operator=(const BitVector& src);
|
||||
~BitVector();
|
||||
|
||||
// Initializes the array to length * false.
|
||||
void Init(int length);
|
||||
|
||||
// Returns the number of bits that are accessible in the vector.
|
||||
int size() const {
|
||||
return bit_size_;
|
||||
}
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
bool Serialize(FILE* fp) const;
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool DeSerialize(bool swap, FILE* fp);
|
||||
|
||||
void SetAllFalse();
|
||||
void SetAllTrue();
|
||||
|
||||
// Accessors to set/reset/get bits.
|
||||
// The range of index is [0, size()-1].
|
||||
// There is debug-only bounds checking.
|
||||
void SetBit(int index) {
|
||||
array_[WordIndex(index)] |= BitMask(index);
|
||||
}
|
||||
void ResetBit(int index) {
|
||||
array_[WordIndex(index)] &= ~BitMask(index);
|
||||
}
|
||||
void SetValue(int index, bool value) {
|
||||
if (value)
|
||||
SetBit(index);
|
||||
else
|
||||
ResetBit(index);
|
||||
}
|
||||
bool At(int index) const {
|
||||
return (array_[WordIndex(index)] & BitMask(index)) != 0;
|
||||
}
|
||||
bool operator[](int index) const {
|
||||
return (array_[WordIndex(index)] & BitMask(index)) != 0;
|
||||
}
|
||||
|
||||
private:
|
||||
// Allocates memory for a vector of the given length.
|
||||
void Alloc(int length);
|
||||
|
||||
// Computes the index to array_ for the given index, with debug range
|
||||
// checking.
|
||||
int WordIndex(int index) const {
|
||||
assert(0 <= index && index < bit_size_);
|
||||
return index / kBitFactor;
|
||||
}
|
||||
// Returns a mask to select the appropriate bit for the given index.
|
||||
uinT32 BitMask(int index) const {
|
||||
return 1 << (index & (kBitFactor - 1));
|
||||
}
|
||||
// Returns the number of array elements needed to represent the current
|
||||
// bit_size_.
|
||||
int WordLength() const {
|
||||
return (bit_size_ + kBitFactor - 1) / kBitFactor;
|
||||
}
|
||||
// Returns the number of bytes consumed by the array_.
|
||||
int ByteLength() const {
|
||||
return WordLength() * sizeof(*array_);
|
||||
}
|
||||
|
||||
// Number of bits in this BitVector.
|
||||
uinT32 bit_size_;
|
||||
// Array of words used to pack the bits.
|
||||
uinT32* array_;
|
||||
// Number of bits in an array_ element.
|
||||
static const int kBitFactor = sizeof(uinT32) * 8;
|
||||
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_CCUTIL_BITVECTOR_H__
|
@ -1,138 +0,0 @@
|
||||
/**********************************************************************
|
||||
* File: boxread.cpp
|
||||
* Description: Read data from a box file.
|
||||
* Author: Ray Smith
|
||||
* Created: Fri Aug 24 17:47:23 PDT 2007
|
||||
*
|
||||
* (C) Copyright 2007, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include "mfcpch.h"
|
||||
#include <string.h>
|
||||
#include "boxread.h"
|
||||
#include "unichar.h"
|
||||
#include "tprintf.h"
|
||||
|
||||
// Special char code used to identify multi-blob labels.
|
||||
static const char* kMultiBlobLabelCode = "WordStr";
|
||||
|
||||
// Open the boxfile based on the given image filename.
|
||||
FILE* OpenBoxFile(const STRING& fname) {
|
||||
STRING filename = fname;
|
||||
const char *lastdot = strrchr(filename.string(), '.');
|
||||
if (lastdot != NULL)
|
||||
filename[lastdot - filename.string()] = '\0';
|
||||
|
||||
filename += ".box";
|
||||
FILE* box_file = NULL;
|
||||
if (!(box_file = fopen(filename.string(), "rb"))) {
|
||||
CANTOPENFILE.error("read_next_box", TESSEXIT,
|
||||
"Cant open box file %s",
|
||||
filename.string());
|
||||
}
|
||||
return box_file;
|
||||
}
|
||||
|
||||
// Box files are used ONLY DURING TRAINING, but by both processes of
|
||||
// creating tr files with tesseract, and unicharset_extractor.
|
||||
// read_next_box factors out the code to interpret a line of a box
|
||||
// file so that applybox and unicharset_extractor interpret the same way.
|
||||
// This function returns the next valid box file utf8 string and coords
|
||||
// and returns true, or false on eof (and closes the file).
|
||||
// It ignores the utf8 file signature, checks for valid utf-8 and allows
|
||||
// space or tab between fields.
|
||||
// utf8_str must be at least kBoxReadBufSize in length.
|
||||
// If there are page numbers in the file, it reads them all.
|
||||
bool read_next_box(int *line_number, FILE* box_file, char* utf8_str,
|
||||
int* x_min, int* y_min, int* x_max, int* y_max) {
|
||||
return read_next_box(-1, line_number, box_file, utf8_str,
|
||||
x_min, y_min, x_max, y_max);
|
||||
}
|
||||
|
||||
// As read_next_box above, but get a specific page number. (0-based)
|
||||
// Use -1 to read any page number. Files without page number all
|
||||
// read as if they are page 0.
|
||||
bool read_next_box(int target_page, int *line_number,
|
||||
FILE* box_file, char* utf8_str,
|
||||
int* x_min, int* y_min, int* x_max, int* y_max) {
|
||||
int count = 0;
|
||||
int page = 0;
|
||||
char buff[kBoxReadBufSize]; // boxfile read buffer
|
||||
char uch[kBoxReadBufSize];
|
||||
char *buffptr = buff;
|
||||
|
||||
while (fgets(buff, sizeof(buff) - 1, box_file)) {
|
||||
(*line_number)++;
|
||||
|
||||
buffptr = buff;
|
||||
const unsigned char *ubuf = reinterpret_cast<const unsigned char*>(buffptr);
|
||||
if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf)
|
||||
buffptr += 3; // Skip unicode file designation.
|
||||
// Check for blank lines in box file
|
||||
while (*buffptr == ' ' || *buffptr == '\t')
|
||||
buffptr++;
|
||||
if (*buffptr != '\0') {
|
||||
// Read the unichar without messing up on Tibetan.
|
||||
// According to issue 253 the utf-8 surrogates 85 and A0 are treated
|
||||
// as whitespace by sscanf, so it is more reliable to just find
|
||||
// ascii space and tab.
|
||||
int uch_len = 0;
|
||||
while (*buffptr != '\0' && *buffptr != ' ' && *buffptr != '\t')
|
||||
uch[uch_len++] = *buffptr++;
|
||||
uch[uch_len] = '\0';
|
||||
if (*buffptr != '\0') ++buffptr;
|
||||
count = sscanf(buffptr, "%d %d %d %d %d",
|
||||
x_min, y_min, x_max, y_max, &page);
|
||||
if (count != 5) {
|
||||
if (target_page <= 0) {
|
||||
// If target_page is negative or zero, allow lines with no page number
|
||||
page = 0;
|
||||
count = sscanf(buffptr, "%d %d %d %d", x_min, y_min, x_max, y_max);
|
||||
} else {
|
||||
tprintf("Box file format error on line %i; ignored\n", *line_number);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (target_page >= 0 && target_page != page)
|
||||
continue; // Not on the appropriate page.
|
||||
// Test for long space-delimited string label.
|
||||
if (strcmp(uch, kMultiBlobLabelCode) == 0 &&
|
||||
(buffptr = strchr(buffptr, '#')) != NULL) {
|
||||
strcpy(uch, buffptr + 1);
|
||||
chomp_string(uch);
|
||||
uch_len = strlen(uch);
|
||||
}
|
||||
// Validate UTF8 by making unichars with it.
|
||||
int used = 0;
|
||||
while (used < uch_len) {
|
||||
UNICHAR ch(uch + used, uch_len - used);
|
||||
int new_used = ch.utf8_len();
|
||||
if (new_used == 0) {
|
||||
tprintf("Bad UTF-8 str %s starts with 0x%02x at line %d, col %d\n",
|
||||
uch + used, uch[used], *line_number, used + 1);
|
||||
count = 0;
|
||||
break;
|
||||
}
|
||||
used += new_used;
|
||||
}
|
||||
if (count < 4 || used == 0) {
|
||||
tprintf("Box file format error on line %i; ignored\n", *line_number);
|
||||
} else {
|
||||
strncpy(utf8_str, uch, kBoxReadBufSize);
|
||||
return true; // Successfully read a box.
|
||||
}
|
||||
}
|
||||
}
|
||||
fclose(box_file);
|
||||
return false; // EOF
|
||||
}
|
@ -1,48 +0,0 @@
|
||||
/**********************************************************************
|
||||
* File: boxread.cpp
|
||||
* Description: Read data from a box file.
|
||||
* Author: Ray Smith
|
||||
* Created: Fri Aug 24 17:47:23 PDT 2007
|
||||
*
|
||||
* (C) Copyright 2007, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef TESSERACT_CCUTIL_BOXREAD_H__
|
||||
#define TESSERACT_CCUTIL_BOXREAD_H__
|
||||
|
||||
#include <stdio.h>
|
||||
#include "strngs.h"
|
||||
|
||||
// Size of buffer used to read a line from a box file.
|
||||
const int kBoxReadBufSize = 1024;
|
||||
|
||||
// Open the boxfile based on the given image filename.
|
||||
FILE* OpenBoxFile(const STRING& fname);
|
||||
|
||||
// read_next_box factors out the code to interpret a line of a box
|
||||
// file so that applybox and unicharset_extractor interpret the same way.
|
||||
// This function returns the next valid box file utf8 string and coords
|
||||
// and returns true, or false on eof (and closes the file).
|
||||
// It ignores the utf8 file signature, checks for valid utf-8 and allows
|
||||
// space or tab between fields.
|
||||
// utf8_str must be at least kBoxReadBufSize in length.
|
||||
// If there are page numbers in the file, it reads them all.
|
||||
bool read_next_box(int *line_number, FILE* box_file, char* utf8_str,
|
||||
int* x_min, int* y_min, int* x_max, int* y_max);
|
||||
// As read_next_box above, but get a specific page number. (0-based)
|
||||
// Use -1 to read any page number. Files without page number all
|
||||
// read as if they are page 0.
|
||||
bool read_next_box(int page, int *line_number, FILE* box_file, char* utf8_str,
|
||||
int* x_min, int* y_min, int* x_max, int* y_max);
|
||||
|
||||
#endif // TESSERACT_CCUTIL_BOXREAD_H__
|
@ -14,10 +14,10 @@ CCUtil::CCUtil() :
|
||||
#endif
|
||||
INT_INIT_MEMBER(ambigs_debug_level, 0, "Debug level for unichar ambiguities",
|
||||
&params_),
|
||||
BOOL_INIT_MEMBER(use_definite_ambigs_for_classifier, 0, "Use definite"
|
||||
" ambiguities when running character classifier", &params_),
|
||||
BOOL_INIT_MEMBER(use_ambigs_for_adaption, 0, "Use ambigs for deciding"
|
||||
" whether to adapt to a character", &params_) {
|
||||
BOOL_MEMBER(use_definite_ambigs_for_classifier, 0, "Use definite"
|
||||
" ambiguities when running character classifier", &params_),
|
||||
BOOL_MEMBER(use_ambigs_for_adaption, 0, "Use ambigs for deciding"
|
||||
" whether to adapt to a character", &params_) {
|
||||
}
|
||||
|
||||
CCUtil::~CCUtil() {
|
||||
|
@ -93,36 +93,6 @@ void CLIST::shallow_clear() { //destroy all links
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/***********************************************************************
|
||||
* CLIST::internal_deep_copy
|
||||
*
|
||||
* Used during explicit deep copy of a list. The "copier" function passed
|
||||
* allows each element to be correctly deep copied (assuming that each class
|
||||
* in the inheritance hierarchy properly deep copies its members). The
|
||||
* function passing technique is as for "internal_clear".
|
||||
**********************************************************************/
|
||||
|
||||
void
|
||||
//ptr to copier functn
|
||||
CLIST::internal_deep_copy (void *(*copier) (void *),
|
||||
const CLIST * list) { //list being copied
|
||||
CLIST_ITERATOR from_it ((CLIST *) list);
|
||||
CLIST_ITERATOR to_it(this);
|
||||
|
||||
#ifndef NDEBUG
|
||||
if (!this)
|
||||
NULL_OBJECT.error ("CLIST::internal_deep_copy", ABORT, NULL);
|
||||
if (!list)
|
||||
BAD_PARAMETER.error ("CLIST::internal_deep_copy", ABORT,
|
||||
"source list is NULL");
|
||||
#endif
|
||||
|
||||
for (from_it.mark_cycle_pt (); !from_it.cycled_list (); from_it.forward ())
|
||||
to_it.add_after_then_move (copier (from_it.data ()));
|
||||
}
|
||||
|
||||
|
||||
/***********************************************************************
|
||||
* CLIST::assign_to_sublist
|
||||
*
|
||||
|
@ -105,10 +105,6 @@ class DLLSYM CLIST
|
||||
last = from_list->last;
|
||||
}
|
||||
|
||||
//ptr to copier functn
|
||||
void internal_deep_copy (void *(*copier) (void *),
|
||||
const CLIST * list); //list being copied
|
||||
|
||||
void assign_to_sublist( //to this list
|
||||
CLIST_ITERATOR *start_it, //from list start
|
||||
CLIST_ITERATOR *end_it); //from list end
|
||||
@ -897,10 +893,6 @@ public: \
|
||||
void deep_clear() /* delete elements */ \
|
||||
{ CLIST::internal_deep_clear( &CLASSNAME##_c1_zapper ); } \
|
||||
\
|
||||
void deep_copy( /* become a deep */ \
|
||||
const CLASSNAME##_CLIST*list) /* copy of src list*/ \
|
||||
{ CLIST::internal_deep_copy( &CLASSNAME##_c1_copier, list ); } \
|
||||
\
|
||||
void operator=( /* prevent assign */ \
|
||||
const CLASSNAME##_CLIST&) \
|
||||
{ DONT_ASSIGN_LISTS.error( QUOTE_IT( CLASSNAME##_CLIST ), \
|
||||
@ -979,27 +971,5 @@ void* link) /*link to delete*/ \
|
||||
{ \
|
||||
delete (CLASSNAME *) link; \
|
||||
} \
|
||||
\
|
||||
\
|
||||
\
|
||||
/*********************************************************************** \
|
||||
* CLASSNAME##_c1_copier \
|
||||
* \
|
||||
* A function which can generate a new, deep copy of a CLASSNAME element. \
|
||||
* This is passed to the generic deep copy list member function so that when \
|
||||
* a list is copied the elements on the list are properly copied from the \
|
||||
* base class, even though we dont use a virtual function. \
|
||||
* \
|
||||
**********************************************************************/ \
|
||||
\
|
||||
DLLSYM void* CLASSNAME##_c1_copier( /*deep copy a link*/ \
|
||||
void* old_element) /*source link*/ \
|
||||
{ \
|
||||
CLASSNAME* new_element; \
|
||||
\
|
||||
new_element = new CLASSNAME; \
|
||||
*new_element = *((CLASSNAME*) old_element); \
|
||||
return (void*) new_element; \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -26,7 +26,6 @@
|
||||
#include <signal.h>
|
||||
#endif
|
||||
#include "tprintf.h"
|
||||
//#include "ipeerr.h"
|
||||
#include "errcode.h"
|
||||
|
||||
const ERRCODE BADERRACTION = "Illegal error action";
|
||||
@ -39,14 +38,12 @@ const ERRCODE BADERRACTION = "Illegal error action";
|
||||
* Makes use of error messages and numbers in a common place.
|
||||
*
|
||||
**********************************************************************/
|
||||
void
|
||||
ERRCODE::error ( //handle error
|
||||
const char *caller, //name of caller
|
||||
inT8 action, //action to take
|
||||
const char *format, ... //special message
|
||||
) const
|
||||
{
|
||||
va_list args; //variable args
|
||||
void ERRCODE::error( // handle error
|
||||
const char *caller, // name of caller
|
||||
TessErrorLogCode action, // action to take
|
||||
const char *format, ... // special message
|
||||
) const {
|
||||
va_list args; // variable args
|
||||
char msg[MAX_MSG];
|
||||
char *msgptr = msg;
|
||||
|
||||
@ -76,16 +73,6 @@ const char *format, ... //special message
|
||||
msgptr += sprintf (msgptr, "\n");
|
||||
|
||||
fprintf(stderr, msg);
|
||||
/*if ((strstr (message, "File") != NULL) ||
|
||||
(strstr (message, "file") != NULL))
|
||||
else if ((strstr (message, "List") != NULL) ||
|
||||
(strstr (message, "list") != NULL))
|
||||
else if ((strstr (message, "Memory") != NULL) ||
|
||||
(strstr (message, "memory") != NULL))
|
||||
global_abort_code = MEMORY_ABORT;
|
||||
else
|
||||
global_abort_code = NO_ABORT_CODE;
|
||||
*/
|
||||
|
||||
int* p = NULL;
|
||||
switch (action) {
|
||||
|
@ -23,10 +23,12 @@
|
||||
#include "host.h"
|
||||
|
||||
/*Control parameters for error()*/
|
||||
// The values match the former DBG/TESSLOG/TESSEXIT/ABORT macros. The macros
// themselves must not be defined alongside this enum: they would be expanded
// inside it and replace the enumerator names with integer literals.
enum TessErrorLogCode {
  DBG = -1,      /*log without alert */
  TESSLOG = 0,   /*alert user */
  TESSEXIT = 1,  /*exit after error */
  ABORT = 2      /*abort after error */
};
|
||||
|
||||
/* Explicit Error Abort codes */
|
||||
#define NO_ABORT_CODE 0
|
||||
@ -64,18 +66,17 @@
|
||||
#define SUBSUBLOC_TESS 1
|
||||
#define SUBSUBLOC_NN 2
|
||||
|
||||
class DLLSYM ERRCODE //error handler class
|
||||
{
|
||||
const char *message; //error message
|
||||
public:
|
||||
void error ( //error print function
|
||||
const char *caller, //function location
|
||||
inT8 action, //action to take
|
||||
const char *format, ... //fprintf format
|
||||
) const;
|
||||
ERRCODE(const char *string) {
|
||||
message = string;
|
||||
} //initialize with string
|
||||
class DLLSYM ERRCODE { // error handler class
|
||||
const char *message; // error message
|
||||
public:
|
||||
void error( // error print function
|
||||
const char *caller, // function location
|
||||
TessErrorLogCode action, // action to take
|
||||
const char *format, ... // fprintf format
|
||||
) const;
|
||||
ERRCODE(const char *string) {
|
||||
message = string;
|
||||
} // initialize with string
|
||||
};
|
||||
|
||||
const ERRCODE ASSERT_FAILED = "Assert failed";
|
||||
|
@ -71,6 +71,7 @@ class GenericVector {
|
||||
|
||||
// Return the object from an index.
|
||||
T &get(int index) const;
|
||||
T &back() const;
|
||||
T &operator[](int index) const;
|
||||
|
||||
// Return the index of the T object.
|
||||
@ -88,6 +89,10 @@ class GenericVector {
|
||||
int push_back(T object);
|
||||
void operator+=(T t);
|
||||
|
||||
// Push an element in the end of the array if the same
|
||||
// element is not already contained in the array.
|
||||
int push_back_new(T object);
|
||||
|
||||
// Push an element in the front of the array
|
||||
// Note: This function is O(n)
|
||||
int push_front(T object);
|
||||
@ -127,7 +132,7 @@ class GenericVector {
|
||||
void delete_data_pointers();
|
||||
|
||||
// This method clears the current object, then, does a shallow copy of
|
||||
// its argument, and finally invalidate its argument.
|
||||
// its argument, and finally invalidates its argument.
|
||||
// Callbacks are moved to the current object;
|
||||
void move(GenericVector<T>* from);
|
||||
|
||||
@ -197,7 +202,10 @@ class GenericVector {
|
||||
}
|
||||
// Searches the array (assuming sorted in ascending order, using sort()) for
|
||||
// an element equal to target and returns the index of the best candidate.
|
||||
// The return value is the largest index i such that data_[i] <= target or 0.
|
||||
// The return value is conceptually the largest index i such that
|
||||
// data_[i] <= target or 0 if target < the whole vector.
|
||||
// NOTE that this function uses operator> so really the return value is
|
||||
// the largest index i such that data_[i] > target is false.
|
||||
int binary_search(const T& target) const {
|
||||
int bottom = 0;
|
||||
int top = size_used_;
|
||||
@ -328,13 +336,13 @@ class PointerVector : public GenericVector<T*> {
|
||||
// Copy must be deep, as the pointers will be automatically deleted on
|
||||
// destruction.
|
||||
PointerVector(const PointerVector& other) {
  // Size the storage once, then deep-copy every element via operator+=.
  // The call is qualified with this-> so that it is looked up as a member
  // of this class template when it is instantiated.
  this->init(other.size());
  this->operator+=(other);
}
|
||||
PointerVector<T>& operator+=(const PointerVector& other) {
  // Freeze the element count first: if other is *this, size() grows as we
  // push and the loop would otherwise never finish.
  const int num_to_copy = other.size();
  this->reserve(this->size_used_ + num_to_copy);
  for (int i = 0; i < num_to_copy; ++i) {
    // Each pointed-to object is cloned exactly once with T's copy
    // constructor, so the two vectors never share a pointee.
    this->push_back(new T(*other.data_[i]));
  }
  return *this;
}
|
||||
@ -360,6 +368,28 @@ class PointerVector : public GenericVector<T*> {
|
||||
GenericVector<T*>::truncate(size);
|
||||
}
|
||||
|
||||
// Compact the vector by deleting elements for which delete_cb returns
|
||||
// true. delete_cb is a permanent callback and will be deleted.
|
||||
void compact(TessResultCallback1<bool, const T*>* delete_cb) {
  // Single pass: every element is shown to the callback exactly once, in
  // order. Rejected elements are deleted; kept elements are shifted down to
  // close the gaps. (Handling the leading run of kept elements in a separate
  // loop with a post-incremented index skipped the delete for the first
  // rejected element.)
  int new_size = 0;
  for (int old_index = 0; old_index < GenericVector<T*>::size_used_;
       ++old_index) {
    T* item = GenericVector<T*>::data_[old_index];
    if (delete_cb->Run(item)) {
      delete item;
    } else {
      GenericVector<T*>::data_[new_size++] = item;
    }
  }
  GenericVector<T*>::size_used_ = new_size;
  // delete_cb is a permanent callback owned by this call.
  delete delete_cb;
}
|
||||
|
||||
// Clear the array, calling the clear callback function if any.
|
||||
// All the owned callbacks are also deleted.
|
||||
// If you don't want the callbacks to be deleted, before calling clear, set
|
||||
@ -399,7 +429,7 @@ class PointerVector : public GenericVector<T*> {
|
||||
item = new T;
|
||||
if (!item->DeSerialize(swap, fp)) return false;
|
||||
}
|
||||
push_back(item);
|
||||
this->push_back(item);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@ -488,6 +518,12 @@ T &GenericVector<T>::operator[](int index) const {
|
||||
return data_[index];
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T &GenericVector<T>::back() const {
|
||||
ASSERT_HOST(size_used_ > 0);
|
||||
return data_[size_used_ - 1];
|
||||
}
|
||||
|
||||
// Return the object from an index.
|
||||
template <typename T>
|
||||
void GenericVector<T>::set(T t, int index) {
|
||||
@ -555,6 +591,14 @@ int GenericVector<T>::push_back(T object) {
|
||||
return index;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
int GenericVector<T>::push_back_new(T object) {
|
||||
int index = get_index(object);
|
||||
if (index >= 0)
|
||||
return index;
|
||||
return push_back(object);
|
||||
}
|
||||
|
||||
// Add an element in the array (front)
|
||||
template <typename T>
|
||||
int GenericVector<T>::push_front(T object) {
|
||||
@ -739,7 +783,7 @@ bool GenericVector<T>::DeSerializeClasses(bool swap, FILE* fp) {
|
||||
}
|
||||
|
||||
// This method clear the current object, then, does a shallow copy of
|
||||
// its argument, and finally invalindate its argument.
|
||||
// its argument, and finally invalidates its argument.
|
||||
template <typename T>
|
||||
void GenericVector<T>::move(GenericVector<T>* from) {
|
||||
this->clear();
|
||||
|
@ -31,8 +31,9 @@
|
||||
// Remove newline (if any) at the end of the string.
|
||||
inline void chomp_string(char *str) {
  // Index of the last character, or -1 for an empty string.
  int last_index = static_cast<int>(strlen(str)) - 1;
  // Strip every trailing '\n' and '\r', so "\r\n" endings and repeated
  // newlines are both removed.
  while (last_index >= 0 &&
         (str[last_index] == '\n' || str[last_index] == '\r')) {
    str[last_index--] = '\0';
  }
}
|
||||
|
||||
@ -87,6 +88,18 @@ inline void UpdateRange(const T1& x_lo, const T1& x_hi,
|
||||
*upper_bound = x_hi;
|
||||
}
|
||||
|
||||
// Intersect the range [*lower2, *upper2] with the range [lower1, upper1],
|
||||
// putting the result back in [*lower2, *upper2].
|
||||
// If non-intersecting ranges are given, we end up with *lower2 > *upper2.
|
||||
template<typename T>
inline void IntersectRange(const T& lower1, const T& upper1,
                           T* lower2, T* upper2) {
  T& lo = *lower2;
  T& hi = *upper2;
  // Tighten each end of [lo, hi] only when [lower1, upper1] is stricter.
  if (lower1 > lo) {
    lo = lower1;
  }
  if (upper1 < hi) {
    hi = upper1;
  }
}
|
||||
|
||||
// Proper modulo arithmetic operator. Returns a mod b that works for -ve a.
|
||||
// For any integer a and positive b, returns r : 0<=r<b and a=n*b + r for
|
||||
// some integer n.
|
||||
@ -99,8 +112,8 @@ inline int Modulo(int a, int b) {
|
||||
// counting at 0. With simple rounding 1/3 = 0, 0/3 = 0 -1/3 = 0, -2/3 = 0,
|
||||
// -3/3 = 0 and -4/3 = -1.
|
||||
// I want 1/3 = 0, 0/3 = 0, -1/3 = 0, -2/3 = -1, -3/3 = -1 and -4/3 = -1.
|
||||
// A negative b is handled by negating the result of dividing by -b.
|
||||
inline int DivRounded(int a, int b) {
  if (b < 0) {
    // Reduce to the positive-divisor case and flip the sign of the result.
    return -DivRounded(a, -b);
  }
  // Push a away from zero by half the divisor before the truncating division.
  const int half = b / 2;
  if (a >= 0) {
    return (a + half) / b;
  }
  return (a - half) / b;
}
|
||||
|
||||
|
@ -150,15 +150,6 @@ typedef unsigned char BOOL8;
|
||||
#define MIN_FLOAT32 ((float)1.17549435e-38)
|
||||
|
||||
// Defines
|
||||
|
||||
#ifndef OKAY
|
||||
#define OKAY 0
|
||||
#endif
|
||||
|
||||
#ifndef HPERR
|
||||
#define HPERR -1
|
||||
#endif
|
||||
|
||||
#ifndef TRUE
|
||||
#define TRUE 1
|
||||
#endif
|
||||
|
250
ccutil/indexmapbidi.cpp
Normal file
250
ccutil/indexmapbidi.cpp
Normal file
@ -0,0 +1,250 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: indexmapbidi.cpp
|
||||
// Description: Bi-directional mapping between a sparse and compact space.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
// Created: Tue Apr 06 11:33:59 PDT 2010
|
||||
//
|
||||
// (C) Copyright 2010, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "indexmapbidi.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
// SparseToCompact takes a sparse index to an index in the compact space.
|
||||
// Uses a binary search to find the result. For faster speed use
|
||||
// IndexMapBiDi, but that takes more memory.
|
||||
int IndexMap::SparseToCompact(int sparse_index) const {
|
||||
int result = compact_map_.binary_search(sparse_index);
|
||||
return compact_map_[result] == sparse_index ? result : -1;
|
||||
}
|
||||
|
||||
// Copy from the input.
|
||||
void IndexMap::CopyFrom(const IndexMap& src) {
  // Take over both the compact->sparse table and the recorded sparse size.
  compact_map_ = src.compact_map_;
  sparse_size_ = src.sparse_size_;
}
|
||||
void IndexMap::CopyFrom(const IndexMapBiDi& src) {
  // Only the compact->sparse table is copied; the sparse size is obtained
  // through src.SparseSize() rather than read from src.sparse_size_.
  compact_map_ = src.compact_map_;
  sparse_size_ = src.SparseSize();
}
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
bool IndexMap::Serialize(FILE* fp) const {
|
||||
inT32 sparse_size = sparse_size_;
|
||||
if (fwrite(&sparse_size, sizeof(sparse_size), 1, fp) != 1) return false;
|
||||
if (!compact_map_.Serialize(fp)) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool IndexMap::DeSerialize(bool swap, FILE* fp) {
|
||||
inT32 sparse_size;
|
||||
if (fread(&sparse_size, sizeof(sparse_size), 1, fp) != 1) return false;
|
||||
if (swap)
|
||||
ReverseN(&sparse_size, sizeof(sparse_size));
|
||||
sparse_size_ = sparse_size;
|
||||
if (!compact_map_.DeSerialize(swap, fp)) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Top-level init function in a single call to initialize a map to select
|
||||
// a single contiguous subrange [start, end) of the sparse space to be mapped
|
||||
// 1 to 1 to the compact space, with all other elements of the sparse space
|
||||
// left unmapped.
|
||||
// No need to call Setup after this.
|
||||
void IndexMapBiDi::InitAndSetupRange(int sparse_size, int start, int end) {
|
||||
Init(sparse_size, false);
|
||||
for (int i = start; i < end; ++i)
|
||||
SetMap(i, true);
|
||||
Setup();
|
||||
}
|
||||
|
||||
// Initializes just the sparse_map_ to the given size with either all
|
||||
// forward indices mapped (all_mapped = true) or none (all_mapped = false).
|
||||
// Call Setup immediately after, or make calls to SetMap first to adjust the
|
||||
// mapping and then call Setup before using the map.
|
||||
void IndexMapBiDi::Init(int size, bool all_mapped) {
  // Every entry starts out unmapped (-1).
  sparse_map_.init_to_size(size, -1);
  if (!all_mapped) {
    return;
  }
  // Fully mapped: each sparse index maps to itself.
  for (int index = 0; index < size; ++index) {
    sparse_map_[index] = index;
  }
}
|
||||
|
||||
// Sets a given index in the sparse_map_ to be mapped or not.
|
||||
void IndexMapBiDi::SetMap(int sparse_index, bool mapped) {
  // Only the sign matters at this stage: Setup() later replaces every
  // non-negative entry with its real compact index.
  if (mapped) {
    sparse_map_[sparse_index] = 0;
  } else {
    sparse_map_[sparse_index] = -1;
  }
}
|
||||
|
||||
// Sets up the sparse_map_ and compact_map_ properly after Init and
|
||||
// some calls to SetMap. Assumes an ordered 1-1 map from set indices
|
||||
// in the forward map to the compact space.
|
||||
void IndexMapBiDi::Setup() {
  const int sparse_count = sparse_map_.size();
  // Pass 1: number the mapped (non-negative) entries consecutively in
  // sparse order.
  int next_compact = 0;
  for (int s = 0; s < sparse_count; ++s) {
    if (sparse_map_[s] < 0) continue;
    sparse_map_[s] = next_compact;
    ++next_compact;
  }
  // Pass 2: build the inverse table, compact index -> sparse index.
  compact_map_.init_to_size(next_compact, -1);
  for (int s = 0; s < sparse_count; ++s) {
    const int c = sparse_map_[s];
    if (c >= 0) {
      compact_map_[c] = s;
    }
  }
  sparse_size_ = sparse_map_.size();
}
|
||||
|
||||
// Copy from the input.
|
||||
void IndexMapBiDi::CopyFrom(const IndexMapBiDi& src) {
  // Copy both directions of the mapping.
  compact_map_ = src.compact_map_;
  sparse_map_ = src.sparse_map_;
  // The sparse size is taken from the freshly copied sparse map.
  sparse_size_ = sparse_map_.size();
}
|
||||
|
||||
// Merges the two compact space indices. May be called many times, but
|
||||
// the merges must be concluded by a call to CompleteMerges.
|
||||
// Returns true if a merge was actually performed.
|
||||
bool IndexMapBiDi::Merge(int compact_index1, int compact_index2) {
  // Resolve both arguments to their current master indices.
  int lower = MasterCompactIndex(compact_index1);
  int upper = MasterCompactIndex(compact_index2);
  if (lower == upper) {
    return false;  // Same master already: nothing to merge.
  }
  if (lower > upper) {
    const int swapped = lower;
    lower = upper;
    upper = swapped;
  }
  // Rather than rewriting every sparse_map_ entry, redirect only the master
  // entry of `upper` to `lower`. This can leave a chain of parents that
  // MasterCompactIndex has to follow.
  sparse_map_[compact_map_[upper]] = lower;
  // `lower` can be negative when a master lookup returned a negative index;
  // then there is no compact_map_ entry to copy from.
  if (lower >= 0) {
    compact_map_[upper] = compact_map_[lower];
  }
  return true;
}
|
||||
|
||||
// Completes one or more Merge operations by further compacting the
|
||||
// compact space. Unused compact space indices are removed, and the used
|
||||
// ones above shuffled down to fill the gaps.
|
||||
// Example:
|
||||
// Input sparse_map_: (x indicates -1)
|
||||
// x x 0 x 2 x x 4 x 0 x 2 x
|
||||
// Output sparse_map_:
|
||||
// x x 0 x 1 x x 2 x 0 x 1 x
|
||||
// Output compact_map_:
|
||||
// 2 4 7.
|
||||
void IndexMapBiDi::CompleteMerges() {
|
||||
// Ensure each sparse_map_entry contains a master compact_map_ index.
|
||||
int compact_size = 0;
|
||||
for (int i = 0; i < sparse_map_.size(); ++i) {
|
||||
int compact_index = MasterCompactIndex(sparse_map_[i]);
|
||||
sparse_map_[i] = compact_index;
|
||||
if (compact_index >= compact_size)
|
||||
compact_size = compact_index + 1;
|
||||
}
|
||||
// Re-generate the compact_map leaving holes for unused indices.
|
||||
compact_map_.init_to_size(compact_size, -1);
|
||||
for (int i = 0; i < sparse_map_.size(); ++i) {
|
||||
if (sparse_map_[i] >= 0) {
|
||||
if (compact_map_[sparse_map_[i]] == -1)
|
||||
compact_map_[sparse_map_[i]] = i;
|
||||
}
|
||||
}
|
||||
// Compact the compact_map, leaving tmp_compact_map saying where each
|
||||
// index went to in the compacted map.
|
||||
GenericVector<inT32> tmp_compact_map;
|
||||
tmp_compact_map.init_to_size(compact_size, -1);
|
||||
compact_size = 0;
|
||||
for (int i = 0; i < compact_map_.size(); ++i) {
|
||||
if (compact_map_[i] >= 0) {
|
||||
tmp_compact_map[i] = compact_size;
|
||||
compact_map_[compact_size++] = compact_map_[i];
|
||||
}
|
||||
}
|
||||
compact_map_.truncate(compact_size);
|
||||
// Now modify the entries in the sparse map to point to the new locations.
|
||||
for (int i = 0; i < sparse_map_.size(); ++i) {
|
||||
if (sparse_map_[i] >= 0) {
|
||||
sparse_map_[i] = tmp_compact_map[sparse_map_[i]];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
bool IndexMapBiDi::Serialize(FILE* fp) const {
|
||||
if (!IndexMap::Serialize(fp)) return false;
|
||||
// Make a vector containing the rest of the map. If the map is many-to-one
|
||||
// then each additional sparse entry needs to be stored.
|
||||
// Normally we store only the compact map to save space.
|
||||
GenericVector<inT32> remaining_pairs;
|
||||
for (int i = 0; i < sparse_map_.size(); ++i) {
|
||||
if (sparse_map_[i] >= 0 && compact_map_[sparse_map_[i]] != i) {
|
||||
remaining_pairs.push_back(i);
|
||||
remaining_pairs.push_back(sparse_map_[i]);
|
||||
}
|
||||
}
|
||||
if (!remaining_pairs.Serialize(fp)) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool IndexMapBiDi::DeSerialize(bool swap, FILE* fp) {
|
||||
if (!IndexMap::DeSerialize(swap, fp)) return false;
|
||||
GenericVector<inT32> remaining_pairs;
|
||||
if (!remaining_pairs.DeSerialize(swap, fp)) return false;
|
||||
sparse_map_.init_to_size(sparse_size_, -1);
|
||||
for (int i = 0; i < compact_map_.size(); ++i) {
|
||||
sparse_map_[compact_map_[i]] = i;
|
||||
}
|
||||
for (int i = 0; i < remaining_pairs.size(); ++i) {
|
||||
int sparse_index = remaining_pairs[i++];
|
||||
sparse_map_[sparse_index] = remaining_pairs[i];
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Bulk calls to SparseToCompact.
|
||||
// Maps the given array of sparse indices to an array of compact indices.
|
||||
// Assumes the input is sorted. The output indices are sorted and uniqued.
|
||||
// Return value is the number of "missed" features, being features that
|
||||
// don't map to the compact feature space.
|
||||
int IndexMapBiDi::MapFeatures(const GenericVector<int>& sparse,
|
||||
GenericVector<int>* compact) const {
|
||||
compact->truncate(0);
|
||||
int num_features = sparse.size();
|
||||
int missed_features = 0;
|
||||
int prev_good_feature = -1;
|
||||
for (int f = 0; f < num_features; ++f) {
|
||||
int feature = sparse_map_[sparse[f]];
|
||||
if (feature >= 0) {
|
||||
if (feature != prev_good_feature) {
|
||||
compact->push_back(feature);
|
||||
prev_good_feature = feature;
|
||||
}
|
||||
} else {
|
||||
++missed_features;
|
||||
}
|
||||
}
|
||||
return missed_features;
|
||||
}
|
||||
|
||||
} // namespace tesseract.
|
||||
|
180
ccutil/indexmapbidi.h
Normal file
180
ccutil/indexmapbidi.h
Normal file
@ -0,0 +1,180 @@
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: indexmapbidi.h
|
||||
// Description: Bi-directional mapping between a sparse and compact space.
|
||||
// Author: rays@google.com (Ray Smith)
|
||||
// Created: Tue Apr 06 11:33:59 PDT 2010
|
||||
//
|
||||
// (C) Copyright 2010, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifndef TESSERACT_CCUTIL_INDEXMAPBIDI_H_
|
||||
#define TESSERACT_CCUTIL_INDEXMAPBIDI_H_
|
||||
|
||||
#include <stdio.h>
|
||||
#include "genericvector.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
class IndexMapBiDi;
|
||||
|
||||
// Bidirectional one-to-one mapping between a sparse and a compact discrete
|
||||
// space. Many entries in the sparse space are unmapped, but those that are
|
||||
// mapped have a 1-1 mapping to (and from) the compact space, where all
|
||||
// values are used. This is useful for forming subsets of larger collections,
|
||||
// such as subsets of character sets, or subsets of binary feature spaces.
|
||||
//
|
||||
// This base class provides basic functionality with binary search for the
|
||||
// SparseToCompact mapping to save memory.
|
||||
// For a faster inverse mapping, or to allow a many-to-one mapping, use
|
||||
// IndexMapBiDi below.
|
||||
// NOTE: there are currently no methods to setup an IndexMap on its own!
|
||||
// It must be initialized by copying from an IndexMapBiDi or by DeSerialize.
|
||||
class IndexMap {
|
||||
public:
|
||||
virtual ~IndexMap() {}
|
||||
|
||||
// SparseToCompact takes a sparse index to an index in the compact space.
|
||||
// Uses a binary search to find the result. For faster speed use
|
||||
// IndexMapBiDi, but that takes more memory.
|
||||
virtual int SparseToCompact(int sparse_index) const;
|
||||
|
||||
// CompactToSparse takes a compact index to the corresponding index in the
|
||||
// sparse space.
|
||||
int CompactToSparse(int compact_index) const {
|
||||
return compact_map_[compact_index];
|
||||
}
|
||||
// The size of the sparse space.
|
||||
virtual int SparseSize() const {
|
||||
return sparse_size_;
|
||||
}
|
||||
// The size of the compact space.
|
||||
int CompactSize() const {
|
||||
return compact_map_.size();
|
||||
}
|
||||
|
||||
// Copy from the input.
|
||||
void CopyFrom(const IndexMap& src);
|
||||
void CopyFrom(const IndexMapBiDi& src);
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
bool Serialize(FILE* fp) const;
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool DeSerialize(bool swap, FILE* fp);
|
||||
|
||||
protected:
|
||||
// The sparse space covers integers in the range [0, sparse_size_-1].
|
||||
int sparse_size_;
|
||||
// The compact space covers integers in the range [0, compact_map_.size()-1].
|
||||
// Each element contains the corresponding sparse index.
|
||||
GenericVector<inT32> compact_map_;
|
||||
};
|
||||
|
||||
// Bidirectional many-to-one mapping between a sparse and a compact discrete
|
||||
// space. As with IndexMap, many entries may be unmapped, but unlike IndexMap,
|
||||
// of those that are, many may be mapped to the same compact index.
|
||||
// If the map is many-to-one, it is not possible to directly obtain all the
|
||||
// sparse indices that map to a single compact index.
|
||||
// This map is time- rather than space-efficient. It stores the entire sparse
|
||||
// space.
|
||||
// IndexMapBiDi may be initialized in one of 3 ways:
|
||||
// 1. Init(size, true);
|
||||
// Setup();
|
||||
// Sets a complete 1:1 mapping with no unmapped elements.
|
||||
// 2. Init(size, false);
|
||||
// for ... SetMap(index, true);
|
||||
// Setup();
|
||||
// Specifies precisely which sparse indices are mapped. The mapping is 1:1.
|
||||
// 3. Either of the above, followed by:
|
||||
// for ... Merge(index1, index2);
|
||||
// CompleteMerges();
|
||||
// Allows a many-to-one mapping by merging compact space indices.
|
||||
class IndexMapBiDi : public IndexMap {
|
||||
public:
|
||||
virtual ~IndexMapBiDi() {}
|
||||
|
||||
// Top-level init function in a single call to initialize a map to select
|
||||
// a single contiguous subrange [start, end) of the sparse space to be mapped
|
||||
// 1 to 1 to the compact space, with all other elements of the sparse space
|
||||
// left unmapped.
|
||||
// No need to call Setup after this.
|
||||
void InitAndSetupRange(int sparse_size, int start, int end);
|
||||
|
||||
// Initializes just the sparse_map_ to the given size with either all
|
||||
// forward indices mapped (all_mapped = true) or none (all_mapped = false).
|
||||
// Call Setup immediately after, or make calls to SetMap first to adjust the
|
||||
// mapping and then call Setup before using the map.
|
||||
void Init(int size, bool all_mapped);
|
||||
// Sets a given index in the sparse_map_ to be mapped or not.
|
||||
void SetMap(int sparse_index, bool mapped);
|
||||
// Sets up the sparse_map_ and compact_map_ properly after Init and
|
||||
// some calls to SetMap. Assumes an ordered 1-1 map from set indices
|
||||
// in the sparse space to the compact space.
|
||||
void Setup();
|
||||
|
||||
// Merges the two compact space indices. May be called many times, but
|
||||
// the merges must be concluded by a call to CompleteMerges.
|
||||
// Returns true if a merge was actually performed.
|
||||
bool Merge(int compact_index1, int compact_index2);
|
||||
// Returns true if the given compact index has been deleted.
|
||||
bool IsCompactDeleted(int index) const {
|
||||
return MasterCompactIndex(index) < 0;
|
||||
}
|
||||
// Completes one or more Merge operations by further compacting the
|
||||
// compact space.
|
||||
void CompleteMerges();
|
||||
|
||||
// SparseToCompact takes a sparse index to an index in the compact space.
|
||||
virtual int SparseToCompact(int sparse_index) const {
|
||||
return sparse_map_[sparse_index];
|
||||
}
|
||||
// The size of the sparse space.
|
||||
virtual int SparseSize() const {
|
||||
return sparse_map_.size();
|
||||
}
|
||||
|
||||
// Copy from the input.
|
||||
void CopyFrom(const IndexMapBiDi& src);
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
bool Serialize(FILE* fp) const;
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool DeSerialize(bool swap, FILE* fp);
|
||||
|
||||
// Bulk calls to SparseToCompact.
|
||||
// Maps the given array of sparse indices to an array of compact indices.
|
||||
// Assumes the input is sorted. The output indices are sorted and uniqued.
|
||||
// Return value is the number of "missed" features, being features that
|
||||
// don't map to the compact feature space.
|
||||
int MapFeatures(const GenericVector<int>& sparse,
|
||||
GenericVector<int>* compact) const;
|
||||
|
||||
private:
|
||||
// Returns the master compact index for a given compact index.
|
||||
// During a multiple merge operation, several compact indices may be
|
||||
// combined, so we need to be able to find the master of all.
|
||||
int MasterCompactIndex(int compact_index) const {
|
||||
while (compact_index >= 0 &&
|
||||
sparse_map_[compact_map_[compact_index]] != compact_index)
|
||||
compact_index = sparse_map_[compact_map_[compact_index]];
|
||||
return compact_index;
|
||||
}
|
||||
|
||||
// Direct look-up of the compact index for each element in sparse space.
|
||||
GenericVector<inT32> sparse_map_;
|
||||
};
|
||||
|
||||
} // namespace tesseract.
|
||||
|
||||
#endif // TESSERACT_CCUTIL_INDEXMAPBIDI_H_
|
@ -43,193 +43,11 @@
|
||||
#define MAX_OCR_NAME 32 /*name of engine */
|
||||
#define MAX_OCR_VERSION 17 /*version code of engine */
|
||||
|
||||
/*Image parameters*/
|
||||
#define MIN_IMAGE_SIZE 64 /*smallest image that will be passed */
|
||||
#define IMAGE_ROUNDING 32 /*all sizes are multiple of this */
|
||||
|
||||
#if defined(__SLOW_TIMES__)
|
||||
/*Maximum timeouts of various functions (in secs)*/
|
||||
#define STARTUP_TIMEOUT 100 /*start of OCR engine */
|
||||
#define SHUTDOWN_TIMEOUT 50 /*end of OCR engine */
|
||||
#define SENDIM_TIMEOUT 50 /*send of image */
|
||||
#define RELEASE_TIMEOUT 50 /*release of semaphore */
|
||||
#define READIM_TIMEOUT 100 /*read of image */
|
||||
#define READTEXT_TIMEOUT 50 /*read of text */
|
||||
#define PROGRESS_TIMEOUT 30 /*progress every 3 seconds */
|
||||
#define BADTIMES_TIMEOUT 7 /*max lack of progress */
|
||||
#else
|
||||
/*Maximum timeouts of various functions (in secs)*/
|
||||
#define STARTUP_TIMEOUT 10 /*start of OCR engine */
|
||||
#define SHUTDOWN_TIMEOUT 6 /*end of OCR engine */
|
||||
#define SENDIM_TIMEOUT 5 /*send of image */
|
||||
#define RELEASE_TIMEOUT 5 /*release of semaphore */
|
||||
#define READIM_TIMEOUT 10 /*read of image */
|
||||
#define READTEXT_TIMEOUT 5 /*read of text */
|
||||
#define PROGRESS_TIMEOUT 3 /*progress every 3 seconds */
|
||||
#define BADTIMES_TIMEOUT 7 /*max lack of progress */
|
||||
#endif
|
||||
|
||||
/*language definitions are identical to RTF*/
|
||||
#define LANGE_NONE 0x0400 /*no language */
|
||||
#define LANGE_ALBANIAN 0x041c /*Albanian */
|
||||
#define LANGE_BRITISH 0x0809 /*International English */
|
||||
#define LANGE_BULGARIAN 0x0402 /*Bulgarian */
|
||||
#define LANGE_CROATIAN 0x041a /*Croatian(latin alphabet) */
|
||||
#define LANGE_CZECH 0x0405 /*Czech */
|
||||
#define LANGE_DANISH 0x0406 /*Danish */
|
||||
#define LANGE_DUTCH 0x0413 /*Dutch */
|
||||
#define LANGE_FINNISH 0x040b /*Finnish */
|
||||
#define LANGE_FRENCH 0x040c /*French */
|
||||
#define LANGE_GERMAN 0x0407 /*German */
|
||||
#define LANGE_GREEK 0x0408 /*Greek */
|
||||
#define LANGE_HUNGARIAN 0x040e /*Hungarian */
|
||||
#define LANGE_ITALIAN 0x0410 /*Italian */
|
||||
#define LANGE_JAPANESE 0x0411 /*Japanese */
|
||||
#define LANGE_KOREAN 0x0412 /*Korean */
|
||||
#define LANGE_NORWEGIAN 0x0414 /*Bokmal */
|
||||
#define LANGE_POLISH 0x0415 /*Polish */
|
||||
#define LANGE_PORTUGESE 0x0416 /*Brazilian Portuguese */
|
||||
#define LANGE_ROMANIAN 0x0418 /*Romanian */
|
||||
#define LANGE_RUSSIAN 0x0419 /*Russian */
|
||||
#define LANGE_SCHINESE 0x0804 /*Simplified Chinese */
|
||||
#define LANGE_SLOVAK 0x041b /*Slovak */
|
||||
#define LANGE_SPANISH 0x040a /*Castilian */
|
||||
#define LANGE_SWEDISH 0x041d /*Swedish */
|
||||
#define LANGE_TCHINESE 0x0404 /*Traditional Chinese */
|
||||
#define LANGE_TURKISH 0x041f /*Turkish */
|
||||
#define LANGE_USENGLISH 0x0409 /*American */
|
||||
|
||||
/*font family definitions are identical to RTF*/
|
||||
#define FFAM_NONE 0 /*unknown */
|
||||
#define FFAM_ROMAN 1 /*serifed prop */
|
||||
#define FFAM_SWISS 2 /*sans-serif prop */
|
||||
#define FFAM_MODERN 3 /*fixed pitch */
|
||||
|
||||
/*character set definitions are identical to RTF*/
|
||||
#define CHSET_ANSI 0 /*Ansi efigs */
|
||||
#define CHSET_SHIFT_JIS 128 /*JIS X 0208-1990 */
|
||||
#define CHSET_KOREAN 129 /*KS C 5601-1992 */
|
||||
#define CHSET_SCHINESE 134 /*GB 2312-80 */
|
||||
#define CHSET_BIG5 136 /*Big Five */
|
||||
#define CHSET_CYRILLIC 204 /*Cyrillic */
|
||||
#define CHSET_EEUROPE 238 /*Eastern Europe */
|
||||
|
||||
/*pitch set definitions are identical to RTF*/
|
||||
#define PITCH_DEF 0 /*default */
|
||||
#define PITCH_FIXED 1 /*fixed pitch */
|
||||
#define PITCH_VAR 2 /*variable pitch */
|
||||
|
||||
/*Bitmasks for character enhancements.
|
||||
OR these together for enhancement in ocr_append_char*/
|
||||
#define EUC_BOLD 1 /*bold character */
|
||||
#define EUC_ITALIC 2 /*italic char */
|
||||
#define EUC_UNDERLINE 4 /*underlined char */
|
||||
#define EUC_SUBSCRIPT 8 /*subscript char */
|
||||
#define EUC_SUPERSCRIPT 16 /*superscript char */
|
||||
|
||||
/*enum for character rendering direction*/
|
||||
enum OCR_CHAR_DIRECTION {
|
||||
OCR_CDIR_RIGHT_LEFT, /*right to left horizontal */
|
||||
OCR_CDIR_LEFT_RIGHT, /*left to right horizontal */
|
||||
OCR_CDIR_TOP_BOTTOM, /*top to bottom vertical */
|
||||
OCR_CDIR_BOTTOM_TOP /*bottom to top vertical */
|
||||
};
|
||||
|
||||
/*enum for line rendering direction*/
|
||||
enum OCR_LINE_DIRECTION {
|
||||
OCR_LDIR_DOWN_RIGHT, /*horizontal lines go down */
|
||||
/*vertical lines go right */
|
||||
OCR_LDIR_UP_LEFT /*horizontal lines go up */
|
||||
};
|
||||
|
||||
/*enum for newline type*/
|
||||
enum OCR_NEWLINE_TYPE {
|
||||
OCR_NL_NONE, /*not a newline */
|
||||
OCR_NL_NEWLINE, /*this is a newline but not new para */
|
||||
OCR_NL_NEWPARA /*this is a newline and a new para */
|
||||
};
|
||||
|
||||
/*error codes that can be returned from the API functions other than OKAY
|
||||
and HPERR*/
|
||||
#define OCR_API_NO_MEM (-2) /*filled output buffer */
|
||||
#define OCR_API_BAD_CHAR (-3) /*whitespace sent to ocr_append_char */
|
||||
#define OCR_API_BAD_STATE (-4) /*invalid call sequence */
|
||||
|
||||
/*error codes used for passing errors back to the HP side*/
|
||||
enum OCR_ERR_CODE {
|
||||
OCR_ERR_NONE, /*no error */
|
||||
OCR_ERR_CLEAN_EXIT, /*no error */
|
||||
OCR_ERR_NO_MEM, /*out of memory */
|
||||
OCR_ERR_FILE_READ, /*failed to read data file */
|
||||
OCR_ERR_TMP_WRITE, /*failed to write temp file */
|
||||
OCR_ERR_TMP_READ, /*failed to read temp file */
|
||||
OCR_ERR_BAD_DLL, /*missing or invalid dll subcomponent */
|
||||
OCR_ERR_BAD_EXE, /*missing or invalid exe subcomponent */
|
||||
OCR_ERR_BAD_LOAD, /*failed to load subcomponent */
|
||||
OCR_ERR_BAD_LANG, /*unable to recognize requested language */
|
||||
OCR_ERR_BAD_STATE, /*engine did call out of sequence */
|
||||
OCR_ERR_INTERNAL1, /*internal error type 1 */
|
||||
OCR_ERR_INTERNAL2, /*internal error type 2 */
|
||||
OCR_ERR_INTERNAL3, /*internal error type 3 */
|
||||
OCR_ERR_INTERNAL4, /*internal error type 4 */
|
||||
OCR_ERR_INTERNAL5, /*internal error type 5 */
|
||||
OCR_ERR_INTERNAL6, /*internal error type 6 */
|
||||
OCR_ERR_INTERNAL7, /*internal error type 7 */
|
||||
OCR_ERR_INTERNAL8, /*internal error type 8 */
|
||||
OCR_ERR_TIMEOUT /*timed out in comms */
|
||||
}; /*for calls to ocr_error */
|
||||
|
||||
/**********************************************************************
|
||||
* EFONT_DESC
|
||||
* Description of one font.
|
||||
* The information required is basically that used by RTF.
|
||||
* The name may be either a valid font on the system or the empty string.
|
||||
**********************************************************************/
|
||||
|
||||
typedef struct { /*font description */
|
||||
uinT16 language; /*default language */
|
||||
uinT8 font_family; /*serif/not, fixed/not */
|
||||
uinT8 char_set; /*character set standard */
|
||||
uinT8 pitch; /*fixed or prop */
|
||||
inT8 name[MAX_FONT_NAME + 1]; /*plain ascii name */
|
||||
} EFONT_DESC; /*font description */
|
||||
|
||||
/**********************************************************************
|
||||
* EOCR_DESC
|
||||
* Description of the OCR engine provided at startup.
|
||||
* The name and version may be reported to the user at some point.
|
||||
* The fonts array should indicate the fonts that the OCR system
|
||||
* can recognize.
|
||||
**********************************************************************/
|
||||
|
||||
typedef struct { /*startup info */
|
||||
inT32 protocol; /*interface version */
|
||||
uinT32 font_count; /*number of fonts */
|
||||
uinT16 language; /*default language */
|
||||
uinT16 name[MAX_OCR_NAME + 1]; /*name of engine */
|
||||
/*version of engine */
|
||||
uinT16 version[MAX_OCR_VERSION + 1];
|
||||
EFONT_DESC fonts[1]; /*array of fonts */
|
||||
} EOCR_DESC; /*startup info */
|
||||
|
||||
/**********************************************************************
|
||||
* ESTRIP_DESC
|
||||
* Description of the image strip as it is passed to the engine.
|
||||
* The image is always 1 bit, with 1=black.
|
||||
* The width is always a multiple of 32, so padding is always OK.
|
||||
* The height of the full image is always a multiple of 32.
|
||||
* The top y coordinate is 0, and increases down.
|
||||
* The top leftmost pixel is in the most significant bit of the first byte.
|
||||
**********************************************************************/
|
||||
|
||||
typedef struct { /*bitmap strip */
|
||||
inT16 x_size; /*width in pixels */
|
||||
inT16 y_size; /*of full image */
|
||||
inT16 strip_size; /*of this strip */
|
||||
inT16 resolution; /*pixels per inch */
|
||||
uinT8 data[8]; /*image data */
|
||||
} ESTRIP_DESC; /*bitmap strip */
|
||||
|
||||
/**********************************************************************
|
||||
* EANYCODE_CHAR
|
||||
* Description of a single character. The character code is defined by
|
||||
|
@ -40,7 +40,8 @@ tesseract::ParamsVectors *GlobalParams() {
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
bool ParamUtils::ReadParamsFile(const char *file, bool init_only,
|
||||
bool ParamUtils::ReadParamsFile(const char *file,
|
||||
SetParamConstraint constraint,
|
||||
ParamsVectors *member_params) {
|
||||
char flag; // file flag
|
||||
inT16 nameoffset; // offset for real name
|
||||
@ -63,11 +64,12 @@ bool ParamUtils::ReadParamsFile(const char *file, bool init_only,
|
||||
tprintf("read_params_file: Can't open %s\n", file + nameoffset);
|
||||
return true;
|
||||
}
|
||||
return ReadParamsFromFp(fp, -1, init_only, member_params);
|
||||
return ReadParamsFromFp(fp, -1, constraint, member_params);
|
||||
fclose(fp);  // NOTE(review): unreachable - it follows an unconditional return, so fp is never closed here; presumably the result should be stored, fp closed, then returned. Confirm against the full function.
|
||||
}
|
||||
|
||||
bool ParamUtils::ReadParamsFromFp(FILE *fp, inT64 end_offset, bool init_only,
|
||||
bool ParamUtils::ReadParamsFromFp(FILE *fp, inT64 end_offset,
|
||||
SetParamConstraint constraint,
|
||||
ParamsVectors *member_params) {
|
||||
char line[MAX_PATH]; // input line
|
||||
bool anyerr = false; // true if any error
|
||||
@ -89,7 +91,7 @@ bool ParamUtils::ReadParamsFromFp(FILE *fp, inT64 end_offset, bool init_only,
|
||||
valptr++; // find end of blanks
|
||||
while (*valptr == ' ' || *valptr == '\t');
|
||||
}
|
||||
foundit = SetParam(line, valptr, init_only, member_params);
|
||||
foundit = SetParam(line, valptr, constraint, member_params);
|
||||
|
||||
if (!foundit) {
|
||||
anyerr = true; // had an error
|
||||
@ -102,24 +104,25 @@ bool ParamUtils::ReadParamsFromFp(FILE *fp, inT64 end_offset, bool init_only,
|
||||
}
|
||||
|
||||
bool ParamUtils::SetParam(const char *name, const char* value,
|
||||
bool init_only, ParamsVectors *member_params) {
|
||||
SetParamConstraint constraint,
|
||||
ParamsVectors *member_params) {
|
||||
// Look for the parameter among string parameters.
|
||||
StringParam *sp = FindParam<StringParam>(name, GlobalParams()->string_params,
|
||||
member_params->string_params);
|
||||
if (sp != NULL && (!init_only || sp->is_init())) sp->set_value(value);
|
||||
if (sp != NULL && sp->constraint_ok(constraint)) sp->set_value(value);
|
||||
if (*value == '\0') return (sp != NULL);
|
||||
|
||||
// Look for the parameter among int parameters.
|
||||
int intval;
|
||||
IntParam *ip = FindParam<IntParam>(name, GlobalParams()->int_params,
|
||||
member_params->int_params);
|
||||
if (ip && (!init_only || ip->is_init()) &&
|
||||
if (ip && ip->constraint_ok(constraint) &&
|
||||
sscanf(value, INT32FORMAT, &intval) == 1) ip->set_value(intval);
|
||||
|
||||
// Look for the parameter among bool parameters.
|
||||
BoolParam *bp = FindParam<BoolParam>(name, GlobalParams()->bool_params,
|
||||
member_params->bool_params);
|
||||
if (bp != NULL && (!init_only || bp->is_init())) {
|
||||
if (bp != NULL && bp->constraint_ok(constraint)) {
|
||||
if (*value == 'T' || *value == 't' ||
|
||||
*value == 'Y' || *value == 'y' || *value == '1') {
|
||||
bp->set_value(true);
|
||||
@ -133,7 +136,7 @@ bool ParamUtils::SetParam(const char *name, const char* value,
|
||||
double doubleval;
|
||||
DoubleParam *dp = FindParam<DoubleParam>(name, GlobalParams()->double_params,
|
||||
member_params->double_params);
|
||||
if (dp != NULL && (!init_only || dp->is_init())) {
|
||||
if (dp != NULL && dp->constraint_ok(constraint)) {
|
||||
#ifdef EMBEDDED
|
||||
doubleval = strtofloat(value);
|
||||
#else
|
||||
|
@ -32,6 +32,14 @@ class BoolParam;
|
||||
class StringParam;
|
||||
class DoubleParam;
|
||||
|
||||
// Enum for constraints on what kind of params should be set by SetParam().
|
||||
enum SetParamConstraint {
|
||||
SET_PARAM_CONSTRAINT_NONE,  // No restriction: any parameter may be set.
|
||||
SET_PARAM_CONSTRAINT_DEBUG_ONLY,  // Only params for which is_debug() is true.
|
||||
SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY,  // Only params for which is_debug() is false.
|
||||
SET_PARAM_CONSTRAINT_NON_INIT_ONLY,  // Only params for which is_init() is false.
|
||||
};
|
||||
|
||||
struct ParamsVectors {
|
||||
GenericVector<IntParam *> int_params;
|
||||
GenericVector<BoolParam *> bool_params;
|
||||
@ -49,17 +57,18 @@ class ParamUtils {
|
||||
// Values may have any whitespace after the name and are the rest of line.
|
||||
static bool ReadParamsFile(
|
||||
const char *file, // filename to read
|
||||
bool init_only, // only set parameters that need to be
|
||||
// initialized when Init() is called
|
||||
SetParamConstraint constraint,
|
||||
ParamsVectors *member_params);
|
||||
|
||||
// Read parameters from the given file pointer (stop at end_offset).
|
||||
static bool ReadParamsFromFp(FILE *fp, inT64 end_offset, bool init_only,
|
||||
ParamsVectors *member_params);
|
||||
static bool ReadParamsFromFp(FILE *fp, inT64 end_offset,
|
||||
SetParamConstraint constraint,
|
||||
ParamsVectors *member_params);
|
||||
|
||||
// Set a parameter to have the given value.
|
||||
static bool SetParam(const char *name, const char* value,
|
||||
bool init_only, ParamsVectors *member_params);
|
||||
SetParamConstraint constraint,
|
||||
ParamsVectors *member_params);
|
||||
|
||||
// Returns the pointer to the parameter with the given name (of the
|
||||
// appropriate type) if it was found in the vector obtained from
|
||||
@ -105,14 +114,27 @@ class Param {
|
||||
const char *name_str() const { return name_; }
|
||||
const char *info_str() const { return info_; }
|
||||
bool is_init() const { return init_; }
|
||||
bool is_debug() const { return debug_; }
|
||||
// Returns true if this parameter is allowed to be modified under the
// given constraint; unrecognized constraint values allow nothing.
bool constraint_ok(SetParamConstraint constraint) const {
  switch (constraint) {
    case SET_PARAM_CONSTRAINT_NONE:
      // Unconstrained: every parameter qualifies.
      return true;
    case SET_PARAM_CONSTRAINT_DEBUG_ONLY:
      return this->is_debug();
    case SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY:
      return !this->is_debug();
    case SET_PARAM_CONSTRAINT_NON_INIT_ONLY:
      return !this->is_init();
    default:
      return false;
  }
}
|
||||
|
||||
protected:
|
||||
Param(const char *name, const char *comment, bool init) :
|
||||
name_(name), info_(comment), init_(init) {}
|
||||
name_(name), info_(comment), init_(init) {
|
||||
debug_ = (strstr(name, "debug") != NULL) || (strstr(name, "display"));
|
||||
}
|
||||
|
||||
const char *name_; // name of this parameter
|
||||
const char *info_; // for menus
|
||||
bool init_; // needs to be set before init
|
||||
const char *name_; // name of this parameter
|
||||
const char *info_; // for menus
|
||||
bool init_; // needs to be set before init
|
||||
bool debug_;
|
||||
};
|
||||
|
||||
class IntParam : public Param {
|
||||
@ -124,7 +146,7 @@ class IntParam : public Param {
|
||||
vec->int_params.push_back(this);
|
||||
}
|
||||
~IntParam() { ParamUtils::RemoveParam<IntParam>(this, params_vec_); }
|
||||
operator inT32() { return value_; }
|
||||
operator inT32() const { return value_; }
|
||||
void set_value(inT32 value) { value_ = value; }
|
||||
|
||||
private:
|
||||
@ -142,12 +164,12 @@ class BoolParam : public Param {
|
||||
vec->bool_params.push_back(this);
|
||||
}
|
||||
~BoolParam() { ParamUtils::RemoveParam<BoolParam>(this, params_vec_); }
|
||||
operator BOOL8() { return value_; }
|
||||
operator BOOL8() const { return value_; }
|
||||
void set_value(BOOL8 value) { value_ = value; }
|
||||
|
||||
private:
|
||||
BOOL8 value_;
|
||||
// Pointer to the vector that contains this param (not owened by this class).
|
||||
// Pointer to the vector that contains this param (not owned by this class).
|
||||
GenericVector<BoolParam *> *params_vec_;
|
||||
};
|
||||
|
||||
@ -163,6 +185,7 @@ class StringParam : public Param {
|
||||
~StringParam() { ParamUtils::RemoveParam<StringParam>(this, params_vec_); }
|
||||
operator STRING &() { return value_; }
|
||||
const char *string() const { return value_.string(); }
|
||||
bool empty() { return value_.length() <= 0; }
|
||||
void set_value(const STRING &value) { value_ = value; }
|
||||
|
||||
private:
|
||||
@ -180,12 +203,12 @@ class DoubleParam : public Param {
|
||||
vec->double_params.push_back(this);
|
||||
}
|
||||
~DoubleParam() { ParamUtils::RemoveParam<DoubleParam>(this, params_vec_); }
|
||||
operator double() { return value_; }
|
||||
operator double() const { return value_; }
|
||||
void set_value(double value) { value_ = value; }
|
||||
|
||||
private:
|
||||
double value_;
|
||||
// Pointer to the vector that contains this param (not owened by this class).
|
||||
// Pointer to the vector that contains this param (not owned by this class).
|
||||
GenericVector<DoubleParam *> *params_vec_;
|
||||
};
|
||||
|
||||
|
@ -17,9 +17,11 @@
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include "mfcpch.h" //precompiled headers
|
||||
#include "mfcpch.h" // Precompiled headers
|
||||
#include "helpers.h"
|
||||
#include "tprintf.h"
|
||||
#include "strngs.h"
|
||||
#include "genericvector.h"
|
||||
|
||||
#include <assert.h>
|
||||
// Size of buffer needed to host the decimal representation of the maximum
|
||||
@ -122,6 +124,25 @@ STRING::~STRING() {
|
||||
DiscardData();
|
||||
}
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
bool STRING::Serialize(FILE* fp) const {
|
||||
inT32 len = length();
|
||||
if (fwrite(&len, sizeof(len), 1, fp) != 1) return false;
|
||||
if (fwrite(GetCStr(), 1, len, fp) != len) return false;
|
||||
return true;
|
||||
}
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool STRING::DeSerialize(bool swap, FILE* fp) {
|
||||
inT32 len;
|
||||
if (fread(&len, sizeof(len), 1, fp) != 1) return false;
|
||||
if (swap)
|
||||
ReverseN(&len, sizeof(len));
|
||||
truncate_at(len);
|
||||
if (fread(GetCStr(), 1, len, fp) != len) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
BOOL8 STRING::contains(const char c) const {
|
||||
return (c != '\0') && (strchr (GetCStr(), c) != NULL);
|
||||
}
|
||||
@ -197,14 +218,14 @@ void STRING::erase_range(inT32 index, int len) {
|
||||
assert(InvariantOk());
|
||||
}
|
||||
|
||||
#else
|
||||
void STRING::truncate_at(inT32 index) {
|
||||
char* this_cstr = ensure_cstr(index);
|
||||
char* this_cstr = ensure_cstr(index + 1);
|
||||
this_cstr[index] = '\0';
|
||||
GetHeader()->used_ = index;
|
||||
GetHeader()->used_ = index + 1;
|
||||
assert(InvariantOk());
|
||||
}
|
||||
|
||||
#else
|
||||
char& STRING::operator[](inT32 index) const {
|
||||
// Code is casting away this const and mutating the string,
|
||||
// so mark used_ as -1 to flag it unreliable.
|
||||
@ -213,6 +234,26 @@ char& STRING::operator[](inT32 index) const {
|
||||
}
|
||||
#endif
|
||||
|
||||
// Splits this string at every occurrence of c and appends each non-empty
// piece to *splited (the vector is only appended to, never cleared here).
// Consecutive delimiters, and delimiters at either end, produce no empty
// entries. The string is modified temporarily during the scan but every
// character written is restored before moving on.
void STRING::split(const char c, GenericVector<STRING> *splited) {
|
||||
// Index of the first character of the piece currently being scanned.
int start_index = 0;
|
||||
for (int i = 0; i < length(); i++) {
|
||||
if ((*this)[i] == c) {
|
||||
// Skip empty pieces (delimiter immediately after the previous one).
if (i != start_index) {
|
||||
// Temporarily terminate the buffer at the delimiter so that the
// C string starting at start_index is exactly the current piece.
(*this)[i] = '\0';
|
||||
STRING tmp = GetCStr() + start_index;
|
||||
splited->push_back(tmp);
|
||||
// Put the delimiter back.
(*this)[i] = c;
|
||||
}
|
||||
start_index = i + 1;
|
||||
}
|
||||
}
|
||||

|
||||
// Whatever follows the last delimiter is the final piece, if non-empty.
if (length() != start_index) {
|
||||
STRING tmp = GetCStr() + start_index;
|
||||
splited->push_back(tmp);
|
||||
}
|
||||
}
|
||||
|
||||
BOOL8 STRING::operator==(const STRING& str) const {
|
||||
FixHeader();
|
||||
str.FixHeader();
|
||||
|
@ -23,6 +23,7 @@
|
||||
#include <string.h>
|
||||
#include "memry.h"
|
||||
#include "serialis.h"
|
||||
#include "genericvector.h"
|
||||
|
||||
// STRING_IS_PROTECTED means that string[index] = X is invalid
|
||||
// because you have to go through strings interface to modify it.
|
||||
@ -42,7 +43,8 @@
|
||||
#define CCUTIL_API
|
||||
#endif
|
||||
|
||||
class CCUTIL_API STRING
|
||||
|
||||
class DLLSYM STRING
|
||||
{
|
||||
public:
|
||||
STRING();
|
||||
@ -50,8 +52,15 @@ class CCUTIL_API STRING
|
||||
STRING(const char *string);
|
||||
~STRING ();
|
||||
|
||||
// Writes to the given file. Returns false in case of error.
|
||||
bool Serialize(FILE* fp) const;
|
||||
// Reads from the given file. Returns false in case of error.
|
||||
// If swap is true, assumes a big/little-endian swap is needed.
|
||||
bool DeSerialize(bool swap, FILE* fp);
|
||||
|
||||
BOOL8 contains(const char c) const;
|
||||
inT32 length() const;
|
||||
inT32 size() const { return length(); }
|
||||
const char *string() const;
|
||||
|
||||
#if STRING_IS_PROTECTED
|
||||
@ -59,10 +68,11 @@ class CCUTIL_API STRING
|
||||
// len is number of chars in s to insert starting at index in this string
|
||||
void insert_range(inT32 index, const char*s, int len);
|
||||
void erase_range(inT32 index, int len);
|
||||
void truncate_at(inT32 index);
|
||||
#else
|
||||
char &operator[] (inT32 index) const;
|
||||
#endif
|
||||
void split(const char c, GenericVector<STRING> *splited);
|
||||
void truncate_at(inT32 index);
|
||||
|
||||
BOOL8 operator== (const STRING & string) const;
|
||||
BOOL8 operator!= (const STRING & string) const;
|
||||
|
@ -1017,12 +1017,12 @@ struct Identity {
|
||||
typedef T type;
|
||||
};
|
||||
|
||||
template <bool del, class R, class T, class P1, class A1, class A2>
|
||||
class _ConstTessMemberResultCallback_1_2
|
||||
: public TessResultCallback2<R,A1,A2> {
|
||||
template <bool del, class R, class T, class P1, class A1, class A2, class A3>
|
||||
class _ConstTessMemberResultCallback_1_3
|
||||
: public TessResultCallback3<R,A1,A2,A3> {
|
||||
public:
|
||||
typedef TessResultCallback2<R,A1,A2> base;
|
||||
typedef R (T::*MemberSignature)(P1,A1,A2) const;
|
||||
typedef TessResultCallback3<R,A1,A2,A3> base;
|
||||
typedef R (T::*MemberSignature)(P1,A1,A2,A3) const;
|
||||
|
||||
private:
|
||||
T* object_;
|
||||
@ -1030,16 +1030,16 @@ class _ConstTessMemberResultCallback_1_2
|
||||
typename remove_reference<P1>::type p1_;
|
||||
|
||||
public:
|
||||
inline _ConstTessMemberResultCallback_1_2(T* object,
|
||||
inline _ConstTessMemberResultCallback_1_3(T* object,
|
||||
MemberSignature member, P1 p1)
|
||||
: object_(object), member_(member), p1_(p1) { }
|
||||
|
||||
virtual R Run(A1 a1, A2 a2) {
|
||||
virtual R Run(A1 a1, A2 a2, A3 a3) {
|
||||
if (!del) {
|
||||
R result = (object_->*member_)(p1_,a1,a2);
|
||||
R result = (object_->*member_)(p1_,a1,a2,a3);
|
||||
return result;
|
||||
} else {
|
||||
R result = (object_->*member_)(p1_,a1,a2);
|
||||
R result = (object_->*member_)(p1_,a1,a2,a3);
|
||||
// zero out the pointer to ensure segfault if used again
|
||||
member_ = NULL;
|
||||
delete this;
|
||||
@ -1048,12 +1048,12 @@ class _ConstTessMemberResultCallback_1_2
|
||||
}
|
||||
};
|
||||
|
||||
template <bool del, class T, class P1, class A1, class A2>
|
||||
class _ConstTessMemberResultCallback_1_2<del, void, T, P1, A1, A2>
|
||||
: public TessCallback2<A1,A2> {
|
||||
template <bool del, class T, class P1, class A1, class A2, class A3>
|
||||
class _ConstTessMemberResultCallback_1_3<del, void, T, P1, A1, A2, A3>
|
||||
: public TessCallback3<A1,A2,A3> {
|
||||
public:
|
||||
typedef TessCallback2<A1,A2> base;
|
||||
typedef void (T::*MemberSignature)(P1,A1,A2) const;
|
||||
typedef TessCallback3<A1,A2,A3> base;
|
||||
typedef void (T::*MemberSignature)(P1,A1,A2,A3) const;
|
||||
|
||||
private:
|
||||
T* object_;
|
||||
@ -1061,15 +1061,15 @@ class _ConstTessMemberResultCallback_1_2<del, void, T, P1, A1, A2>
|
||||
typename remove_reference<P1>::type p1_;
|
||||
|
||||
public:
|
||||
inline _ConstTessMemberResultCallback_1_2(T* object,
|
||||
inline _ConstTessMemberResultCallback_1_3(T* object,
|
||||
MemberSignature member, P1 p1)
|
||||
: object_(object), member_(member), p1_(p1) { }
|
||||
|
||||
virtual void Run(A1 a1, A2 a2) {
|
||||
virtual void Run(A1 a1, A2 a2, A3 a3) {
|
||||
if (!del) {
|
||||
(object_->*member_)(p1_,a1,a2);
|
||||
(object_->*member_)(p1_,a1,a2,a3);
|
||||
} else {
|
||||
(object_->*member_)(p1_,a1,a2);
|
||||
(object_->*member_)(p1_,a1,a2,a3);
|
||||
// zero out the pointer to ensure segfault if used again
|
||||
member_ = NULL;
|
||||
delete this;
|
||||
@ -1078,26 +1078,26 @@ class _ConstTessMemberResultCallback_1_2<del, void, T, P1, A1, A2>
|
||||
};
|
||||
|
||||
#ifndef SWIG
|
||||
template <class T1, class T2, class R, class P1, class A1, class A2>
|
||||
inline typename _ConstTessMemberResultCallback_1_2<true,R,T1,P1,A1,A2>::base*
|
||||
NewTessCallback( T1* obj, R (T2::*member)(P1,A1,A2) , typename Identity<P1>::type p1) {
|
||||
return new _ConstTessMemberResultCallback_1_2<true,R,T1,P1,A1,A2>(obj, member, p1);
|
||||
template <class T1, class T2, class R, class P1, class A1, class A2, class A3>
|
||||
inline typename _ConstTessMemberResultCallback_1_3<true,R,T1,P1,A1,A2,A3>::base*
|
||||
NewTessCallback( T1* obj, R (T2::*member)(P1,A1,A2,A3) , typename Identity<P1>::type p1) {
|
||||
return new _ConstTessMemberResultCallback_1_3<true,R,T1,P1,A1,A2,A3>(obj, member, p1);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef SWIG
|
||||
template <class T1, class T2, class R, class P1, class A1, class A2>
|
||||
inline typename _ConstTessMemberResultCallback_1_2<false,R,T1,P1,A1,A2>::base*
|
||||
NewPermanentTessCallback( T1* obj, R (T2::*member)(P1,A1,A2) , typename Identity<P1>::type p1) {
|
||||
return new _ConstTessMemberResultCallback_1_2<false,R,T1,P1,A1,A2>(obj, member, p1);
|
||||
template <class T1, class T2, class R, class P1, class A1, class A2, class A3>
|
||||
inline typename _ConstTessMemberResultCallback_1_3<false,R,T1,P1,A1,A2,A3>::base*
|
||||
NewPermanentTessCallback( T1* obj, R (T2::*member)(P1,A1,A2,A3) , typename Identity<P1>::type p1) {
|
||||
return new _ConstTessMemberResultCallback_1_3<false,R,T1,P1,A1,A2,A3>(obj, member, p1);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <bool del, class R, class T, class P1, class A1, class A2>
|
||||
class _TessMemberResultCallback_1_2 : public TessResultCallback2<R,A1,A2> {
|
||||
template <bool del, class R, class T, class P1, class A1, class A2, class A3>
|
||||
class _TessMemberResultCallback_1_3 : public TessResultCallback3<R,A1,A2,A3> {
|
||||
public:
|
||||
typedef TessResultCallback2<R,A1,A2> base;
|
||||
typedef R (T::*MemberSignature)(P1,A1,A2) ;
|
||||
typedef TessResultCallback3<R,A1,A2,A3> base;
|
||||
typedef R (T::*MemberSignature)(P1,A1,A2,A3) ;
|
||||
|
||||
private:
|
||||
T* object_;
|
||||
@ -1105,16 +1105,16 @@ class _TessMemberResultCallback_1_2 : public TessResultCallback2<R,A1,A2> {
|
||||
typename remove_reference<P1>::type p1_;
|
||||
|
||||
public:
|
||||
inline _TessMemberResultCallback_1_2(T* object,
|
||||
inline _TessMemberResultCallback_1_3(T* object,
|
||||
MemberSignature member, P1 p1)
|
||||
: object_(object), member_(member), p1_(p1) { }
|
||||
|
||||
virtual R Run(A1 a1, A2 a2) {
|
||||
virtual R Run(A1 a1, A2 a2, A3 a3) {
|
||||
if (!del) {
|
||||
R result = (object_->*member_)(p1_,a1,a2);
|
||||
R result = (object_->*member_)(p1_,a1,a2,a3);
|
||||
return result;
|
||||
} else {
|
||||
R result = (object_->*member_)(p1_,a1,a2);
|
||||
R result = (object_->*member_)(p1_,a1,a2,a3);
|
||||
// zero out the pointer to ensure segfault if used again
|
||||
member_ = NULL;
|
||||
delete this;
|
||||
@ -1123,12 +1123,12 @@ class _TessMemberResultCallback_1_2 : public TessResultCallback2<R,A1,A2> {
|
||||
}
|
||||
};
|
||||
|
||||
template <bool del, class T, class P1, class A1, class A2>
|
||||
class _TessMemberResultCallback_1_2<del, void, T, P1, A1, A2>
|
||||
: public TessCallback2<A1,A2> {
|
||||
template <bool del, class T, class P1, class A1, class A2, class A3>
|
||||
class _TessMemberResultCallback_1_3<del, void, T, P1, A1, A2, A3>
|
||||
: public TessCallback3<A1,A2,A3> {
|
||||
public:
|
||||
typedef TessCallback2<A1,A2> base;
|
||||
typedef void (T::*MemberSignature)(P1,A1,A2) ;
|
||||
typedef TessCallback3<A1,A2,A3> base;
|
||||
typedef void (T::*MemberSignature)(P1,A1,A2,A3) ;
|
||||
|
||||
private:
|
||||
T* object_;
|
||||
@ -1136,15 +1136,15 @@ class _TessMemberResultCallback_1_2<del, void, T, P1, A1, A2>
|
||||
typename remove_reference<P1>::type p1_;
|
||||
|
||||
public:
|
||||
inline _TessMemberResultCallback_1_2(T* object,
|
||||
inline _TessMemberResultCallback_1_3(T* object,
|
||||
MemberSignature member, P1 p1)
|
||||
: object_(object), member_(member), p1_(p1) { }
|
||||
|
||||
virtual void Run(A1 a1, A2 a2) {
|
||||
virtual void Run(A1 a1, A2 a2, A3 a3) {
|
||||
if (!del) {
|
||||
(object_->*member_)(p1_,a1,a2);
|
||||
(object_->*member_)(p1_,a1,a2,a3);
|
||||
} else {
|
||||
(object_->*member_)(p1_,a1,a2);
|
||||
(object_->*member_)(p1_,a1,a2,a3);
|
||||
// zero out the pointer to ensure segfault if used again
|
||||
member_ = NULL;
|
||||
delete this;
|
||||
@ -1153,41 +1153,41 @@ class _TessMemberResultCallback_1_2<del, void, T, P1, A1, A2>
|
||||
};
|
||||
|
||||
#ifndef SWIG
|
||||
template <class T1, class T2, class R, class P1, class A1, class A2>
|
||||
inline typename _TessMemberResultCallback_1_2<true,R,T1,P1,A1,A2>::base*
|
||||
NewTessCallback( T1* obj, R (T2::*member)(P1,A1,A2) , typename Identity<P1>::type p1) {
|
||||
return new _TessMemberResultCallback_1_2<true,R,T1,P1,A1,A2>(obj, member, p1);
|
||||
template <class T1, class T2, class R, class P1, class A1, class A2, class A3>
|
||||
inline typename _TessMemberResultCallback_1_3<true,R,T1,P1,A1,A2,A3>::base*
|
||||
NewTessCallback( T1* obj, R (T2::*member)(P1,A1,A2,A3) , typename Identity<P1>::type p1) {
|
||||
return new _TessMemberResultCallback_1_3<true,R,T1,P1,A1,A2,A3>(obj, member, p1);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef SWIG
|
||||
template <class T1, class T2, class R, class P1, class A1, class A2>
|
||||
inline typename _TessMemberResultCallback_1_2<false,R,T1,P1,A1,A2>::base*
|
||||
NewPermanentTessCallback( T1* obj, R (T2::*member)(P1,A1,A2) , typename Identity<P1>::type p1) {
|
||||
return new _TessMemberResultCallback_1_2<false,R,T1,P1,A1,A2>(obj, member, p1);
|
||||
template <class T1, class T2, class R, class P1, class A1, class A2, class A3>
|
||||
inline typename _TessMemberResultCallback_1_3<false,R,T1,P1,A1,A2,A3>::base*
|
||||
NewPermanentTessCallback( T1* obj, R (T2::*member)(P1,A1,A2,A3) , typename Identity<P1>::type p1) {
|
||||
return new _TessMemberResultCallback_1_3<false,R,T1,P1,A1,A2,A3>(obj, member, p1);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <bool del, class R, class P1, class A1, class A2>
|
||||
class _TessFunctionResultCallback_1_2 : public TessCallback2<A1,A2> {
|
||||
template <bool del, class R, class P1, class A1, class A2, class A3>
|
||||
class _TessFunctionResultCallback_1_3 : public TessCallback3<A1,A2,A3> {
|
||||
public:
|
||||
typedef TessCallback2<A1,A2> base;
|
||||
typedef R (*FunctionSignature)(P1,A1,A2);
|
||||
typedef TessCallback3<A1,A2,A3> base;
|
||||
typedef R (*FunctionSignature)(P1,A1,A2,A3);
|
||||
|
||||
private:
|
||||
FunctionSignature function_;
|
||||
typename remove_reference<P1>::type p1_;
|
||||
|
||||
public:
|
||||
inline _TessFunctionResultCallback_1_2(FunctionSignature function, P1 p1)
|
||||
inline _TessFunctionResultCallback_1_3(FunctionSignature function, P1 p1)
|
||||
: function_(function), p1_(p1) { }
|
||||
|
||||
virtual R Run(A1 a1, A2 a2) {
|
||||
virtual R Run(A1 a1, A2 a2, A3 a3) {
|
||||
if (!del) {
|
||||
R result = (*function_)(p1_,a1,a2);
|
||||
R result = (*function_)(p1_,a1,a2,a3);
|
||||
return result;
|
||||
} else {
|
||||
R result = (*function_)(p1_,a1,a2);
|
||||
R result = (*function_)(p1_,a1,a2,a3);
|
||||
// zero out the pointer to ensure segfault if used again
|
||||
function_ = NULL;
|
||||
delete this;
|
||||
@ -1196,26 +1196,26 @@ class _TessFunctionResultCallback_1_2 : public TessCallback2<A1,A2> {
|
||||
}
|
||||
};
|
||||
|
||||
template <bool del, class P1, class A1, class A2>
|
||||
class _TessFunctionResultCallback_1_2<del, void, P1, A1, A2>
|
||||
: public TessCallback2<A1,A2> {
|
||||
template <bool del, class P1, class A1, class A2, class A3>
|
||||
class _TessFunctionResultCallback_1_3<del, void, P1, A1, A2, A3>
|
||||
: public TessCallback3<A1,A2,A3> {
|
||||
public:
|
||||
typedef TessCallback2<A1,A2> base;
|
||||
typedef void (*FunctionSignature)(P1,A1,A2);
|
||||
typedef TessCallback3<A1,A2,A3> base;
|
||||
typedef void (*FunctionSignature)(P1,A1,A2,A3);
|
||||
|
||||
private:
|
||||
FunctionSignature function_;
|
||||
typename remove_reference<P1>::type p1_;
|
||||
|
||||
public:
|
||||
inline _TessFunctionResultCallback_1_2(FunctionSignature function, P1 p1)
|
||||
inline _TessFunctionResultCallback_1_3(FunctionSignature function, P1 p1)
|
||||
: function_(function), p1_(p1) { }
|
||||
|
||||
virtual void Run(A1 a1, A2 a2) {
|
||||
virtual void Run(A1 a1, A2 a2, A3 a3) {
|
||||
if (!del) {
|
||||
(*function_)(p1_,a1,a2);
|
||||
(*function_)(p1_,a1,a2,a3);
|
||||
} else {
|
||||
(*function_)(p1_,a1,a2);
|
||||
(*function_)(p1_,a1,a2,a3);
|
||||
// zero out the pointer to ensure segfault if used again
|
||||
function_ = NULL;
|
||||
delete this;
|
||||
@ -1223,16 +1223,16 @@ class _TessFunctionResultCallback_1_2<del, void, P1, A1, A2>
|
||||
}
|
||||
};
|
||||
|
||||
template <class R, class P1, class A1, class A2>
|
||||
inline typename _TessFunctionResultCallback_1_2<true,R,P1,A1,A2>::base*
|
||||
NewTessCallback(R (*function)(P1,A1,A2), typename Identity<P1>::type p1) {
|
||||
return new _TessFunctionResultCallback_1_2<true,R,P1,A1,A2>(function, p1);
|
||||
template <class R, class P1, class A1, class A2, class A3>
|
||||
inline typename _TessFunctionResultCallback_1_3<true,R,P1,A1,A2,A3>::base*
|
||||
NewTessCallback(R (*function)(P1,A1,A2,A3), typename Identity<P1>::type p1) {
|
||||
return new _TessFunctionResultCallback_1_3<true,R,P1,A1,A2,A3>(function, p1);
|
||||
}
|
||||
|
||||
template <class R, class P1, class A1, class A2>
|
||||
inline typename _TessFunctionResultCallback_1_2<false,R,P1,A1,A2>::base*
|
||||
NewPermanentTessCallback(R (*function)(P1,A1,A2), typename Identity<P1>::type p1) {
|
||||
return new _TessFunctionResultCallback_1_2<false,R,P1,A1,A2>(function, p1);
|
||||
template <class R, class P1, class A1, class A2, class A3>
|
||||
inline typename _TessFunctionResultCallback_1_3<false,R,P1,A1,A2,A3>::base*
|
||||
NewPermanentTessCallback(R (*function)(P1,A1,A2,A3), typename Identity<P1>::type p1) {
|
||||
return new _TessFunctionResultCallback_1_3<false,R,P1,A1,A2,A3>(function, p1);
|
||||
}
|
||||
|
||||
#endif /* _TESS_CALLBACK_SPECIALIZATIONS_H */
|
||||
|
@ -38,17 +38,19 @@ bool TessdataManager::Init(const char *data_file_name, int debug_level) {
|
||||
data_file_ = fopen(data_file_name, "rb");
|
||||
if (data_file_ == NULL) {
|
||||
tprintf("Error opening data file %s\n", data_file_name);
|
||||
tprintf("Please make sure the TESSDATA_PREFIX environment variable is set "
|
||||
"to the parent directory of your \"tessdata\" directory.\n");
|
||||
return false;
|
||||
}
|
||||
fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_);
|
||||
bool swap = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
|
||||
if (swap) {
|
||||
swap_ = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
|
||||
if (swap_) {
|
||||
actual_tessdata_num_entries_ = reverse32(actual_tessdata_num_entries_);
|
||||
}
|
||||
ASSERT_HOST(actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES);
|
||||
fread(offset_table_, sizeof(inT64),
|
||||
actual_tessdata_num_entries_, data_file_);
|
||||
if (swap) {
|
||||
if (swap_) {
|
||||
for (i = 0 ; i < actual_tessdata_num_entries_; ++i) {
|
||||
offset_table_[i] = reverse64(offset_table_[i]);
|
||||
}
|
||||
|
@ -41,6 +41,10 @@ static const char kFreqDawgFileSuffix[] = "freq-dawg";
|
||||
static const char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs";
|
||||
static const char kCubeUnicharsetFileSuffix[] = "cube-unicharset";
|
||||
static const char kCubeSystemDawgFileSuffix[] = "cube-word-dawg";
|
||||
static const char kShapeTableFileSuffix[] = "shapetable";
|
||||
static const char kBigramDawgFileSuffix[] = "bigram-dawg";
|
||||
static const char kUnambigDawgFileSuffix[] = "unambig-dawg";
|
||||
static const char kParamsTrainingModelFileSuffix[] = "params-training-model";
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
@ -58,6 +62,10 @@ enum TessdataType {
|
||||
TESSDATA_FIXED_LENGTH_DAWGS, // 10
|
||||
TESSDATA_CUBE_UNICHARSET, // 11
|
||||
TESSDATA_CUBE_SYSTEM_DAWG, // 12
|
||||
TESSDATA_SHAPE_TABLE, // 13
|
||||
TESSDATA_BIGRAM_DAWG, // 14
|
||||
TESSDATA_UNAMBIG_DAWG, // 15
|
||||
TESSDATA_PARAMS_TRAINING_MODEL, // 16
|
||||
|
||||
TESSDATA_NUM_ENTRIES
|
||||
};
|
||||
@ -80,6 +88,10 @@ static const char * const kTessdataFileSuffixes[] = {
|
||||
kFixedLengthDawgsFileSuffix, // 10
|
||||
kCubeUnicharsetFileSuffix, // 11
|
||||
kCubeSystemDawgFileSuffix, // 12
|
||||
kShapeTableFileSuffix, // 13
|
||||
kBigramDawgFileSuffix, // 14
|
||||
kUnambigDawgFileSuffix, // 15
|
||||
kParamsTrainingModelFileSuffix, // 16
|
||||
};
|
||||
|
||||
/**
|
||||
@ -100,6 +112,10 @@ static const bool kTessdataFileIsText[] = {
|
||||
false, // 10
|
||||
true, // 11
|
||||
false, // 12
|
||||
false, // 13
|
||||
false, // 14
|
||||
false, // 15
|
||||
false, // 16
|
||||
};
|
||||
|
||||
/**
|
||||
@ -174,6 +190,9 @@ class TessdataManager {
|
||||
data_file_ = NULL;
|
||||
}
|
||||
}
|
||||
bool swap() const {
|
||||
return swap_;
|
||||
}
|
||||
|
||||
/** Writes the number of entries and the given offset table to output_file. */
|
||||
static void WriteMetadata(inT64 *offset_table, FILE *output_file);
|
||||
@ -260,6 +279,8 @@ class TessdataManager {
|
||||
inT32 actual_tessdata_num_entries_;
|
||||
FILE *data_file_; ///< pointer to the data file.
|
||||
int debug_level_;
|
||||
// True if the bytes need swapping.
|
||||
bool swap_;
|
||||
};
|
||||
|
||||
|
||||
|
@ -31,7 +31,7 @@
|
||||
#include "tprintf.h"
|
||||
#include "ccutil.h"
|
||||
|
||||
#define MAX_MSG_LEN 1024
|
||||
#define MAX_MSG_LEN 65536
|
||||
|
||||
#define EXTERN
|
||||
// Since tprintf is protected by a mutex, these parameters can remain global.
|
||||
|
@ -25,7 +25,7 @@
|
||||
|
||||
// Maximum number of characters that can be stored in a UNICHAR. Must be
|
||||
// at least 4. Must not exceed 31 without changing the coding of length.
|
||||
#define UNICHAR_LEN 24
|
||||
#define UNICHAR_LEN 30
|
||||
|
||||
// A UNICHAR_ID is the unique id of a unichar.
|
||||
typedef int UNICHAR_ID;
|
||||
|
@ -1,4 +1,3 @@
|
||||
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// File: unicharset.cpp
|
||||
// Description: Unicode character/ligature set class.
|
||||
@ -22,30 +21,51 @@
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "tesscallback.h"
|
||||
#include "tprintf.h"
|
||||
#include "unichar.h"
|
||||
#include "unicharset.h"
|
||||
#include "params.h"
|
||||
|
||||
// Special character used in representing character fragments.
|
||||
static const char kSeparator = '|';
|
||||
// Special character used in representing 'natural' character fragments.
|
||||
static const char kNaturalFlag = 'n';
|
||||
|
||||
static const int ISALPHA_MASK = 0x1;
|
||||
static const int ISLOWER_MASK = 0x2;
|
||||
static const int ISUPPER_MASK = 0x4;
|
||||
static const int ISDIGIT_MASK = 0x8;
|
||||
static const int ISPUNCTUATION_MASK = 0x10;
|
||||
|
||||
// Y coordinate threshold for determining cap-height vs x-height.
|
||||
// TODO(rays) Bring the global definition down to the ccutil library level,
|
||||
// so this constant is relative to some other constants.
|
||||
static const int kMeanlineThreshold = 220;
|
||||
// Let C be the number of alpha chars for which all tops exceed
|
||||
// kMeanlineThreshold, and X the number of alpha chars for which all tops
|
||||
// are below kMeanlineThreshold, then if X > C * kMinXHeightFraction or
|
||||
// more than half the alpha characters have upper or lower case, then
|
||||
// the unicharset "has x-height".
|
||||
// kMeanlineThreshold, and X the number of alpha chars for which all
|
||||
// tops are below kMeanlineThreshold, then if X > C *
|
||||
// kMinXHeightFraction and C > X * kMinCapHeightFraction or more than
|
||||
// half the alpha characters have upper or lower case, then the
|
||||
// unicharset "has x-height".
|
||||
const double kMinXHeightFraction = 0.25;
|
||||
const double kMinCapHeightFraction = 0.05;
|
||||
|
||||
/*static */
|
||||
const char* UNICHARSET::kCustomLigatures[][2] = {
|
||||
{"ct", "\uE003"}, // c + t -> U+E003
|
||||
{"ſh", "\uE006"}, // long-s + h -> U+E006
|
||||
{"ſi", "\uE007"}, // long-s + i -> U+E007
|
||||
{"ſl", "\uE008"}, // long-s + l -> U+E008
|
||||
{"ſſ", "\uE009"}, // long-s + long-s -> U+E009
|
||||
{NULL, NULL}
|
||||
};
|
||||
|
||||
UNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() {
|
||||
Init();
|
||||
}
|
||||
|
||||
// Initialize all properties to sensible default values.
|
||||
void UNICHARSET::UNICHAR_PROPERTIES::Init() {
|
||||
isalpha = false;
|
||||
islower = false;
|
||||
@ -54,13 +74,73 @@ void UNICHARSET::UNICHAR_PROPERTIES::Init() {
|
||||
ispunctuation = false;
|
||||
isngram = false;
|
||||
enabled = false;
|
||||
SetRangesOpen();
|
||||
script_id = 0;
|
||||
other_case = 0;
|
||||
mirror = 0;
|
||||
normed = "";
|
||||
direction = UNICHARSET::U_LEFT_TO_RIGHT;
|
||||
fragment = NULL;
|
||||
}
|
||||
|
||||
// Sets all ranges wide open. Initialization default in case there are
|
||||
// no useful values available.
|
||||
void UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() {
|
||||
min_bottom = 0;
|
||||
max_bottom = MAX_UINT8;
|
||||
min_top = 0;
|
||||
max_top = MAX_UINT8;
|
||||
script_id = 0;
|
||||
other_case = 0;
|
||||
fragment = NULL;
|
||||
min_width = 0;
|
||||
max_width = MAX_INT16;
|
||||
min_bearing = 0;
|
||||
max_bearing = MAX_INT16;
|
||||
min_advance = 0;
|
||||
max_advance = MAX_INT16;
|
||||
}
|
||||
|
||||
// Sets all ranges to empty. Used before expanding with font-based data.
|
||||
void UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() {
|
||||
min_bottom = MAX_UINT8;
|
||||
max_bottom = 0;
|
||||
min_top = MAX_UINT8;
|
||||
max_top = 0;
|
||||
min_width = MAX_INT16;
|
||||
max_width = 0;
|
||||
min_bearing = MAX_INT16;
|
||||
max_bearing = 0;
|
||||
min_advance = MAX_INT16;
|
||||
max_advance = 0;
|
||||
}
|
||||
|
||||
// Returns true if any of the top/bottom/width/bearing/advance ranges is
|
||||
// empty.
|
||||
bool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty() const {
|
||||
return min_bottom > max_bottom || min_top > max_top ||
|
||||
min_width > max_width || min_bearing > max_bearing ||
|
||||
min_advance > max_advance;
|
||||
}
|
||||
|
||||
// Expands the ranges with the ranges from the src properties.
|
||||
void UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom(
|
||||
const UNICHAR_PROPERTIES& src) {
|
||||
UpdateRange(src.min_bottom, &min_bottom, &max_bottom);
|
||||
UpdateRange(src.max_bottom, &min_bottom, &max_bottom);
|
||||
UpdateRange(src.min_top, &min_top, &max_top);
|
||||
UpdateRange(src.max_top, &min_top, &max_top);
|
||||
UpdateRange(src.min_width, &min_width, &max_width);
|
||||
UpdateRange(src.max_width, &min_width, &max_width);
|
||||
UpdateRange(src.min_bearing, &min_bearing, &max_bearing);
|
||||
UpdateRange(src.max_bearing, &min_bearing, &max_bearing);
|
||||
UpdateRange(src.min_advance, &min_advance, &max_advance);
|
||||
UpdateRange(src.max_advance, &min_advance, &max_advance);
|
||||
}
|
||||
|
||||
// Copies the properties from src into this.
|
||||
void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES& src) {
|
||||
// Apart from the fragment, everything else can be done with a default copy.
|
||||
CHAR_FRAGMENT* saved_fragment = fragment;
|
||||
*this = src; // Bitwise copy.
|
||||
fragment = saved_fragment;
|
||||
}
|
||||
|
||||
UNICHARSET::UNICHARSET() :
|
||||
@ -82,7 +162,7 @@ void UNICHARSET::reserve(int unichars_number) {
|
||||
if (unichars_number > size_reserved) {
|
||||
UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
|
||||
for (int i = 0; i < size_used; ++i)
|
||||
memcpy(&unichars_new[i], &unichars[i], sizeof(UNICHAR_SLOT));
|
||||
unichars_new[i] = unichars[i];
|
||||
for (int j = size_used; j < unichars_number; ++j) {
|
||||
unichars_new[j].properties.script_id = add_script(null_script);
|
||||
}
|
||||
@ -119,21 +199,60 @@ int UNICHARSET::step(const char* str) const {
|
||||
while (goodlength <= UNICHAR_LEN) {
|
||||
if (str[goodlength] == '\0' || ids.minmatch(str + goodlength) > 0)
|
||||
return goodlength; // This length works!
|
||||
|
||||
// The next char is illegal so find the next usable length.
|
||||
do {
|
||||
++goodlength;
|
||||
} while (str[goodlength] != '\0' && goodlength <= UNICHAR_LEN &&
|
||||
!ids.contains(str, goodlength));
|
||||
if (goodlength > UNICHAR_LEN || !ids.contains(str, goodlength)) {
|
||||
// This does not constitute a good length!
|
||||
return minlength;
|
||||
}
|
||||
}
|
||||
// Search to find a subsequent legal char failed so return the minlength.
|
||||
return minlength;
|
||||
}
|
||||
|
||||
// Return whether the given UTF-8 string is encodable with this UNICHARSET.
|
||||
// If not encodable, write the first byte offset which cannot be converted
|
||||
// into the second (return) argument.
|
||||
bool UNICHARSET::encodable_string(const char *str,
|
||||
int *first_bad_position) const {
|
||||
for (int i = 0, len = strlen(str); i < len; ) {
|
||||
int increment = step(str + i);
|
||||
if (increment == 0) {
|
||||
if (first_bad_position) *first_bad_position = i;
|
||||
return false;
|
||||
}
|
||||
i += increment;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
const char* const UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
|
||||
if (id == INVALID_UNICHAR_ID) {
|
||||
return INVALID_UNICHAR;
|
||||
}
|
||||
assert(id < this->size());
|
||||
ASSERT_HOST(id < this->size());
|
||||
return unichars[id].representation;
|
||||
}
|
||||
|
||||
const char* const UNICHARSET::id_to_unichar_ext(UNICHAR_ID id) const {
|
||||
if (id == INVALID_UNICHAR_ID) {
|
||||
return INVALID_UNICHAR;
|
||||
}
|
||||
ASSERT_HOST(id < this->size());
|
||||
// Resolve from the kCustomLigatures table if this is a private encoding.
|
||||
if (get_isprivate(id)) {
|
||||
const char* ch = id_to_unichar(id);
|
||||
for (int i = 0; kCustomLigatures[i][0] != NULL; ++i) {
|
||||
if (!strcmp(ch, kCustomLigatures[i][1])) {
|
||||
return kCustomLigatures[i][0];
|
||||
}
|
||||
}
|
||||
}
|
||||
// Otherwise return the stored representation.
|
||||
return unichars[id].representation;
|
||||
}
|
||||
|
||||
@ -167,9 +286,7 @@ STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
|
||||
if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id));
|
||||
const CHAR_FRAGMENT *fragment = this->get_fragment(id);
|
||||
if (fragment) {
|
||||
STRING base = debug_str(fragment->get_unichar());
|
||||
return CHAR_FRAGMENT::to_string(base.string(), fragment->get_pos(),
|
||||
fragment->get_total());
|
||||
return fragment->to_string();
|
||||
}
|
||||
const char* str = id_to_unichar(id);
|
||||
STRING result = debug_utf8_str(str);
|
||||
@ -193,6 +310,180 @@ STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
|
||||
return result;
|
||||
}
|
||||
|
||||
// Returns whether the unichar id represents a unicode value in the private use
|
||||
// area. We use this range only internally to represent uncommon ligatures
|
||||
// (eg. 'ct') that do not have regular unicode values.
|
||||
bool UNICHARSET::get_isprivate(UNICHAR_ID unichar_id) const {
|
||||
UNICHAR uc(id_to_unichar(unichar_id), -1);
|
||||
int uni = uc.first_uni();
|
||||
return (uni >= 0xE000 && uni <= 0xF8FF);
|
||||
}
|
||||
|
||||
|
||||
// Sets all ranges to empty, so they can be expanded to set the values.
|
||||
void UNICHARSET::set_ranges_empty() {
|
||||
for (int id = 0; id < size_used; ++id) {
|
||||
unichars[id].properties.SetRangesEmpty();
|
||||
}
|
||||
}
|
||||
|
||||
// Sets all the properties for this unicharset given a src unicharset with
|
||||
// everything set. The unicharsets don't have to be the same, and graphemes
|
||||
// are correctly accounted for.
|
||||
void UNICHARSET::SetPropertiesFromOther(const UNICHARSET& src) {
|
||||
for (int ch = 0; ch < size_used; ++ch) {
|
||||
const char* utf8 = id_to_unichar(ch);
|
||||
UNICHAR_PROPERTIES properties;
|
||||
if (src.GetStrProperties(utf8, &properties)) {
|
||||
// Setup the script_id, other_case, and mirror properly.
|
||||
const char* script = src.get_script_from_script_id(properties.script_id);
|
||||
properties.script_id = add_script(script);
|
||||
const char* other_case = src.id_to_unichar(properties.other_case);
|
||||
if (contains_unichar(other_case)) {
|
||||
properties.other_case = unichar_to_id(other_case);
|
||||
} else {
|
||||
properties.other_case = ch;
|
||||
}
|
||||
const char* mirror_str = src.id_to_unichar(properties.mirror);
|
||||
if (contains_unichar(mirror_str)) {
|
||||
properties.mirror = unichar_to_id(mirror_str);
|
||||
} else {
|
||||
properties.mirror = ch;
|
||||
}
|
||||
unichars[ch].properties.CopyFrom(properties);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Expands the tops and bottoms and widths for this unicharset given a
|
||||
// src unicharset with ranges in it. The unicharsets don't have to be the
|
||||
// same, and graphemes are correctly accounted for.
|
||||
void UNICHARSET::ExpandRangesFromOther(const UNICHARSET& src) {
|
||||
for (int ch = 0; ch < size_used; ++ch) {
|
||||
const char* utf8 = id_to_unichar(ch);
|
||||
UNICHAR_PROPERTIES properties;
|
||||
if (src.GetStrProperties(utf8, &properties)) {
|
||||
// Expand just the ranges from properties.
|
||||
unichars[ch].properties.ExpandRangesFrom(properties);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// For each id in src, if it does not occur in this, add it, as in
|
||||
// SetPropertiesFromOther, otherwise expand the ranges, as in
|
||||
// ExpandRangesFromOther.
|
||||
void UNICHARSET::AppendOtherUnicharset(const UNICHARSET& src) {
|
||||
for (int ch = 0; ch < src.size_used; ++ch) {
|
||||
const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
|
||||
const char* utf8 = src.id_to_unichar(ch);
|
||||
if (strcmp(utf8, " ") != 0 && src_props.AnyRangeEmpty()) {
|
||||
// Only use fully valid entries.
|
||||
tprintf("Bad properties for char %s: %d,%d %d,%d %d,%d %d,%d %d,%d\n",
|
||||
utf8, src_props.min_bottom, src_props.max_bottom,
|
||||
src_props.min_top, src_props.max_top,
|
||||
src_props.min_width, src_props.max_width,
|
||||
src_props.min_bearing, src_props.max_bearing,
|
||||
src_props.min_advance, src_props.max_advance);
|
||||
continue;
|
||||
}
|
||||
int id = size_used;
|
||||
if (contains_unichar(utf8)) {
|
||||
id = unichar_to_id(utf8);
|
||||
} else {
|
||||
unichar_insert(utf8);
|
||||
unichars[id].properties.SetRangesEmpty();
|
||||
}
|
||||
if (!unichars[id].properties.AnyRangeEmpty()) {
|
||||
// Just expand current ranges.
|
||||
unichars[id].properties.ExpandRangesFrom(src_props);
|
||||
} else {
|
||||
// Copy properties from src_props.
|
||||
unichars[id].properties.CopyFrom(src_props);
|
||||
// Setup the script_id, other_case and mirror properly.
|
||||
const char* script = src.get_script_from_script_id(src_props.script_id);
|
||||
unichars[id].properties.script_id = add_script(script);
|
||||
const char* other_case = src.id_to_unichar(src_props.other_case);
|
||||
if (!contains_unichar(other_case)) {
|
||||
unichar_insert(other_case);
|
||||
unichars[size_used - 1].properties.SetRangesEmpty();
|
||||
// Other_case will have its ranges set later as it is contained in src.
|
||||
}
|
||||
unichars[id].properties.other_case = unichar_to_id(other_case);
|
||||
const char* mirror_str = src.id_to_unichar(src_props.mirror);
|
||||
if (!contains_unichar(mirror_str)) {
|
||||
unichar_insert(mirror_str);
|
||||
unichars[size_used - 1].properties.SetRangesEmpty();
|
||||
// Mirror will have its ranges set later as it is contained in src.
|
||||
}
|
||||
unichars[id].properties.mirror = unichar_to_id(mirror_str);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Gets the properties for a grapheme string, combining properties for
|
||||
// multiple characters in a meaningful way where possible.
|
||||
// Returns false if no valid match was found in the unicharset.
|
||||
// NOTE that script_id, mirror, and other_case refer to this unicharset on
|
||||
// return and will need translation if the target unicharset is different.
|
||||
bool UNICHARSET::GetStrProperties(const char* utf8_str,
|
||||
UNICHAR_PROPERTIES* props) const {
|
||||
props->Init();
|
||||
props->SetRangesEmpty();
|
||||
props->min_advance = 0;
|
||||
props->max_advance = 0;
|
||||
int utf8_step = 0;
|
||||
int total_unicodes = 0;
|
||||
for (int offset = 0; utf8_str[offset] != '\0'; offset += utf8_step) {
|
||||
utf8_step = step(utf8_str + offset);
|
||||
if (utf8_step == 0) return false;
|
||||
int id = unichar_to_id(utf8_str + offset, utf8_step);
|
||||
if (id < 0) return false;
|
||||
const UNICHAR_PROPERTIES& src_props = unichars[id].properties;
|
||||
// Logical OR all the bools.
|
||||
if (src_props.isalpha) props->isalpha = true;
|
||||
if (src_props.islower) props->islower = true;
|
||||
if (src_props.isupper) props->isupper = true;
|
||||
if (src_props.isdigit) props->isdigit = true;
|
||||
if (src_props.ispunctuation) props->ispunctuation = true;
|
||||
if (src_props.isngram) props->isngram = true;
|
||||
if (src_props.enabled) props->enabled = true;
|
||||
// Min/max the tops/bottoms.
|
||||
UpdateRange(src_props.min_bottom, &props->min_bottom, &props->max_bottom);
|
||||
UpdateRange(src_props.max_bottom, &props->min_bottom, &props->max_bottom);
|
||||
UpdateRange(src_props.min_top, &props->min_top, &props->max_top);
|
||||
UpdateRange(src_props.max_top, &props->min_top, &props->max_top);
|
||||
int bearing = props->min_advance + src_props.min_bearing;
|
||||
if (total_unicodes == 0 || bearing < props->min_bearing)
|
||||
props->min_bearing = bearing;
|
||||
bearing = props->max_advance + src_props.max_bearing;
|
||||
if (total_unicodes == 0 || bearing < props->max_bearing)
|
||||
props->max_bearing = bearing;
|
||||
props->min_advance += src_props.min_advance;
|
||||
props->max_advance += src_props.max_advance;
|
||||
// With a single width, just use the widths stored in the unicharset.
|
||||
props->min_width = src_props.min_width;
|
||||
props->max_width = src_props.max_width;
|
||||
// Use the first script id, other_case, mirror, direction.
|
||||
// Note that these will need translation, except direction.
|
||||
if (total_unicodes == 0) {
|
||||
props->script_id = src_props.script_id;
|
||||
props->other_case = src_props.other_case;
|
||||
props->mirror = src_props.mirror;
|
||||
props->direction = src_props.direction;
|
||||
}
|
||||
// The normed string for the compound character is the concatenation of
|
||||
// the normed versions of the individual characters.
|
||||
props->normed += src_props.normed;
|
||||
++total_unicodes;
|
||||
}
|
||||
if (total_unicodes > 1) {
|
||||
// Estimate the total widths from the advance - bearing.
|
||||
props->min_width = props->min_advance - props->max_bearing;
|
||||
props->max_width = props->max_advance - props->min_bearing;
|
||||
}
|
||||
return total_unicodes > 0;
|
||||
}
|
||||
|
||||
unsigned int UNICHARSET::get_properties(UNICHAR_ID id) const {
|
||||
unsigned int properties = 0;
|
||||
if (this->get_isalpha(id))
|
||||
@ -271,27 +562,96 @@ bool UNICHARSET::save_to_file(FILE *file) const {
|
||||
for (UNICHAR_ID id = 0; id < this->size(); ++id) {
|
||||
int min_bottom, max_bottom, min_top, max_top;
|
||||
get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
|
||||
int min_width, max_width;
|
||||
get_width_range(id, &min_width, &max_width);
|
||||
int min_bearing, max_bearing;
|
||||
get_bearing_range(id, &min_bearing, &max_bearing);
|
||||
int min_advance, max_advance;
|
||||
get_advance_range(id, &min_advance, &max_advance);
|
||||
unsigned int properties = this->get_properties(id);
|
||||
if (strcmp(this->id_to_unichar(id), " ") == 0)
|
||||
if (strcmp(this->id_to_unichar(id), " ") == 0) {
|
||||
fprintf(file, "%s %x %s %d\n", "NULL", properties,
|
||||
this->get_script_from_script_id(this->get_script(id)),
|
||||
this->get_other_case(id));
|
||||
else
|
||||
fprintf(file, "%s %x %d,%d,%d,%d %s %d\t# %s\n",
|
||||
} else {
|
||||
fprintf(file,
|
||||
"%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %s %d %d %d %s\t# %s\n",
|
||||
this->id_to_unichar(id), properties,
|
||||
min_bottom, max_bottom, min_top, max_top,
|
||||
min_bottom, max_bottom, min_top, max_top, min_width, max_width,
|
||||
min_bearing, max_bearing, min_advance, max_advance,
|
||||
this->get_script_from_script_id(this->get_script(id)),
|
||||
this->get_other_case(id), this->debug_str(id).string());
|
||||
this->get_other_case(id), this->get_direction(id),
|
||||
this->get_mirror(id), this->get_normed_unichar(id),
|
||||
this->debug_str(id).string());
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
class InMemoryFilePointer {
|
||||
public:
|
||||
InMemoryFilePointer(const char *memory, int mem_size)
|
||||
: memory_(memory), fgets_ptr_(memory), mem_size_(mem_size) { }
|
||||
|
||||
char *fgets(char *orig_dst, int size) {
|
||||
const char *src_end = memory_ + mem_size_;
|
||||
char *dst_end = orig_dst + size - 1;
|
||||
if (size < 1) {
|
||||
return fgets_ptr_ < src_end ? orig_dst : NULL;
|
||||
}
|
||||
|
||||
char *dst = orig_dst;
|
||||
char ch = '^';
|
||||
while (fgets_ptr_ < src_end && dst < dst_end && ch != '\n') {
|
||||
ch = *dst++ = *fgets_ptr_++;
|
||||
}
|
||||
*dst = 0;
|
||||
return (dst == orig_dst) ? NULL : orig_dst;
|
||||
}
|
||||
|
||||
private:
|
||||
const char *memory_;
|
||||
const char *fgets_ptr_;
|
||||
const int mem_size_;
|
||||
};
|
||||
|
||||
bool UNICHARSET::load_from_inmemory_file(
|
||||
const char *memory, int mem_size, bool skip_fragments) {
|
||||
InMemoryFilePointer mem_fp(memory, mem_size);
|
||||
TessResultCallback2<char *, char *, int> *fgets_cb =
|
||||
NewPermanentTessCallback(&mem_fp, &InMemoryFilePointer::fgets);
|
||||
bool success = load_via_fgets(fgets_cb, skip_fragments);
|
||||
delete fgets_cb;
|
||||
return success;
|
||||
}
|
||||
|
||||
class LocalFilePointer {
|
||||
public:
|
||||
LocalFilePointer(FILE *stream) : fp_(stream) {}
|
||||
char *fgets(char *dst, int size) {
|
||||
return ::fgets(dst, size, fp_);
|
||||
}
|
||||
private:
|
||||
FILE *fp_;
|
||||
};
|
||||
|
||||
bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) {
|
||||
LocalFilePointer lfp(file);
|
||||
TessResultCallback2<char *, char *, int> *fgets_cb =
|
||||
NewPermanentTessCallback(&lfp, &LocalFilePointer::fgets);
|
||||
bool success = load_via_fgets(fgets_cb, skip_fragments);
|
||||
delete fgets_cb;
|
||||
return success;
|
||||
}
|
||||
|
||||
bool UNICHARSET::load_via_fgets(
|
||||
TessResultCallback2<char *, char *, int> *fgets_cb,
|
||||
bool skip_fragments) {
|
||||
int unicharset_size;
|
||||
char buffer[256];
|
||||
|
||||
this->clear();
|
||||
if (fgets(buffer, sizeof (buffer), file) == NULL ||
|
||||
if (fgets_cb->Run(buffer, sizeof(buffer)) == NULL ||
|
||||
sscanf(buffer, "%d", &unicharset_size) != 1) {
|
||||
return false;
|
||||
}
|
||||
@ -302,21 +662,53 @@ bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) {
|
||||
char script[64];
|
||||
|
||||
strcpy(script, null_script);
|
||||
this->unichars[id].properties.other_case = id;
|
||||
int min_bottom = 0;
|
||||
int max_bottom = MAX_UINT8;
|
||||
int min_top = 0;
|
||||
int max_top = MAX_UINT8;
|
||||
if (fgets(buffer, sizeof (buffer), file) == NULL ||
|
||||
(sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d", unichar, &properties,
|
||||
&min_bottom, &max_bottom, &min_top, &max_top,
|
||||
script, &(this->unichars[id].properties.other_case)) != 8 &&
|
||||
sscanf(buffer, "%s %x %63s %d", unichar, &properties,
|
||||
script, &(this->unichars[id].properties.other_case)) != 4 &&
|
||||
sscanf(buffer, "%s %x %63s", unichar, &properties, script) != 3 &&
|
||||
sscanf(buffer, "%s %x", unichar, &properties) != 2)) {
|
||||
int min_width = 0;
|
||||
int max_width = MAX_INT16;
|
||||
int min_bearing = 0;
|
||||
int max_bearing = MAX_INT16;
|
||||
int min_advance = 0;
|
||||
int max_advance = MAX_INT16;
|
||||
// TODO(eger): check that this default is ok
|
||||
// after enabling BiDi iterator for Arabic+Cube.
|
||||
int direction = UNICHARSET::U_LEFT_TO_RIGHT;
|
||||
UNICHAR_ID other_case = id;
|
||||
UNICHAR_ID mirror = id;
|
||||
char normed[64];
|
||||
int v = -1;
|
||||
if (fgets_cb->Run(buffer, sizeof (buffer)) == NULL ||
|
||||
((v = sscanf(buffer,
|
||||
"%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %63s %d %d %d %63s",
|
||||
unichar, &properties,
|
||||
&min_bottom, &max_bottom, &min_top, &max_top,
|
||||
&min_width, &max_width, &min_bearing, &max_bearing,
|
||||
&min_advance, &max_advance, script, &other_case,
|
||||
&direction, &mirror, normed)) != 17 &&
|
||||
(v = sscanf(buffer,
|
||||
"%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %63s %d %d %d",
|
||||
unichar, &properties,
|
||||
&min_bottom, &max_bottom, &min_top, &max_top,
|
||||
&min_width, &max_width, &min_bearing, &max_bearing,
|
||||
&min_advance, &max_advance,
|
||||
script, &other_case, &direction, &mirror)) != 16 &&
|
||||
(v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d %d %d",
|
||||
unichar, &properties,
|
||||
&min_bottom, &max_bottom, &min_top, &max_top,
|
||||
script, &other_case, &direction, &mirror)) != 10 &&
|
||||
(v = sscanf(buffer, "%s %x %d,%d,%d,%d %63s %d", unichar, &properties,
|
||||
&min_bottom, &max_bottom, &min_top, &max_top,
|
||||
script, &other_case)) != 8 &&
|
||||
(v = sscanf(buffer, "%s %x %63s %d", unichar, &properties,
|
||||
script, &other_case)) != 4 &&
|
||||
(v = sscanf(buffer, "%s %x %63s",
|
||||
unichar, &properties, script)) != 3 &&
|
||||
// Parenthesize the assignment so v receives sscanf's field count rather than
// the result of the != comparison; the v>3 / v>8 / v>16 checks below rely on it.
(v = sscanf(buffer, "%s %x", unichar, &properties)) != 2)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Skip fragments if needed.
|
||||
CHAR_FRAGMENT *frag = NULL;
|
||||
if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) {
|
||||
@ -338,6 +730,15 @@ bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) {
|
||||
this->set_script(id, script);
|
||||
this->unichars[id].properties.enabled = true;
|
||||
this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top);
|
||||
this->set_width_range(id, min_width, max_width);
|
||||
this->set_bearing_range(id, min_bearing, max_bearing);
|
||||
this->set_advance_range(id, min_advance, max_advance);
|
||||
this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
|
||||
ASSERT_HOST(other_case < unicharset_size);
|
||||
this->set_other_case(id, (v>3) ? other_case : id);
|
||||
ASSERT_HOST(mirror < unicharset_size);
|
||||
this->set_mirror(id, (v>8) ? mirror : id);
|
||||
this->set_normed(id, (v>16) ? normed : unichar);
|
||||
}
|
||||
post_load_setup();
|
||||
return true;
|
||||
@ -372,9 +773,11 @@ void UNICHARSET::post_load_setup() {
|
||||
++cap_height_alphas;
|
||||
}
|
||||
}
|
||||
|
||||
script_has_upper_lower_ = net_case_alphas > 0;
|
||||
script_has_xheight_ = script_has_upper_lower_ ||
|
||||
x_height_alphas > cap_height_alphas * kMinXHeightFraction;
|
||||
(x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
|
||||
cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
|
||||
|
||||
null_sid_ = get_script_id_from_name(null_script);
|
||||
ASSERT_HOST(null_sid_ == 0);
|
||||
@ -386,11 +789,15 @@ void UNICHARSET::post_load_setup() {
|
||||
hiragana_sid_ = get_script_id_from_name("Hiragana");
|
||||
katakana_sid_ = get_script_id_from_name("Katakana");
|
||||
|
||||
// Compute default script.
|
||||
// Compute default script. Use the highest-counting alpha script, that is
|
||||
// not the common script, as that still contains some "alphas".
|
||||
int* script_counts = new int[script_table_size_used];
|
||||
memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
|
||||
for (int id = 0; id < size_used; ++id)
|
||||
++script_counts[get_script(id)];
|
||||
for (int id = 0; id < size_used; ++id) {
|
||||
if (get_isalpha(id)) {
|
||||
++script_counts[get_script(id)];
|
||||
}
|
||||
}
|
||||
default_sid_ = 0;
|
||||
for (int s = 1; s < script_table_size_used; ++s) {
|
||||
if (script_counts[s] > script_counts[default_sid_] && s != common_sid_)
|
||||
@ -399,15 +806,21 @@ void UNICHARSET::post_load_setup() {
|
||||
delete [] script_counts;
|
||||
}
|
||||
|
||||
// Returns true if any script entry in the unicharset is for a
|
||||
// right_to_left language.
|
||||
bool UNICHARSET::any_right_to_left() const {
|
||||
for (int id = 0; id < script_table_size_used; ++id) {
|
||||
if (strcmp(script_table[id], "Arabic") == 0 ||
|
||||
strcmp(script_table[id], "Hebrew") == 0)
|
||||
return true;
|
||||
// Returns true if right_to_left scripts are significant in the unicharset,
|
||||
// but without being so sensitive that "universal" unicharsets containing
|
||||
// characters from many scripts, like orientation and script detection,
|
||||
// look like they are right_to_left.
|
||||
bool UNICHARSET::major_right_to_left() const {
|
||||
int ltr_count = 0;
|
||||
int rtl_count = 0;
|
||||
for (int id = 0; id < size_used; ++id) {
|
||||
int dir = get_direction(id);
|
||||
if (dir == UNICHARSET::U_LEFT_TO_RIGHT) ltr_count++;
|
||||
if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
|
||||
dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC ||
|
||||
dir == UNICHARSET::U_ARABIC_NUMBER) rtl_count++;
|
||||
}
|
||||
return false;
|
||||
return rtl_count > ltr_count;
|
||||
}
|
||||
|
||||
// Set a whitelist and/or blacklist of characters to recognize.
|
||||
@ -471,6 +884,21 @@ int UNICHARSET::add_script(const char* script) {
|
||||
return script_table_size_used++;
|
||||
}
|
||||
|
||||
// Returns the string that represents a fragment
|
||||
// with the given unichar, pos and total.
|
||||
STRING CHAR_FRAGMENT::to_string(const char *unichar, int pos, int total,
                                bool natural) {
  // A character made of a single chunk is written as the plain unichar.
  if (total == 1) {
    return STRING(unichar);
  }
  // Format the numeric tail: separator, position, then either the natural
  // flag or another separator, then the total chunk count.
  char tail[kMaxLen];
  snprintf(tail, kMaxLen, "%c%d%c%d", kSeparator, pos,
           natural ? kNaturalFlag : kSeparator, total);
  // Assemble: separator + unichar + tail.
  STRING fragment_text = "";
  fragment_text += kSeparator;
  fragment_text += unichar;
  fragment_text += tail;
  return fragment_text;
}
|
||||
|
||||
CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {
|
||||
const char *ptr = string;
|
||||
int len = strlen(string);
|
||||
@ -491,10 +919,14 @@ CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {
|
||||
ptr += step; // move to the next fragment separator
|
||||
int pos = 0;
|
||||
int total = 0;
|
||||
bool natural = false;
|
||||
char *end_ptr = NULL;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
if (ptr > string + len || *ptr != kSeparator) {
|
||||
return NULL; // failed to parse fragment representation
|
||||
if (i == 1 && *ptr == kNaturalFlag)
|
||||
natural = true;
|
||||
else
|
||||
return NULL; // Failed to parse fragment representation.
|
||||
}
|
||||
ptr++; // move to the next character
|
||||
i == 0 ? pos = static_cast<int>(strtol(ptr, &end_ptr, 10))
|
||||
@ -505,7 +937,7 @@ CHAR_FRAGMENT *CHAR_FRAGMENT::parse_from_string(const char *string) {
|
||||
return NULL; // malformed fragment representation
|
||||
}
|
||||
CHAR_FRAGMENT *fragment = new CHAR_FRAGMENT();
|
||||
fragment->set_all(unichar, pos, total);
|
||||
fragment->set_all(unichar, pos, total, natural);
|
||||
return fragment;
|
||||
}
|
||||
|
||||
|
@ -26,22 +26,29 @@
|
||||
#include "unicharmap.h"
|
||||
#include "params.h"
|
||||
|
||||
// Summary of the strong directionality found in a piece of text.
enum StrongScriptDirection {
  // Text contains only neutral characters.
  DIR_NEUTRAL = 0,
  // Text contains no Right-to-Left characters.
  DIR_LEFT_TO_RIGHT = 1,
  // Text contains no Left-to-Right characters.
  DIR_RIGHT_TO_LEFT = 2,
  // Text contains a mixture of left-to-right and right-to-left characters.
  DIR_MIX = 3
};
|
||||
|
||||
class CHAR_FRAGMENT {
|
||||
public:
|
||||
// Minimum number of characters used for fragment representation.
|
||||
static const int kMinLen = 6;
|
||||
// Maximum number of characters used for fragment representation.
|
||||
static const int kMaxLen = 3 + UNICHAR_LEN + 2;
|
||||
// Special character used in representing character fragments.
|
||||
static const char kSeparator = '|';
|
||||
// Maximum number of fragments per character.
|
||||
static const int kMaxChunks = 3;
|
||||
static const int kMaxChunks = 5;
|
||||
|
||||
// Setters and Getters.
|
||||
inline void set_all(const char *unichar, int pos, int total) {
|
||||
this->set_unichar(unichar);
|
||||
this->set_pos(pos);
|
||||
this->set_total(total);
|
||||
inline void set_all(const char *unichar, int pos, int total, bool natural) {
|
||||
set_unichar(unichar);
|
||||
set_pos(pos);
|
||||
set_total(total);
|
||||
set_natural(natural);
|
||||
}
|
||||
inline void set_unichar(const char *uch) {
|
||||
strncpy(this->unichar, uch, UNICHAR_LEN);
|
||||
@ -55,19 +62,11 @@ class CHAR_FRAGMENT {
|
||||
|
||||
// Returns the string that represents a fragment
|
||||
// with the given unichar, pos and total.
|
||||
static STRING to_string(const char *unichar, int pos, int total) {
|
||||
if (total == 1) return STRING(unichar);
|
||||
STRING result = "";
|
||||
result += kSeparator;
|
||||
result += unichar;
|
||||
char buffer[kMaxLen];
|
||||
snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos, kSeparator, total);
|
||||
result += buffer;
|
||||
return result;
|
||||
}
|
||||
static STRING to_string(const char *unichar, int pos, int total,
|
||||
bool natural);
|
||||
// Returns the string that represents this fragment.
|
||||
STRING to_string() const {
  // Forward all four fields, including the natural flag, to the static
  // formatter. (A leftover call to the old three-argument form used to
  // precede this and dropped the natural flag.)
  return to_string(unichar, pos, total, natural);
}
|
||||
|
||||
// Checks whether a fragment has the same unichar,
|
||||
@ -97,11 +96,19 @@ class CHAR_FRAGMENT {
|
||||
// Returns true if this fragment is an ending fragment.
|
||||
inline bool is_ending() const { return this->pos == this->total-1; }
|
||||
|
||||
// Returns true if the fragment was a separate component to begin with,
|
||||
// ie did not need chopping to be isolated, but may have been separated
|
||||
// out from a multi-outline blob.
|
||||
inline bool is_natural() const { return natural; }
|
||||
void set_natural(bool value) { natural = value; }
|
||||
|
||||
// Parses the string to see whether it represents a character fragment
|
||||
// (rather than a regular character). If so, allocates memory for a new
|
||||
// CHAR_FRAGMENT instance and fills it in with the corresponding fragment
|
||||
// information. Fragments are of the form:
|
||||
// |m|1|2, meaning chunk 1 of 2 of character m.
|
||||
// |m|1|2, meaning chunk 1 of 2 of character m, or
|
||||
// |:|1n2, meaning chunk 1 of 2 of character :, and no chopping was needed
|
||||
// to divide the parts, as they were already separate connected components.
|
||||
//
|
||||
// If parsing succeeded returns the pointer to the allocated CHAR_FRAGMENT
|
||||
// instance, otherwise (if the string does not represent a fragment or it
|
||||
@ -113,6 +120,10 @@ class CHAR_FRAGMENT {
|
||||
|
||||
private:
|
||||
char unichar[UNICHAR_LEN + 1];
|
||||
// True if the fragment was a separate component to begin with,
|
||||
// ie did not need chopping to be isolated, but may have been separated
|
||||
// out from a multi-outline blob.
|
||||
bool natural;
|
||||
inT16 pos; // fragment position in the character
|
||||
inT16 total; // total number of fragments in the character
|
||||
};
|
||||
@ -122,6 +133,35 @@ class CHAR_FRAGMENT {
|
||||
// by a unique number, from 0 to (size - 1).
|
||||
class UNICHARSET {
|
||||
public:
|
||||
// Custom list of characters and their ligature forms (UTF8)
|
||||
// These map to unicode values in the private use area (PUC) and are supported
|
||||
// by only few font families (eg. Wyld, Adobe Caslon Pro).
|
||||
static const char* kCustomLigatures[][2];
|
||||
|
||||
// ICU 2.0 UCharDirection enum (from third_party/icu/include/unicode/uchar.h)
|
||||
enum Direction {
  U_LEFT_TO_RIGHT = 0,               // strong left-to-right
  U_RIGHT_TO_LEFT = 1,               // strong right-to-left
  U_EUROPEAN_NUMBER = 2,
  U_EUROPEAN_NUMBER_SEPARATOR = 3,
  U_EUROPEAN_NUMBER_TERMINATOR = 4,
  U_ARABIC_NUMBER = 5,
  U_COMMON_NUMBER_SEPARATOR = 6,
  U_BLOCK_SEPARATOR = 7,
  U_SEGMENT_SEPARATOR = 8,
  U_WHITE_SPACE_NEUTRAL = 9,
  U_OTHER_NEUTRAL = 10,
  U_LEFT_TO_RIGHT_EMBEDDING = 11,
  U_LEFT_TO_RIGHT_OVERRIDE = 12,
  U_RIGHT_TO_LEFT_ARABIC = 13,       // strong right-to-left (Arabic)
  U_RIGHT_TO_LEFT_EMBEDDING = 14,
  U_RIGHT_TO_LEFT_OVERRIDE = 15,
  U_POP_DIRECTIONAL_FORMAT = 16,
  U_DIR_NON_SPACING_MARK = 17,
  U_BOUNDARY_NEUTRAL = 18,
  // Number of direction values above; keep this entry last.
  U_CHAR_DIRECTION_COUNT
};
|
||||
|
||||
// Create an empty UNICHARSET
|
||||
UNICHARSET();
|
||||
|
||||
@ -142,10 +182,21 @@ class UNICHARSET {
|
||||
// ensures there is a legal match after it.
|
||||
int step(const char* str) const;
|
||||
|
||||
// Return whether the given UTF-8 string is encodable with this UNICHARSET.
|
||||
// If not encodable, write the first byte offset which cannot be converted
|
||||
// into the second (return) argument.
|
||||
bool encodable_string(const char *str, int *first_bad_position) const;
|
||||
|
||||
// Return the unichar representation corresponding to the given UNICHAR_ID
|
||||
// within the UNICHARSET.
|
||||
const char* const id_to_unichar(UNICHAR_ID id) const;
|
||||
|
||||
// Return the UTF8 representation corresponding to the given UNICHAR_ID after
|
||||
// resolving any private encodings internal to Tesseract. This method is
|
||||
// preferable to id_to_unichar for outputting text that will be visible to
|
||||
// external applications.
|
||||
const char* const id_to_unichar_ext(UNICHAR_ID id) const;
|
||||
|
||||
// Return a STRING that reformats the utf8 str into the str followed
|
||||
// by its hex unicodes.
|
||||
static STRING debug_utf8_str(const char* str);
|
||||
@ -163,7 +214,8 @@ class UNICHARSET {
|
||||
// Return true if the given unichar id exists within the set.
|
||||
// Relies on the fact that unichar ids are contiguous in the unicharset.
|
||||
bool contains_unichar_id(UNICHAR_ID unichar_id) const {
  // Valid ids are exactly 0 .. size_used - 1. The lower bound matters:
  // without it a negative id other than INVALID_UNICHAR_ID would be accepted
  // and later used to index the unichars array.
  return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
      unichar_id >= 0;
}
|
||||
|
||||
// Return true if the given unichar representation exists within the set.
|
||||
@ -237,6 +289,16 @@ class UNICHARSET {
|
||||
// Returns true if the operation is successful.
|
||||
bool save_to_file(FILE *file) const;
|
||||
|
||||
// Load a unicharset from a unicharset file that has been loaded into
|
||||
// the given memory buffer.
|
||||
// Returns true if the operation is successful.
|
||||
bool load_from_inmemory_file(const char* const memory, int mem_size,
|
||||
bool skip_fragments);
|
||||
// Returns true if the operation is successful.
|
||||
bool load_from_inmemory_file(const char* const memory, int mem_size) {
|
||||
return load_from_inmemory_file(memory, mem_size, false);
|
||||
}
|
||||
|
||||
// Opens the file indicated by filename and loads the UNICHARSET
|
||||
// from the given file. The previous data is lost.
|
||||
// Returns true if the operation is successful.
|
||||
@ -247,6 +309,7 @@ class UNICHARSET {
|
||||
fclose(file);
|
||||
return result;
|
||||
}
|
||||
// returns true if the operation is successful.
|
||||
bool load_from_file(const char* const filename) {
|
||||
return load_from_file(filename, false);
|
||||
}
|
||||
@ -261,9 +324,11 @@ class UNICHARSET {
|
||||
// during set_unicharset_properties.
|
||||
void post_load_setup();
|
||||
|
||||
// Returns true if any script entry in the unicharset is for a
|
||||
// right_to_left language.
|
||||
bool any_right_to_left() const;
|
||||
// Returns true if right_to_left scripts are significant in the unicharset,
|
||||
// but without being so sensitive that "universal" unicharsets containing
|
||||
// characters from many scripts, like orientation and script detection,
|
||||
// look like they are right_to_left.
|
||||
bool major_right_to_left() const;
|
||||
|
||||
// Set a whitelist and/or blacklist of characters to recognize.
|
||||
// An empty or NULL whitelist enables everything (minus any blacklist).
|
||||
@ -315,40 +380,85 @@ class UNICHARSET {
|
||||
unichars[unichar_id].properties.other_case = other_case;
|
||||
}
|
||||
|
||||
// Set the direction property of the given unichar to the given value.
|
||||
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value) {
  // Stores the direction in the slot for unichar_id.
  // No range check is done on unichar_id here.
  (unichars[unichar_id]).properties.direction = value;
}
|
||||
|
||||
// Set mirror unichar id in the properties for the given unichar id.
|
||||
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror) {
  // Stores the mirror id in the slot for unichar_id.
  // No range check is done on either id here.
  (unichars[unichar_id]).properties.mirror = mirror;
}
|
||||
|
||||
// Record normalized version of unichar with the given unichar_id.
|
||||
void set_normed(UNICHAR_ID unichar_id, const char* normed) {
  // Assigns the normalized form to the slot's normed member.
  // No range check is done on unichar_id here.
  (unichars[unichar_id]).properties.normed = normed;
}
|
||||
|
||||
// Return the isalpha property of the given unichar.
|
||||
bool get_isalpha(UNICHAR_ID unichar_id) const {
  // The invalid id is reported as not alphabetic.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return false;
  }
  // Any other id must belong to this unicharset.
  ASSERT_HOST(contains_unichar_id(unichar_id));
  return unichars[unichar_id].properties.isalpha;
}
|
||||
|
||||
// Return the islower property of the given unichar.
|
||||
bool get_islower(UNICHAR_ID unichar_id) const {
  // The invalid id is reported as not lower case.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return false;
  }
  // Any other id must belong to this unicharset.
  ASSERT_HOST(contains_unichar_id(unichar_id));
  return unichars[unichar_id].properties.islower;
}
|
||||
|
||||
// Return the isupper property of the given unichar.
|
||||
bool get_isupper(UNICHAR_ID unichar_id) const {
  // The invalid id is reported as not upper case.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return false;
  }
  // Any other id must belong to this unicharset.
  ASSERT_HOST(contains_unichar_id(unichar_id));
  return unichars[unichar_id].properties.isupper;
}
|
||||
|
||||
// Return the isdigit property of the given unichar.
|
||||
bool get_isdigit(UNICHAR_ID unichar_id) const {
  // The invalid id is reported as not a digit.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return false;
  }
  // Any other id must belong to this unicharset.
  ASSERT_HOST(contains_unichar_id(unichar_id));
  return unichars[unichar_id].properties.isdigit;
}
|
||||
|
||||
// Return the ispunctuation property of the given unichar.
|
||||
bool get_ispunctuation(UNICHAR_ID unichar_id) const {
  // The invalid id is reported as not punctuation.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return false;
  }
  // Any other id must belong to this unicharset.
  ASSERT_HOST(contains_unichar_id(unichar_id));
  return unichars[unichar_id].properties.ispunctuation;
}
|
||||
|
||||
// Return the isngram property of the given unichar.
|
||||
bool get_isngram(UNICHAR_ID unichar_id) const {
  // The invalid id is reported as not an ngram.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return false;
  }
  // Any other id must belong to this unicharset.
  ASSERT_HOST(contains_unichar_id(unichar_id));
  return unichars[unichar_id].properties.isngram;
}
|
||||
|
||||
// Returns whether the unichar id represents a unicode value in the private
|
||||
// use area.
|
||||
bool get_isprivate(UNICHAR_ID unichar_id) const;
|
||||
|
||||
// Returns true if the ids have useful min/max top/bottom values.
|
||||
bool top_bottom_useful() const {
|
||||
return top_bottom_set_;
|
||||
}
|
||||
// Sets all ranges to empty, so they can be expanded to set the values.
|
||||
void set_ranges_empty();
|
||||
// Sets all the properties for this unicharset given a src_unicharset with
|
||||
// everything set. The unicharsets don't have to be the same, and graphemes
|
||||
// are correctly accounted for.
|
||||
void SetPropertiesFromOther(const UNICHARSET& src);
|
||||
// Expands the tops and bottoms and widths for this unicharset given a
|
||||
// src_unicharset with ranges in it. The unicharsets don't have to be the
|
||||
// same, and graphemes are correctly accounted for.
|
||||
void ExpandRangesFromOther(const UNICHARSET& src);
|
||||
// For each id in src, if it does not occur in this, add it, as in
|
||||
// SetPropertiesFromOther, otherwise expand the ranges, as in
|
||||
// ExpandRangesFromOther.
|
||||
void AppendOtherUnicharset(const UNICHARSET& src);
|
||||
// Returns the min and max bottom and top of the given unichar in
|
||||
// baseline-normalized coordinates, ie, where the baseline is
|
||||
// kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight
|
||||
@ -356,6 +466,12 @@ class UNICHARSET {
|
||||
void get_top_bottom(UNICHAR_ID unichar_id,
|
||||
int* min_bottom, int* max_bottom,
|
||||
int* min_top, int* max_top) const {
|
||||
if (INVALID_UNICHAR_ID == unichar_id) {
|
||||
*min_bottom = *min_top = 0;
|
||||
*max_bottom = *max_top = 256; // kBlnCellHeight
|
||||
return;
|
||||
}
|
||||
ASSERT_HOST(contains_unichar_id(unichar_id));
|
||||
*min_bottom = unichars[unichar_id].properties.min_bottom;
|
||||
*max_bottom = unichars[unichar_id].properties.max_bottom;
|
||||
*min_top = unichars[unichar_id].properties.min_top;
|
||||
@ -373,11 +489,76 @@ class UNICHARSET {
|
||||
unichars[unichar_id].properties.max_top =
|
||||
static_cast<uinT8>(ClipToRange(max_top, 0, MAX_UINT8));
|
||||
}
|
||||
// Returns the width range of the given unichar in baseline-normalized
|
||||
// coordinates, ie, where the baseline is kBlnBaselineOffset and the
|
||||
// meanline is kBlnBaselineOffset + kBlnXHeight.
|
||||
// (See normalis.h for the definitions).
|
||||
void get_width_range(UNICHAR_ID unichar_id,
                     int* min_width, int* max_width) const {
  if (unichar_id != INVALID_UNICHAR_ID) {
    // Known id: report the stored width limits.
    ASSERT_HOST(contains_unichar_id(unichar_id));
    *min_width = unichars[unichar_id].properties.min_width;
    *max_width = unichars[unichar_id].properties.max_width;
  } else {
    // Invalid id: report the widest possible range.
    *min_width = 0;
    *max_width = 256;  // kBlnCellHeight;
  }
}
|
||||
void set_width_range(UNICHAR_ID unichar_id, int min_width, int max_width) {
  // Both limits are clipped into [0, MAX_INT16] before being narrowed to
  // the inT16 storage fields.
  const int clipped_min = ClipToRange(min_width, 0, MAX_INT16);
  const int clipped_max = ClipToRange(max_width, 0, MAX_INT16);
  unichars[unichar_id].properties.min_width = static_cast<inT16>(clipped_min);
  unichars[unichar_id].properties.max_width = static_cast<inT16>(clipped_max);
}
|
||||
// Returns the range of the x-bearing of the given unichar in
|
||||
// baseline-normalized coordinates, ie, where the baseline is
|
||||
// kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight.
|
||||
// (See normalis.h for the definitions).
|
||||
void get_bearing_range(UNICHAR_ID unichar_id,
                       int* min_bearing, int* max_bearing) const {
  if (unichar_id != INVALID_UNICHAR_ID) {
    // Known id: report the stored x-bearing limits.
    ASSERT_HOST(contains_unichar_id(unichar_id));
    *min_bearing = unichars[unichar_id].properties.min_bearing;
    *max_bearing = unichars[unichar_id].properties.max_bearing;
  } else {
    // Invalid id: both limits are reported as zero.
    *min_bearing = 0;
    *max_bearing = 0;
  }
}
|
||||
void set_bearing_range(UNICHAR_ID unichar_id,
                       int min_bearing, int max_bearing) {
  // Both limits are clipped into [0, MAX_INT16] before being narrowed to
  // the inT16 storage fields.
  // NOTE(review): negative bearings are clamped to 0 here - confirm that
  // this is intended.
  const int clipped_min = ClipToRange(min_bearing, 0, MAX_INT16);
  const int clipped_max = ClipToRange(max_bearing, 0, MAX_INT16);
  unichars[unichar_id].properties.min_bearing =
      static_cast<inT16>(clipped_min);
  unichars[unichar_id].properties.max_bearing =
      static_cast<inT16>(clipped_max);
}
|
||||
// Returns the range of the x-advance of the given unichar in
|
||||
// baseline-normalized coordinates, ie, where the baseline is
|
||||
// kBlnBaselineOffset and the meanline is kBlnBaselineOffset + kBlnXHeight.
|
||||
// (See normalis.h for the definitions).
|
||||
void get_advance_range(UNICHAR_ID unichar_id,
                       int* min_advance, int* max_advance) const {
  if (unichar_id != INVALID_UNICHAR_ID) {
    // Known id: report the stored x-advance limits.
    ASSERT_HOST(contains_unichar_id(unichar_id));
    *min_advance = unichars[unichar_id].properties.min_advance;
    *max_advance = unichars[unichar_id].properties.max_advance;
  } else {
    // Invalid id: both limits are reported as zero.
    *min_advance = 0;
    *max_advance = 0;
  }
}
|
||||
void set_advance_range(UNICHAR_ID unichar_id,
                       int min_advance, int max_advance) {
  // Both limits are clipped into [0, MAX_INT16] before being narrowed to
  // the inT16 storage fields.
  const int clipped_min = ClipToRange(min_advance, 0, MAX_INT16);
  const int clipped_max = ClipToRange(max_advance, 0, MAX_INT16);
  unichars[unichar_id].properties.min_advance =
      static_cast<inT16>(clipped_min);
  unichars[unichar_id].properties.max_advance =
      static_cast<inT16>(clipped_max);
}
|
||||
|
||||
// Return the script name of the given unichar.
|
||||
// The returned pointer will always be the same for the same script, it's
|
||||
// managed by unicharset and thus MUST NOT be deleted
|
||||
int get_script(UNICHAR_ID unichar_id) const {
  // The invalid id maps to the null script id.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return null_sid_;
  }
  // Any other id must belong to this unicharset.
  ASSERT_HOST(contains_unichar_id(unichar_id));
  return unichars[unichar_id].properties.script_id;
}
|
||||
|
||||
@ -396,17 +577,37 @@ class UNICHARSET {
|
||||
|
||||
// Get other_case unichar id in the properties for the given unichar id.
|
||||
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const {
  // The invalid id has no other-case partner.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return INVALID_UNICHAR_ID;
  }
  // Any other id must belong to this unicharset.
  ASSERT_HOST(contains_unichar_id(unichar_id));
  return unichars[unichar_id].properties.other_case;
}
|
||||
|
||||
// Returns the direction property of the given unichar.
|
||||
Direction get_direction(UNICHAR_ID unichar_id) const {
  // The invalid id is treated as an "other neutral" character.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return UNICHARSET::U_OTHER_NEUTRAL;
  }
  // Any other id must belong to this unicharset.
  ASSERT_HOST(contains_unichar_id(unichar_id));
  return unichars[unichar_id].properties.direction;
}
|
||||
|
||||
// Get mirror unichar id in the properties for the given unichar id.
|
||||
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const {
  // The invalid id has no mirror.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return INVALID_UNICHAR_ID;
  }
  // Any other id must belong to this unicharset.
  ASSERT_HOST(contains_unichar_id(unichar_id));
  return unichars[unichar_id].properties.mirror;
}
|
||||
|
||||
// Returns UNICHAR_ID of the corresponding lower-case unichar.
|
||||
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const {
  // The invalid id is passed through unchanged.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return INVALID_UNICHAR_ID;
  }
  ASSERT_HOST(contains_unichar_id(unichar_id));
  // A lower-case unichar maps to itself; anything else maps to its
  // other_case entry.
  return unichars[unichar_id].properties.islower
      ? unichar_id
      : unichars[unichar_id].properties.other_case;
}
|
||||
|
||||
// Returns UNICHAR_ID of the corresponding upper-case unichar.
|
||||
UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const {
  // The invalid id is passed through unchanged.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return INVALID_UNICHAR_ID;
  }
  ASSERT_HOST(contains_unichar_id(unichar_id));
  // An upper-case unichar maps to itself; anything else maps to its
  // other_case entry.
  return unichars[unichar_id].properties.isupper
      ? unichar_id
      : unichars[unichar_id].properties.other_case;
}
|
||||
@ -414,6 +615,8 @@ class UNICHARSET {
|
||||
// Return a pointer to the CHAR_FRAGMENT class if the given
|
||||
// unichar id represents a character fragment.
|
||||
const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
  // The invalid id never represents a fragment.
  if (unichar_id == INVALID_UNICHAR_ID) {
    return NULL;
  }
  // Any other id must belong to this unicharset.
  ASSERT_HOST(contains_unichar_id(unichar_id));
  return unichars[unichar_id].properties.fragment;
}
|
||||
|
||||
@ -504,6 +707,11 @@ class UNICHARSET {
|
||||
return get_ispunctuation(unichar_to_id(unichar_repr, length));
|
||||
}
|
||||
|
||||
// Returns normalized version of unichar with the given unichar_id.
|
||||
const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
  // Returns the C string held by the slot's normed member.
  // NOTE(review): unlike the other getters there is no INVALID_UNICHAR_ID
  // guard or range assertion here - confirm callers only pass valid ids.
  return (unichars[unichar_id]).properties.normed.string();
}
|
||||
|
||||
// Return the script name of the given unichar representation.
|
||||
// Only the first length characters from unichar_repr are used.
|
||||
// The returned pointer will always be the same for the same script, it's
|
||||
@ -574,7 +782,20 @@ class UNICHARSET {
|
||||
|
||||
struct UNICHAR_PROPERTIES {
|
||||
UNICHAR_PROPERTIES();
|
||||
// Initializes all properties to sensible default values.
|
||||
void Init();
|
||||
// Sets all ranges wide open. Initialization default in case there are
|
||||
// no useful values available.
|
||||
void SetRangesOpen();
|
||||
// Sets all ranges to empty. Used before expanding with font-based data.
|
||||
void SetRangesEmpty();
|
||||
// Returns true if any of the top/bottom/width/bearing/advance ranges is
|
||||
// empty.
|
||||
bool AnyRangeEmpty() const;
|
||||
// Expands the ranges with the ranges from the src properties.
|
||||
void ExpandRangesFrom(const UNICHAR_PROPERTIES& src);
|
||||
// Copies the properties from src into this.
|
||||
void CopyFrom(const UNICHAR_PROPERTIES& src);
|
||||
|
||||
bool isalpha;
|
||||
bool islower;
|
||||
@ -591,9 +812,25 @@ class UNICHARSET {
|
||||
uinT8 max_bottom;
|
||||
uinT8 min_top;
|
||||
uinT8 max_top;
|
||||
// Limits on the widths of bounding box, also in baseline-normalized coords.
|
||||
inT16 min_width;
|
||||
inT16 max_width;
|
||||
// Limits on the x-bearing and advance, also in baseline-normalized coords.
|
||||
inT16 min_bearing;
|
||||
inT16 max_bearing;
|
||||
inT16 min_advance;
|
||||
inT16 max_advance;
|
||||
int script_id;
|
||||
UNICHAR_ID other_case; // id of the corresponding upper/lower case unichar
|
||||
|
||||
Direction direction; // direction of this unichar
|
||||
// Mirror property is useful for reverse DAWG lookup for words in
|
||||
// right-to-left languages (e.g. "(word)" would be in
|
||||
// '[open paren]' 'w' 'o' 'r' 'd' '[close paren]' in a UTF8 string.
|
||||
// However, what we want in our DAWG is
|
||||
// '[open paren]', 'd', 'r', 'o', 'w', '[close paren]' not
|
||||
// '[close paren]', 'd', 'r', 'o', 'w', '[open paren]'.
|
||||
UNICHAR_ID mirror;
|
||||
STRING normed; // normalized version of this unichar
|
||||
// Contains meta information about the fragment if a unichar represents
|
||||
// a fragment of a character, otherwise should be set to NULL.
|
||||
// It is assumed that character fragments are added to the unicharset
|
||||
@ -606,6 +843,20 @@ class UNICHARSET {
|
||||
UNICHAR_PROPERTIES properties;
|
||||
};
|
||||
|
||||
// Gets the properties for a grapheme string, combining properties for
|
||||
// multiple characters in a meaningful way where possible.
|
||||
// Returns false if no valid match was found in the unicharset.
|
||||
// NOTE that script_id, mirror, and other_case refer to this unicharset on
|
||||
// return and will need redirecting if the target unicharset is different.
|
||||
bool GetStrProperties(const char* utf8_str,
|
||||
UNICHAR_PROPERTIES* props) const;
|
||||
|
||||
// Load ourselves from a "file" where our only interface to the file is
|
||||
// an implementation of fgets(). This is the parsing primitive accessed by
|
||||
// the public routines load_from_file() and load_from_inmemory_file().
|
||||
bool load_via_fgets(TessResultCallback2<char *, char *, int> *fgets_cb,
|
||||
bool skip_fragments);
|
||||
|
||||
UNICHAR_SLOT* unichars;
|
||||
UNICHARMAP ids;
|
||||
int size_used;
|
||||
|
@ -85,7 +85,7 @@ class UnicityTable {
|
||||
/// The Callback given must be permanent since they will be called more than
|
||||
/// once. The given callback will be deleted at the end.
|
||||
/// Returns false on read/write error.
|
||||
bool write(FILE* f, TessResultCallback2<bool, FILE*, T const &>* cb);
|
||||
bool write(FILE* f, TessResultCallback2<bool, FILE*, T const &>* cb) const;
|
||||
/// swap is used to switch the endianness.
|
||||
bool read(FILE* f, TessResultCallback3<bool, FILE*, T*, bool>* cb, bool swap);
|
||||
|
||||
@ -187,8 +187,8 @@ void UnicityTable<T>::clear() {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool UnicityTable<T>::write(FILE* f,
|
||||
TessResultCallback2<bool, FILE*, T const &>* cb) {
|
||||
bool UnicityTable<T>::write(
|
||||
FILE* f, TessResultCallback2<bool, FILE*, T const &>* cb) const {
|
||||
return table_.write(f, cb);
|
||||
}
|
||||
|
||||
|
57
ccutil/unicodes.cpp
Normal file
57
ccutil/unicodes.cpp
Normal file
@ -0,0 +1,57 @@
|
||||
/**********************************************************************
|
||||
* File: unicodes.cpp
|
||||
* Description: Unicode related machinery
|
||||
* Author: David Eger
|
||||
* Created: Wed Jun 15 16:37:50 PST 2011
|
||||
*
|
||||
* (C) Copyright 2011, Google, Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#include "unicodes.h"
|
||||
#include "host.h" // for NULL
|
||||
|
||||
namespace tesseract {

// Unicode line and paragraph separators, as UTF-8 strings.
const char *kUTF8LineSeparator = "\u2028";  // "\xe2\x80\xa8";
const char *kUTF8ParagraphSeparator = "\u2029";  // "\xe2\x80\xa9";

// Bidirectional formatting characters, as UTF-8 strings.
const char *kLRM = "\u200E";  // Left-to-Right Mark
const char *kRLM = "\u200F";  // Right-to-Left Mark
// U+202B is Right-to-Left Embedding; U+202A (the value previously stored
// here) is Left-to-Right Embedding, which contradicts the constant's name.
const char *kRLE = "\u202B";  // Right-to-Left Embedding
const char *kPDF = "\u202C";  // Pop Directional Formatting

// Hyphen-like characters. NULL-terminated.
const char *kHyphenLikeUTF8[] = {
  "-",       // ASCII hyphen-minus
  "\u05BE",  // word hyphen in Hebrew
  "\u2010",  // hyphen
  "\u2011",  // non-breaking hyphen
  "\u2012",  // a hyphen the same width as digits
  "\u2013",  // en dash
  "\u2014",  // em dash
  "\u2015",  // horizontal bar
  "\u2212",  // arithmetic minus sign
  "\uFE58",  // small em dash
  "\uFE63",  // small hyphen-minus
  "\uFF0D",  // fullwidth hyphen-minus
  NULL,      // end of our list
};

// Apostrophe-like characters. NULL-terminated.
const char *kApostropheLikeUTF8[] = {
  "'",       // ASCII apostrophe
  "`",       // ASCII backtick
  "\u2018",  // opening single quote
  "\u2019",  // closing single quote
  "\u2032",  // mathematical prime mark
  NULL,      // end of our list.
};

}  // namespace tesseract
|
39
ccutil/unicodes.h
Normal file
39
ccutil/unicodes.h
Normal file
@ -0,0 +1,39 @@
|
||||
/**********************************************************************
|
||||
* File: unicodes.h
|
||||
* Description: Unicode related machinery
|
||||
* Author: David Eger
|
||||
* Created: Wed Jun 15 16:37:50 PST 2011
|
||||
*
|
||||
* (C) Copyright 2011, Google, Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
** you may not use this file except in compliance with the License.
|
||||
** You may obtain a copy of the License at
|
||||
** http://www.apache.org/licenses/LICENSE-2.0
|
||||
** Unless required by applicable law or agreed to in writing, software
|
||||
** distributed under the License is distributed on an "AS IS" BASIS,
|
||||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
** See the License for the specific language governing permissions and
|
||||
** limitations under the License.
|
||||
*
|
||||
**********************************************************************/
|
||||
|
||||
#ifndef TESSERACT_CCUTIL_UNICODES_H__
|
||||
#define TESSERACT_CCUTIL_UNICODES_H__
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
extern const char *kUTF8LineSeparator;
|
||||
extern const char *kUTF8ParagraphSeparator;
|
||||
extern const char *kLRM; // Left-to-Right Mark
|
||||
extern const char *kRLM; // Right-to-Left Mark
|
||||
extern const char *kRLE; // Right-to-Left Embedding
|
||||
extern const char *kPDF; // Pop Directional Formatting
|
||||
|
||||
// The following are confusable internal word punctuation symbols
|
||||
// which we normalize to the first variant when matching in dawgs.
|
||||
extern const char *kHyphenLikeUTF8[];
|
||||
extern const char *kApostropheLikeUTF8[];
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif // TESSERACT_CCUTIL_UNICODES_H__
|
@ -7,7 +7,7 @@
|
||||
# ----------------------------------------
|
||||
|
||||
AC_PREREQ(2.50)
|
||||
AC_INIT([tesseract], [3.01], [theraysmith@gmail.com])
|
||||
AC_INIT([tesseract], [3.02], [theraysmith@gmail.com])
|
||||
AC_CONFIG_MACRO_DIR([m4])
|
||||
AC_REVISION($Id: configure.ac,v 1.4 2007/02/02 22:38:17 theraysmith Exp $)
|
||||
AC_CONFIG_AUX_DIR(config)
|
||||
@ -18,8 +18,8 @@ AC_CANONICAL_HOST
|
||||
# Define date of package, etc. Could be useful in auto-generated
|
||||
# documentation.
|
||||
# TODO(luc) Generate good documentation using doxygen or equivalent
|
||||
PACKAGE_YEAR=2011
|
||||
PACKAGE_DATE="10/16"
|
||||
PACKAGE_YEAR=2012
|
||||
PACKAGE_DATE="02/01"
|
||||
|
||||
AC_DEFINE_UNQUOTED(PACKAGE_NAME,["${PACKAGE_NAME}"],[Name of package])
|
||||
AC_DEFINE_UNQUOTED(PACKAGE_VERSION,["${PACKAGE_VERSION}"],[Version number])
|
||||
@ -35,7 +35,7 @@ GENERIC_LIBRARY_NAME=tesseract
|
||||
|
||||
# Release versioning
|
||||
GENERIC_MAJOR_VERSION=3
|
||||
GENERIC_MINOR_VERSION=1
|
||||
GENERIC_MINOR_VERSION=2
|
||||
GENERIC_MICRO_VERSION=0
|
||||
|
||||
# API version (often = GENERIC_MAJOR_VERSION.GENERIC_MINOR_VERSION)
|
||||
|
@ -11,57 +11,22 @@
|
||||
**************************************************************************/
|
||||
#include "freelist.h"
|
||||
|
||||
#include <memory.h>
|
||||
|
||||
#include "danerror.h"
|
||||
#include "memry.h"
|
||||
#include "tprintf.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
|
||||
// With improvements in OS memory allocators, internal memory management is
|
||||
// no longer required, so these functions all map to their malloc-family
|
||||
// equivalents.
|
||||
|
||||
|
||||
/**********************************************************************
|
||||
* memalloc
|
||||
*
|
||||
* Memory allocator with protection.
|
||||
**********************************************************************/
|
||||
int *memalloc(int size) {
|
||||
return ((int *) alloc_mem (size));
|
||||
return static_cast<int*>(malloc(static_cast<size_t>(size)));
|
||||
}
|
||||
|
||||
|
||||
/**********************************************************************
|
||||
* memrealloc
|
||||
*
|
||||
* Memory allocator with protection.
|
||||
**********************************************************************/
|
||||
int *memrealloc(void *ptr, int size, int oldsize) {
|
||||
int shiftsize;
|
||||
int *newbuf;
|
||||
|
||||
shiftsize = size > oldsize ? oldsize : size;
|
||||
newbuf = (int *) alloc_mem (size);
|
||||
memcpy(newbuf, ptr, shiftsize);
|
||||
free_mem(ptr);
|
||||
return newbuf;
|
||||
return static_cast<int*>(realloc(ptr, static_cast<size_t>(size)));
|
||||
}
|
||||
|
||||
|
||||
/**********************************************************************
|
||||
* memfree
|
||||
*
|
||||
* Memory allocator with protection.
|
||||
**********************************************************************/
|
||||
void memfree(void *element) {
|
||||
if (element) {
|
||||
free_mem(element);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**********************************************************************
|
||||
* mem_tidy
|
||||
*
|
||||
* Do nothing.
|
||||
**********************************************************************/
|
||||
void mem_tidy(int level) {
|
||||
check_mem ("Old tidy", level);
|
||||
free(element);
|
||||
}
|
||||
|
@ -40,6 +40,4 @@ int *memrealloc(void *ptr, int size, int oldsize);
|
||||
|
||||
void memfree(void *element);
|
||||
|
||||
void mem_tidy(int level);
|
||||
|
||||
#endif
|
||||
|
@ -26,6 +26,9 @@ class CCUtil;
|
||||
class Image {
|
||||
public:
|
||||
Image(CCUtil* ccutil_ptr);
|
||||
const CCUtil* getCCUtil() const {
|
||||
return ccutil_ptr_;
|
||||
}
|
||||
CCUtil* getCCUtil() {
|
||||
return ccutil_ptr_;
|
||||
}
|
||||
|
@ -23,7 +23,6 @@ void sv_show_sub_image(IMAGE* source, // Image to show.
|
||||
ScrollView* window, // Window to draw in.
|
||||
inT32 xpos, // Place to show bottom-left.
|
||||
inT32 ypos) { // Y position.
|
||||
|
||||
Pix* pix;
|
||||
if (xstart != 0 || ystart != 0 ||
|
||||
xext != source->get_xsize() || yext != source->get_ysize()) {
|
||||
|
@ -52,7 +52,6 @@ const int kMaxIntPairSize = 45; // Holds %d,%d, for upto 64 bit.
|
||||
|
||||
#include "allheaders.h"
|
||||
|
||||
|
||||
struct SVPolyLineBuffer {
|
||||
bool empty; // Independent indicator to allow SendMsg to call SendPolygon.
|
||||
std::vector<int> xcoords;
|
||||
|
@ -31,6 +31,8 @@
|
||||
|
||||
#ifndef TESSERACT_VIEWER_SCROLLVIEW_H__
|
||||
#define TESSERACT_VIEWER_SCROLLVIEW_H__
|
||||
// TODO(rays) Move ScrollView into the tesseract namespace.
|
||||
#ifndef OCR_SCROLLVIEW_H__
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
@ -412,4 +414,5 @@ class ScrollView {
|
||||
#endif // GRAPHICS_DISABLED
|
||||
};
|
||||
|
||||
#endif // OCR_SCROLLVIEW_H__
|
||||
#endif // TESSERACT_VIEWER_SCROLLVIEW_H__
|
||||
|
@ -42,6 +42,7 @@ struct addrinfo {
|
||||
#ifdef __linux__
|
||||
#include <sys/prctl.h>
|
||||
#endif
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#include <cstdlib>
|
||||
|
Loading…
Reference in New Issue
Block a user