2007-03-08 04:03:40 +08:00
|
|
|
/**********************************************************************
 * File:        ratngs.h  (Formerly ratings.h)
 * Description: Definition of the WERD_CHOICE and BLOB_CHOICE classes.
 * Author:      Ray Smith
 * Created:     Thu Apr 23 11:40:38 BST 1992
 *
 * (C) Copyright 1992, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/
|
|
|
|
|
|
|
|
#ifndef RATNGS_H
|
|
|
|
#define RATNGS_H
|
|
|
|
|
2009-07-11 10:14:57 +08:00
|
|
|
#include <assert.h>
|
|
|
|
|
2007-07-18 09:15:07 +08:00
|
|
|
#include "clst.h"
|
2013-09-23 23:26:50 +08:00
|
|
|
#include "elst.h"
|
2015-05-13 08:24:34 +08:00
|
|
|
#include "fontinfo.h"
|
2009-07-11 10:14:57 +08:00
|
|
|
#include "genericvector.h"
|
2013-09-23 23:26:50 +08:00
|
|
|
#include "matrix.h"
|
2007-07-18 09:15:07 +08:00
|
|
|
#include "unichar.h"
|
2009-07-11 10:14:57 +08:00
|
|
|
#include "unicharset.h"
|
|
|
|
#include "werd.h"
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2013-09-23 23:26:50 +08:00
|
|
|
class MATRIX;
|
2014-01-25 10:28:51 +08:00
|
|
|
struct TBLOB;
|
|
|
|
struct TWERD;
|
2013-09-23 23:26:50 +08:00
|
|
|
|
|
|
|
// Enum to describe the source of a BLOB_CHOICE to make it possible to determine
// whether a blob has been classified by inspecting the BLOB_CHOICEs.
enum BlobChoiceClassifier {
  BCC_STATIC_CLASSIFIER,   // From the char_norm classifier.
  BCC_ADAPTED_CLASSIFIER,  // From the adaptive classifier.
  BCC_SPECKLE_CLASSIFIER,  // Backup for failed classification.
  BCC_AMBIG,               // Generated by ambiguity detection.
  BCC_FAKE,                // From some other process.
};
|
|
|
|
|
2009-07-11 10:14:57 +08:00
|
|
|
class BLOB_CHOICE: public ELIST_LINK
|
2007-03-08 04:03:40 +08:00
|
|
|
{
|
|
|
|
public:
|
2009-07-11 10:14:57 +08:00
|
|
|
BLOB_CHOICE() {
|
2014-08-12 07:23:06 +08:00
|
|
|
unichar_id_ = UNICHAR_SPACE;
|
2011-03-22 05:44:45 +08:00
|
|
|
fontinfo_id_ = -1;
|
|
|
|
fontinfo_id2_ = -1;
|
2014-08-12 07:23:06 +08:00
|
|
|
rating_ = 10.0;
|
|
|
|
certainty_ = -1.0;
|
2009-07-11 10:14:57 +08:00
|
|
|
script_id_ = -1;
|
2013-09-23 23:26:50 +08:00
|
|
|
xgap_before_ = 0;
|
|
|
|
xgap_after_ = 0;
|
|
|
|
min_xheight_ = 0.0f;
|
|
|
|
max_xheight_ = 0.0f;
|
|
|
|
yshift_ = 0.0f;
|
|
|
|
classifier_ = BCC_FAKE;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2011-03-22 05:44:45 +08:00
|
|
|
BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
|
2009-07-11 10:14:57 +08:00
|
|
|
float src_rating, // rating
|
|
|
|
float src_cert, // certainty
|
2012-02-02 11:06:39 +08:00
|
|
|
int script_id, // script
|
2013-09-23 23:26:50 +08:00
|
|
|
float min_xheight, // min xheight in image pixel units
|
|
|
|
float max_xheight, // max xheight allowed by this char
|
|
|
|
float yshift, // the larger of y shift (top or bottom)
|
|
|
|
BlobChoiceClassifier c); // adapted match or other
|
2009-07-11 10:14:57 +08:00
|
|
|
BLOB_CHOICE(const BLOB_CHOICE &other);
|
|
|
|
~BLOB_CHOICE() {}
|
2010-11-24 02:34:14 +08:00
|
|
|
|
2009-07-11 10:14:57 +08:00
|
|
|
UNICHAR_ID unichar_id() const {
|
|
|
|
return unichar_id_;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2009-07-11 10:14:57 +08:00
|
|
|
float rating() const {
|
|
|
|
return rating_;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2009-07-11 10:14:57 +08:00
|
|
|
float certainty() const {
|
|
|
|
return certainty_;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2011-03-22 05:44:45 +08:00
|
|
|
inT16 fontinfo_id() const {
|
|
|
|
return fontinfo_id_;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2011-03-22 05:44:45 +08:00
|
|
|
inT16 fontinfo_id2() const {
|
|
|
|
return fontinfo_id2_;
|
2010-11-24 02:34:14 +08:00
|
|
|
}
|
2015-05-13 08:24:34 +08:00
|
|
|
const GenericVector<tesseract::ScoredFont>& fonts() const {
|
|
|
|
return fonts_;
|
|
|
|
}
|
|
|
|
void set_fonts(const GenericVector<tesseract::ScoredFont>& fonts) {
|
|
|
|
fonts_ = fonts;
|
|
|
|
int score1 = 0, score2 = 0;
|
|
|
|
fontinfo_id_ = -1;
|
|
|
|
fontinfo_id2_ = -1;
|
|
|
|
for (int f = 0; f < fonts_.size(); ++f) {
|
|
|
|
if (fonts_[f].score > score1) {
|
|
|
|
score2 = score1;
|
|
|
|
fontinfo_id2_ = fontinfo_id_;
|
|
|
|
score1 = fonts_[f].score;
|
|
|
|
fontinfo_id_ = fonts_[f].fontinfo_id;
|
|
|
|
} else if (fonts_[f].score > score2) {
|
|
|
|
score2 = fonts_[f].score;
|
|
|
|
fontinfo_id2_ = fonts_[f].fontinfo_id;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2009-07-11 10:14:57 +08:00
|
|
|
int script_id() const {
|
|
|
|
return script_id_;
|
2008-04-22 08:41:37 +08:00
|
|
|
}
|
2013-09-23 23:26:50 +08:00
|
|
|
const MATRIX_COORD& matrix_cell() {
|
|
|
|
return matrix_cell_;
|
2010-11-24 02:34:14 +08:00
|
|
|
}
|
2012-02-02 11:06:39 +08:00
|
|
|
inT16 xgap_before() const {
|
2011-03-22 05:44:45 +08:00
|
|
|
return xgap_before_;
|
|
|
|
}
|
2012-02-02 11:06:39 +08:00
|
|
|
inT16 xgap_after() const {
|
2011-03-22 05:44:45 +08:00
|
|
|
return xgap_after_;
|
|
|
|
}
|
2013-09-23 23:26:50 +08:00
|
|
|
float min_xheight() const {
|
2012-02-02 11:06:39 +08:00
|
|
|
return min_xheight_;
|
|
|
|
}
|
2013-09-23 23:26:50 +08:00
|
|
|
float max_xheight() const {
|
2012-02-02 11:06:39 +08:00
|
|
|
return max_xheight_;
|
|
|
|
}
|
2013-09-23 23:26:50 +08:00
|
|
|
float yshift() const {
|
|
|
|
return yshift_;
|
|
|
|
}
|
|
|
|
BlobChoiceClassifier classifier() const {
|
|
|
|
return classifier_;
|
|
|
|
}
|
|
|
|
bool IsAdapted() const {
|
|
|
|
return classifier_ == BCC_ADAPTED_CLASSIFIER;
|
|
|
|
}
|
|
|
|
bool IsClassified() const {
|
|
|
|
return classifier_ == BCC_STATIC_CLASSIFIER ||
|
|
|
|
classifier_ == BCC_ADAPTED_CLASSIFIER ||
|
|
|
|
classifier_ == BCC_SPECKLE_CLASSIFIER;
|
2012-02-02 11:06:39 +08:00
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:14:57 +08:00
|
|
|
void set_unichar_id(UNICHAR_ID newunichar_id) {
|
|
|
|
unichar_id_ = newunichar_id;
|
2008-12-31 02:15:44 +08:00
|
|
|
}
|
2009-07-11 10:14:57 +08:00
|
|
|
void set_rating(float newrat) {
|
|
|
|
rating_ = newrat;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2009-07-11 10:14:57 +08:00
|
|
|
void set_certainty(float newrat) {
|
|
|
|
certainty_ = newrat;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2009-07-11 10:14:57 +08:00
|
|
|
void set_script(int newscript_id) {
|
|
|
|
script_id_ = newscript_id;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2013-09-23 23:26:50 +08:00
|
|
|
void set_matrix_cell(int col, int row) {
|
|
|
|
matrix_cell_.col = col;
|
|
|
|
matrix_cell_.row = row;
|
2010-11-24 02:34:14 +08:00
|
|
|
}
|
2011-03-22 05:44:45 +08:00
|
|
|
void set_xgap_before(inT16 gap) {
|
|
|
|
xgap_before_ = gap;
|
|
|
|
}
|
|
|
|
void set_xgap_after(inT16 gap) {
|
|
|
|
xgap_after_ = gap;
|
|
|
|
}
|
2013-09-23 23:26:50 +08:00
|
|
|
void set_classifier(BlobChoiceClassifier classifier) {
|
|
|
|
classifier_ = classifier;
|
2012-02-02 11:06:39 +08:00
|
|
|
}
|
2009-07-11 10:14:57 +08:00
|
|
|
static BLOB_CHOICE* deep_copy(const BLOB_CHOICE* src) {
|
|
|
|
BLOB_CHOICE* choice = new BLOB_CHOICE;
|
|
|
|
*choice = *src;
|
|
|
|
return choice;
|
2008-04-22 08:41:37 +08:00
|
|
|
}
|
2013-09-23 23:26:50 +08:00
|
|
|
// Returns true if *this and other agree on the baseline and x-height
|
|
|
|
// to within some tolerance based on a given estimate of the x-height.
|
|
|
|
bool PosAndSizeAgree(const BLOB_CHOICE& other, float x_height,
|
|
|
|
bool debug) const;
|
|
|
|
|
|
|
|
void print(const UNICHARSET *unicharset) const {
|
|
|
|
tprintf("r%.2f c%.2f x[%g,%g]: %d %s",
|
|
|
|
rating_, certainty_,
|
|
|
|
min_xheight_, max_xheight_, unichar_id_,
|
2010-11-24 02:34:14 +08:00
|
|
|
(unicharset == NULL) ? "" :
|
|
|
|
unicharset->debug_str(unichar_id_).string());
|
|
|
|
}
|
2013-09-23 23:26:50 +08:00
|
|
|
void print_full() const {
|
|
|
|
print(NULL);
|
|
|
|
tprintf(" script=%d, font1=%d, font2=%d, yshift=%g, classifier=%d\n",
|
|
|
|
script_id_, fontinfo_id_, fontinfo_id2_, yshift_, classifier_);
|
|
|
|
}
|
|
|
|
// Sort function for sorting BLOB_CHOICEs in increasing order of rating.
|
|
|
|
static int SortByRating(const void *p1, const void *p2) {
|
|
|
|
const BLOB_CHOICE *bc1 =
|
|
|
|
*reinterpret_cast<const BLOB_CHOICE * const *>(p1);
|
|
|
|
const BLOB_CHOICE *bc2 =
|
|
|
|
*reinterpret_cast<const BLOB_CHOICE * const *>(p2);
|
|
|
|
return (bc1->rating_ < bc2->rating_) ? -1 : 1;
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:14:57 +08:00
|
|
|
private:
|
|
|
|
UNICHAR_ID unichar_id_; // unichar id
|
2015-05-13 08:24:34 +08:00
|
|
|
// Fonts and scores. Allowed to be empty.
|
|
|
|
GenericVector<tesseract::ScoredFont> fonts_;
|
2011-03-22 05:44:45 +08:00
|
|
|
inT16 fontinfo_id_; // char font information
|
|
|
|
inT16 fontinfo_id2_; // 2nd choice font information
|
2012-09-21 23:31:20 +08:00
|
|
|
// Rating is the classifier distance weighted by the length of the outline
|
|
|
|
// in the blob. In terms of probability, classifier distance is -klog p such
|
|
|
|
// that the resulting distance is in the range [0, 1] and then
|
|
|
|
// rating = w (-k log p) where w is the weight for the length of the outline.
|
|
|
|
// Sums of ratings may be compared meaningfully for words of different
|
|
|
|
// segmentation.
|
2011-03-22 05:44:45 +08:00
|
|
|
float rating_; // size related
|
2012-09-21 23:31:20 +08:00
|
|
|
// Certainty is a number in [-20, 0] indicating the classifier certainty
|
|
|
|
// of the choice. In terms of probability, certainty = 20 (k log p) where
|
|
|
|
// k is defined as above to normalize -klog p to the range [0, 1].
|
2011-03-22 05:44:45 +08:00
|
|
|
float certainty_; // absolute
|
2009-07-11 10:14:57 +08:00
|
|
|
int script_id_;
|
2013-09-23 23:26:50 +08:00
|
|
|
// Holds the position of this choice in the ratings matrix.
|
|
|
|
// Used to location position in the matrix during path backtracking.
|
|
|
|
MATRIX_COORD matrix_cell_;
|
2011-03-22 05:44:45 +08:00
|
|
|
inT16 xgap_before_;
|
|
|
|
inT16 xgap_after_;
|
2012-02-02 11:06:39 +08:00
|
|
|
// X-height range (in image pixels) that this classification supports.
|
2013-09-23 23:26:50 +08:00
|
|
|
float min_xheight_;
|
|
|
|
float max_xheight_;
|
|
|
|
// yshift_ - The vertical distance (in image pixels) the character is
|
|
|
|
// shifted (up or down) from an acceptable y position.
|
|
|
|
float yshift_;
|
|
|
|
BlobChoiceClassifier classifier_; // What generated *this.
|
2007-03-08 04:03:40 +08:00
|
|
|
};
|
|
|
|
|
2009-07-11 10:14:57 +08:00
|
|
|
// Make BLOB_CHOICE listable.
|
2013-09-23 23:26:50 +08:00
|
|
|
ELISTIZEH(BLOB_CHOICE)
|
|
|
|
|
|
|
|
// Return the BLOB_CHOICE in bc_list matching a given unichar_id,
|
|
|
|
// or NULL if there is no match.
|
|
|
|
BLOB_CHOICE *FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:14:57 +08:00
|
|
|
// Permuter codes used in WERD_CHOICEs.
enum PermuterType {
  NO_PERM,            // 0
  PUNC_PERM,          // 1
  TOP_CHOICE_PERM,    // 2
  LOWER_CASE_PERM,    // 3
  UPPER_CASE_PERM,    // 4
  NGRAM_PERM,         // 5
  NUMBER_PERM,        // 6
  USER_PATTERN_PERM,  // 7
  SYSTEM_DAWG_PERM,   // 8
  DOC_DAWG_PERM,      // 9
  USER_DAWG_PERM,     // 10
  FREQ_DAWG_PERM,     // 11
  COMPOUND_PERM,      // 12

  NUM_PERMUTER_TYPES  // Number of permuter codes above; keep last.
};
|
|
|
|
|
2013-09-23 23:26:50 +08:00
|
|
|
namespace tesseract {
// ScriptPos tells whether a character is normal, subscript, superscript
// or a drop cap.
enum ScriptPos {
  SP_NORMAL,
  SP_SUBSCRIPT,
  SP_SUPERSCRIPT,
  SP_DROPCAP
};

// Returns a string naming the given script position (defined elsewhere).
const char *ScriptPosToString(tesseract::ScriptPos script_pos);

}  // namespace tesseract.
|
|
|
|
|
2017-03-11 06:17:30 +08:00
|
|
|
class TESS_API WERD_CHOICE : public ELIST_LINK {
|
2009-07-11 10:14:57 +08:00
|
|
|
public:
|
2010-11-24 02:34:14 +08:00
|
|
|
static const float kBadRating;
|
2013-09-23 23:26:50 +08:00
|
|
|
static const char *permuter_name(uinT8 permuter);
|
2010-11-24 02:34:14 +08:00
|
|
|
|
2012-02-02 11:06:39 +08:00
|
|
|
WERD_CHOICE(const UNICHARSET *unicharset)
|
|
|
|
: unicharset_(unicharset) { this->init(8); }
|
|
|
|
WERD_CHOICE(const UNICHARSET *unicharset, int reserved)
|
|
|
|
: unicharset_(unicharset) { this->init(reserved); }
|
2009-07-11 10:14:57 +08:00
|
|
|
WERD_CHOICE(const char *src_string,
|
|
|
|
const char *src_lengths,
|
|
|
|
float src_rating,
|
|
|
|
float src_certainty,
|
|
|
|
uinT8 src_permuter,
|
2012-02-02 11:06:39 +08:00
|
|
|
const UNICHARSET &unicharset)
|
|
|
|
: unicharset_(&unicharset) {
|
2009-07-11 10:14:57 +08:00
|
|
|
this->init(src_string, src_lengths, src_rating,
|
2012-02-02 11:06:39 +08:00
|
|
|
src_certainty, src_permuter);
|
2009-07-11 10:14:57 +08:00
|
|
|
}
|
2012-02-02 11:06:39 +08:00
|
|
|
WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset);
|
2016-11-25 22:14:46 +08:00
|
|
|
WERD_CHOICE(const WERD_CHOICE &word)
|
|
|
|
: ELIST_LINK(word), unicharset_(word.unicharset_) {
|
2009-07-11 10:14:57 +08:00
|
|
|
this->init(word.length());
|
|
|
|
this->operator=(word);
|
|
|
|
}
|
|
|
|
~WERD_CHOICE();
|
|
|
|
|
2012-02-02 11:06:39 +08:00
|
|
|
const UNICHARSET *unicharset() const {
|
|
|
|
return unicharset_;
|
|
|
|
}
|
2009-07-11 10:14:57 +08:00
|
|
|
inline int length() const {
|
|
|
|
return length_;
|
|
|
|
}
|
2013-09-23 23:26:50 +08:00
|
|
|
float adjust_factor() const {
|
|
|
|
return adjust_factor_;
|
|
|
|
}
|
|
|
|
void set_adjust_factor(float factor) {
|
|
|
|
adjust_factor_ = factor;
|
|
|
|
}
|
2009-07-11 10:14:57 +08:00
|
|
|
inline const UNICHAR_ID *unichar_ids() const {
|
|
|
|
return unichar_ids_;
|
|
|
|
}
|
2015-11-05 05:34:22 +08:00
|
|
|
inline UNICHAR_ID unichar_id(int index) const {
|
2009-07-11 10:14:57 +08:00
|
|
|
assert(index < length_);
|
|
|
|
return unichar_ids_[index];
|
|
|
|
}
|
2013-09-23 23:26:50 +08:00
|
|
|
inline int state(int index) const {
|
|
|
|
return state_[index];
|
2009-07-11 10:14:57 +08:00
|
|
|
}
|
2013-09-23 23:26:50 +08:00
|
|
|
tesseract::ScriptPos BlobPosition(int index) const {
|
|
|
|
if (index < 0 || index >= length_)
|
|
|
|
return tesseract::SP_NORMAL;
|
|
|
|
return script_pos_[index];
|
2009-07-11 10:14:57 +08:00
|
|
|
}
|
|
|
|
inline float rating() const {
|
|
|
|
return rating_;
|
|
|
|
}
|
|
|
|
inline float certainty() const {
|
|
|
|
return certainty_;
|
|
|
|
}
|
2013-09-23 23:26:50 +08:00
|
|
|
inline float certainty(int index) const {
|
|
|
|
return certainties_[index];
|
|
|
|
}
|
|
|
|
inline float min_x_height() const {
|
|
|
|
return min_x_height_;
|
|
|
|
}
|
|
|
|
inline float max_x_height() const {
|
|
|
|
return max_x_height_;
|
|
|
|
}
|
|
|
|
inline void set_x_heights(float min_height, float max_height) {
|
|
|
|
min_x_height_ = min_height;
|
|
|
|
max_x_height_ = max_height;
|
|
|
|
}
|
2009-07-11 10:14:57 +08:00
|
|
|
inline uinT8 permuter() const {
|
|
|
|
return permuter_;
|
|
|
|
}
|
2012-02-02 11:06:39 +08:00
|
|
|
const char *permuter_name() const;
|
2013-09-23 23:26:50 +08:00
|
|
|
// Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
|
|
|
|
// taken from the appropriate cell in the ratings MATRIX.
|
|
|
|
// Borrowed pointer, so do not delete.
|
|
|
|
BLOB_CHOICE_LIST* blob_choices(int index, MATRIX* ratings) const;
|
|
|
|
|
|
|
|
// Returns the MATRIX_COORD corresponding to the location in the ratings
|
|
|
|
// MATRIX for the given index into the word.
|
|
|
|
MATRIX_COORD MatrixCoord(int index) const;
|
|
|
|
|
2009-07-11 10:14:57 +08:00
|
|
|
inline void set_unichar_id(UNICHAR_ID unichar_id, int index) {
|
|
|
|
assert(index < length_);
|
|
|
|
unichar_ids_[index] = unichar_id;
|
|
|
|
}
|
2013-09-23 23:26:50 +08:00
|
|
|
bool dangerous_ambig_found() const {
|
|
|
|
return dangerous_ambig_found_;
|
|
|
|
}
|
|
|
|
void set_dangerous_ambig_found_(bool value) {
|
|
|
|
dangerous_ambig_found_ = value;
|
2010-11-24 02:34:14 +08:00
|
|
|
}
|
2009-07-11 10:14:57 +08:00
|
|
|
inline void set_rating(float new_val) {
|
|
|
|
rating_ = new_val;
|
|
|
|
}
|
|
|
|
inline void set_certainty(float new_val) {
|
|
|
|
certainty_ = new_val;
|
|
|
|
}
|
|
|
|
inline void set_permuter(uinT8 perm) {
|
|
|
|
permuter_ = perm;
|
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
// Note: this function should only be used if all the fields
|
|
|
|
// are populated manually with set_* functions (rather than
|
|
|
|
// (copy)constructors and append_* functions).
|
|
|
|
inline void set_length(int len) {
|
|
|
|
ASSERT_HOST(reserved_ >= len);
|
|
|
|
length_ = len;
|
|
|
|
}
|
2009-07-11 10:14:57 +08:00
|
|
|
|
2010-07-27 21:23:23 +08:00
|
|
|
/// Make more space in unichar_id_ and fragment_lengths_ arrays.
|
2009-07-11 10:14:57 +08:00
|
|
|
inline void double_the_size() {
|
2012-02-02 11:06:39 +08:00
|
|
|
if (reserved_ > 0) {
|
|
|
|
unichar_ids_ = GenericVector<UNICHAR_ID>::double_the_size_memcpy(
|
|
|
|
reserved_, unichar_ids_);
|
2013-09-23 23:26:50 +08:00
|
|
|
script_pos_ = GenericVector<tesseract::ScriptPos>::double_the_size_memcpy(
|
|
|
|
reserved_, script_pos_);
|
|
|
|
state_ = GenericVector<int>::double_the_size_memcpy(
|
|
|
|
reserved_, state_);
|
|
|
|
certainties_ = GenericVector<float>::double_the_size_memcpy(
|
|
|
|
reserved_, certainties_);
|
2012-02-02 11:06:39 +08:00
|
|
|
reserved_ *= 2;
|
|
|
|
} else {
|
|
|
|
unichar_ids_ = new UNICHAR_ID[1];
|
2013-09-23 23:26:50 +08:00
|
|
|
script_pos_ = new tesseract::ScriptPos[1];
|
|
|
|
state_ = new int[1];
|
|
|
|
certainties_ = new float[1];
|
2012-02-02 11:06:39 +08:00
|
|
|
reserved_ = 1;
|
|
|
|
}
|
2009-07-11 10:14:57 +08:00
|
|
|
}
|
|
|
|
|
2011-03-22 05:44:45 +08:00
|
|
|
/// Initializes WERD_CHOICE - reserves length slots in unichar_ids_ and
|
2010-07-27 21:23:23 +08:00
|
|
|
/// fragment_length_ arrays. Sets other values to default (blank) values.
|
2009-07-11 10:14:57 +08:00
|
|
|
inline void init(int reserved) {
|
|
|
|
reserved_ = reserved;
|
2012-02-02 11:06:39 +08:00
|
|
|
if (reserved > 0) {
|
|
|
|
unichar_ids_ = new UNICHAR_ID[reserved];
|
2013-09-23 23:26:50 +08:00
|
|
|
script_pos_ = new tesseract::ScriptPos[reserved];
|
|
|
|
state_ = new int[reserved];
|
|
|
|
certainties_ = new float[reserved];
|
2012-02-02 11:06:39 +08:00
|
|
|
} else {
|
|
|
|
unichar_ids_ = NULL;
|
2013-09-23 23:26:50 +08:00
|
|
|
script_pos_ = NULL;
|
|
|
|
state_ = NULL;
|
|
|
|
certainties_ = NULL;
|
2012-02-02 11:06:39 +08:00
|
|
|
}
|
2009-07-11 10:14:57 +08:00
|
|
|
length_ = 0;
|
2013-09-23 23:26:50 +08:00
|
|
|
adjust_factor_ = 1.0f;
|
2009-07-11 10:14:57 +08:00
|
|
|
rating_ = 0.0;
|
|
|
|
certainty_ = MAX_FLOAT32;
|
2013-09-23 23:26:50 +08:00
|
|
|
min_x_height_ = 0.0f;
|
|
|
|
max_x_height_ = MAX_FLOAT32;
|
2009-07-11 10:14:57 +08:00
|
|
|
permuter_ = NO_PERM;
|
2012-02-02 11:06:39 +08:00
|
|
|
unichars_in_script_order_ = false; // Tesseract is strict left-to-right.
|
2013-09-23 23:26:50 +08:00
|
|
|
dangerous_ambig_found_ = false;
|
2009-07-11 10:14:57 +08:00
|
|
|
}
|
|
|
|
|
2010-07-27 21:23:23 +08:00
|
|
|
/// Helper function to build a WERD_CHOICE from the given string,
|
|
|
|
/// fragment lengths, rating, certainty and permuter.
|
|
|
|
/// The function assumes that src_string is not NULL.
|
|
|
|
/// src_lengths argument could be NULL, in which case the unichars
|
|
|
|
/// in src_string are assumed to all be of length 1.
|
2009-07-11 10:14:57 +08:00
|
|
|
void init(const char *src_string, const char *src_lengths,
|
|
|
|
float src_rating, float src_certainty,
|
2012-02-02 11:06:39 +08:00
|
|
|
uinT8 src_permuter);
|
2009-07-11 10:14:57 +08:00
|
|
|
|
2010-07-27 21:23:23 +08:00
|
|
|
/// Set the fields in this choice to be default (bad) values.
|
2009-07-11 10:14:57 +08:00
|
|
|
inline void make_bad() {
|
|
|
|
length_ = 0;
|
2010-11-24 02:34:14 +08:00
|
|
|
rating_ = kBadRating;
|
2009-07-11 10:14:57 +08:00
|
|
|
certainty_ = -MAX_FLOAT32;
|
|
|
|
}
|
|
|
|
|
2010-07-27 21:23:23 +08:00
|
|
|
/// This function assumes that there is enough space reserved
|
|
|
|
/// in the WERD_CHOICE for adding another unichar.
|
|
|
|
/// This is an efficient alternative to append_unichar_id().
|
2009-07-11 10:14:57 +08:00
|
|
|
inline void append_unichar_id_space_allocated(
|
2013-09-23 23:26:50 +08:00
|
|
|
UNICHAR_ID unichar_id, int blob_count,
|
2009-07-11 10:14:57 +08:00
|
|
|
float rating, float certainty) {
|
|
|
|
assert(reserved_ > length_);
|
|
|
|
length_++;
|
2013-09-23 23:26:50 +08:00
|
|
|
this->set_unichar_id(unichar_id, blob_count,
|
2009-07-11 10:14:57 +08:00
|
|
|
rating, certainty, length_-1);
|
|
|
|
}
|
|
|
|
|
2013-09-23 23:26:50 +08:00
|
|
|
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count,
|
2009-07-11 10:14:57 +08:00
|
|
|
float rating, float certainty);
|
|
|
|
|
2013-09-23 23:26:50 +08:00
|
|
|
inline void set_unichar_id(UNICHAR_ID unichar_id, int blob_count,
|
2009-07-11 10:14:57 +08:00
|
|
|
float rating, float certainty, int index) {
|
|
|
|
assert(index < length_);
|
|
|
|
unichar_ids_[index] = unichar_id;
|
2013-09-23 23:26:50 +08:00
|
|
|
state_[index] = blob_count;
|
|
|
|
certainties_[index] = certainty;
|
|
|
|
script_pos_[index] = tesseract::SP_NORMAL;
|
2009-07-11 10:14:57 +08:00
|
|
|
rating_ += rating;
|
|
|
|
if (certainty < certainty_) {
|
|
|
|
certainty_ = certainty;
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2009-07-11 10:14:57 +08:00
|
|
|
}
|
2013-09-23 23:26:50 +08:00
|
|
|
// Sets the entries for the given index from the BLOB_CHOICE, assuming
|
|
|
|
// unit fragment lengths, but setting the state for this index to blob_count.
|
|
|
|
void set_blob_choice(int index, int blob_count,
|
|
|
|
const BLOB_CHOICE* blob_choice);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2009-07-11 10:14:57 +08:00
|
|
|
bool contains_unichar_id(UNICHAR_ID unichar_id) const;
|
|
|
|
void remove_unichar_ids(int index, int num);
|
|
|
|
inline void remove_last_unichar_id() { --length_; }
|
2012-02-02 11:06:39 +08:00
|
|
|
inline void remove_unichar_id(int index) {
|
|
|
|
this->remove_unichar_ids(index, 1);
|
|
|
|
}
|
|
|
|
bool has_rtl_unichar_id() const;
|
|
|
|
void reverse_and_mirror_unichar_ids();
|
|
|
|
|
|
|
|
// Returns the half-open interval of unichar_id indices [start, end) which
|
|
|
|
// enclose the core portion of this word -- the part after stripping
|
|
|
|
// punctuation from the left and right.
|
|
|
|
void punct_stripped(int *start_core, int *end_core) const;
|
|
|
|
|
2013-09-23 23:26:50 +08:00
|
|
|
// Returns the indices [start, end) containing the core of the word, stripped
|
|
|
|
// of any superscript digits on either side. (i.e., the non-footnote part
|
|
|
|
// of the word). There is no guarantee that the output range is non-empty.
|
|
|
|
void GetNonSuperscriptSpan(int *start, int *end) const;
|
|
|
|
|
2012-02-02 11:06:39 +08:00
|
|
|
// Return a copy of this WERD_CHOICE with the choices [start, end).
|
|
|
|
// The result is useful only for checking against a dictionary.
|
|
|
|
WERD_CHOICE shallow_copy(int start, int end) const;
|
|
|
|
|
|
|
|
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const;
|
|
|
|
const STRING debug_string() const {
|
2009-07-11 10:14:57 +08:00
|
|
|
STRING word_str;
|
|
|
|
for (int i = 0; i < length_; ++i) {
|
2012-02-02 11:06:39 +08:00
|
|
|
word_str += unicharset_->debug_str(unichar_ids_[i]);
|
2009-07-11 10:14:57 +08:00
|
|
|
word_str += " ";
|
2008-02-01 08:36:18 +08:00
|
|
|
}
|
2009-07-11 10:14:57 +08:00
|
|
|
return word_str;
|
|
|
|
}
|
2012-02-02 11:06:39 +08:00
|
|
|
|
|
|
|
// Call this to override the default (strict left to right graphemes)
|
|
|
|
// with the fact that some engine produces a "reading order" set of
|
|
|
|
// Graphemes for each word.
|
|
|
|
bool set_unichars_in_script_order(bool in_script_order) {
|
|
|
|
return unichars_in_script_order_ = in_script_order;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool unichars_in_script_order() const {
|
|
|
|
return unichars_in_script_order_;
|
|
|
|
}
|
|
|
|
|
2012-02-15 09:37:00 +08:00
|
|
|
// Returns a UTF-8 string equivalent to the current choice
|
|
|
|
// of UNICHAR IDs.
|
2009-07-11 10:14:57 +08:00
|
|
|
const STRING &unichar_string() const {
|
2012-02-15 09:37:00 +08:00
|
|
|
this->string_and_lengths(&unichar_string_, &unichar_lengths_);
|
2009-07-11 10:14:57 +08:00
|
|
|
return unichar_string_;
|
|
|
|
}
|
2012-02-02 11:06:39 +08:00
|
|
|
|
2012-02-15 09:37:00 +08:00
|
|
|
// Returns the lengths, one byte each, representing the number of bytes
|
|
|
|
// required in the unichar_string for each UNICHAR_ID.
|
2009-07-11 10:14:57 +08:00
|
|
|
const STRING &unichar_lengths() const {
|
2012-02-15 09:37:00 +08:00
|
|
|
this->string_and_lengths(&unichar_string_, &unichar_lengths_);
|
2009-07-11 10:14:57 +08:00
|
|
|
return unichar_lengths_;
|
|
|
|
}
|
2013-09-23 23:26:50 +08:00
|
|
|
|
|
|
|
// Sets up the script_pos_ member using the blobs_list to get the bln
|
|
|
|
// bounding boxes, *this to get the unichars, and this->unicharset
|
|
|
|
// to get the target positions. If small_caps is true, sub/super are not
|
|
|
|
// considered, but dropcaps are.
|
|
|
|
// NOTE: blobs_list should be the chopped_word blobs. (Fully segemented.)
|
|
|
|
void SetScriptPositions(bool small_caps, TWERD* word);
|
|
|
|
// Sets the script_pos_ member from some source positions with a given length.
|
|
|
|
void SetScriptPositions(const tesseract::ScriptPos* positions, int length);
|
|
|
|
// Sets all the script_pos_ positions to the given position.
|
|
|
|
void SetAllScriptPositions(tesseract::ScriptPos position);
|
|
|
|
|
|
|
|
static tesseract::ScriptPos ScriptPositionOf(bool print_debug,
|
|
|
|
const UNICHARSET& unicharset,
|
|
|
|
const TBOX& blob_box,
|
|
|
|
UNICHAR_ID unichar_id);
|
|
|
|
|
|
|
|
// Returns the "dominant" script ID for the word. By "dominant", the script
|
|
|
|
// must account for at least half the characters. Otherwise, it returns 0.
|
|
|
|
// Note that for Japanese, Hiragana and Katakana are simply treated as Han.
|
|
|
|
int GetTopScriptID() const;
|
|
|
|
|
|
|
|
// Fixes the state_ for a chop at the given blob_posiiton.
|
|
|
|
void UpdateStateForSplit(int blob_position);
|
|
|
|
|
|
|
|
// Returns the sum of all the state elements, being the total number of blobs.
|
|
|
|
int TotalOfStates() const;
|
|
|
|
|
|
|
|
void print() const { this->print(""); }
|
|
|
|
void print(const char *msg) const;
|
|
|
|
// Prints the segmentation state with an introductory message.
|
|
|
|
void print_state(const char *msg) const;
|
|
|
|
|
|
|
|
// Displays the segmentation state of *this (if not the same as the last
|
|
|
|
// one displayed) and waits for a click in the window.
|
|
|
|
void DisplaySegmentation(TWERD* word);
|
2008-02-01 08:36:18 +08:00
|
|
|
|
2009-07-11 10:14:57 +08:00
|
|
|
WERD_CHOICE& operator+= ( // concatanate
|
|
|
|
const WERD_CHOICE & second);// second on first
|
2008-02-01 08:36:18 +08:00
|
|
|
|
2009-07-11 10:14:57 +08:00
|
|
|
WERD_CHOICE& operator= (const WERD_CHOICE& source);
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2010-11-30 08:56:39 +08:00
|
|
|
private:
|
2012-02-02 11:06:39 +08:00
|
|
|
const UNICHARSET *unicharset_;
|
2013-09-23 23:26:50 +08:00
|
|
|
// TODO(rays) Perhaps replace the multiple arrays with an array of structs?
|
|
|
|
// unichar_ids_ is an array of classifier "results" that make up a word.
|
|
|
|
// For each unichar_ids_[i], script_pos_[i] has the sub/super/normal position
|
|
|
|
// of each unichar_id.
|
|
|
|
// state_[i] indicates the number of blobs in WERD_RES::chopped_word that
|
|
|
|
// were put together to make the classification results in the ith position
|
|
|
|
// in unichar_ids_, and certainties_[i] is the certainty of the choice that
|
|
|
|
// was used in this word.
|
|
|
|
// == Change from before ==
|
|
|
|
// Previously there was fragment_lengths_ that allowed a word to be
|
|
|
|
// artificially composed of multiple fragment results. Since the new
|
|
|
|
// segmentation search doesn't do fragments, treatment of fragments has
|
|
|
|
// been moved to a lower level, augmenting the ratings matrix with the
|
|
|
|
// combined fragments, and allowing the language-model/segmentation-search
|
|
|
|
// to deal with only the combined unichar_ids.
|
2009-07-11 10:14:57 +08:00
|
|
|
UNICHAR_ID *unichar_ids_; // unichar ids that represent the text of the word
|
2013-09-23 23:26:50 +08:00
|
|
|
tesseract::ScriptPos* script_pos_; // Normal/Sub/Superscript of each unichar.
|
|
|
|
int* state_; // Number of blobs in each unichar.
|
|
|
|
float* certainties_; // Certainty of each unichar.
|
2009-07-11 10:14:57 +08:00
|
|
|
int reserved_; // size of the above arrays
|
|
|
|
int length_; // word length
|
2013-09-23 23:26:50 +08:00
|
|
|
// Factor that was used to adjust the rating.
|
|
|
|
float adjust_factor_;
|
2012-09-21 23:31:20 +08:00
|
|
|
// Rating is the sum of the ratings of the individual blobs in the word.
|
2009-07-11 10:14:57 +08:00
|
|
|
float rating_; // size related
|
2012-09-21 23:31:20 +08:00
|
|
|
// certainty is the min (worst) certainty of the individual blobs in the word.
|
2009-07-11 10:14:57 +08:00
|
|
|
float certainty_; // absolute
|
2013-09-23 23:26:50 +08:00
|
|
|
// xheight computed from the result, or 0 if inconsistent.
|
|
|
|
float min_x_height_;
|
|
|
|
float max_x_height_;
|
2009-07-11 10:14:57 +08:00
|
|
|
uinT8 permuter_; // permuter code
|
2008-02-01 08:36:18 +08:00
|
|
|
|
2013-09-23 23:26:50 +08:00
|
|
|
// Normally, the ratings_ matrix represents the recognition results in order
|
2012-02-02 11:06:39 +08:00
|
|
|
// from left-to-right. However, some engines (say Cube) may return
|
|
|
|
// recognition results in the order of the script's major reading direction
|
|
|
|
// (for Arabic, that is right-to-left).
|
|
|
|
bool unichars_in_script_order_;
|
2013-09-23 23:26:50 +08:00
|
|
|
// True if NoDangerousAmbig found an ambiguity.
|
|
|
|
bool dangerous_ambig_found_;
|
2012-02-02 11:06:39 +08:00
|
|
|
|
2012-02-15 09:37:00 +08:00
|
|
|
// The following variables are populated and passed by reference any
|
|
|
|
// time unichar_string() or unichar_lengths() are called.
|
|
|
|
mutable STRING unichar_string_;
|
|
|
|
mutable STRING unichar_lengths_;
|
2007-03-08 04:03:40 +08:00
|
|
|
};
|
|
|
|
|
2009-07-11 10:14:57 +08:00
|
|
|
// Make WERD_CHOICE listable.
|
2013-09-23 23:26:50 +08:00
|
|
|
ELISTIZEH(WERD_CHOICE)
|
2009-07-11 10:14:57 +08:00
|
|
|
typedef GenericVector<BLOB_CHOICE_LIST *> BLOB_CHOICE_LIST_VECTOR;
|
|
|
|
|
2012-02-02 11:06:39 +08:00
|
|
|
// Utilities for comparing WERD_CHOICEs
|
|
|
|
|
|
|
|
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1,
|
|
|
|
const WERD_CHOICE &word2);
|
|
|
|
|
|
|
|
// Utilities for debug printing.
|
2009-07-11 10:14:57 +08:00
|
|
|
void print_ratings_list(
|
|
|
|
const char *msg, // intro message
|
|
|
|
BLOB_CHOICE_LIST *ratings, // list of results
|
|
|
|
const UNICHARSET ¤t_unicharset // unicharset that can be used
|
|
|
|
// for id-to-unichar conversion
|
|
|
|
);
|
|
|
|
|
2007-03-08 04:03:40 +08:00
|
|
|
#endif
|