mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-27 20:59:36 +08:00
Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Refactored top-level word recognition module, Added simultaneous multi-language capability.
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@654 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
e33ae59f4d
commit
73adf693d5
@ -245,13 +245,11 @@ WordAltList *BeamSearch::CreateWordAltList(SearchObject *srch_obj) {
|
||||
int cost = 0;
|
||||
// char bigram cost
|
||||
int bigram_cost = !bigrams ? 0 :
|
||||
bigrams->Cost(ch_buff, cntxt_->CharacterSet(),
|
||||
&cntxt_->TesseractObject()->unicharset);
|
||||
bigrams->Cost(ch_buff, cntxt_->CharacterSet());
|
||||
// word unigram cost
|
||||
int unigram_cost = !word_unigrams ? 0 :
|
||||
word_unigrams->Cost(ch_buff, cntxt_->LangMod(),
|
||||
cntxt_->CharacterSet(),
|
||||
&cntxt_->TesseractObject()->unicharset);
|
||||
cntxt_->CharacterSet());
|
||||
// overall cost
|
||||
cost = static_cast<int>(
|
||||
(size_cost * cntxt_->Params()->SizeWgt()) +
|
||||
|
@ -25,7 +25,6 @@
|
||||
#include "char_bigrams.h"
|
||||
#include "cube_utils.h"
|
||||
#include "ndminx.h"
|
||||
#include "unicharset.h"
|
||||
#include "cube_const.h"
|
||||
|
||||
namespace tesseract {
|
||||
@ -167,21 +166,20 @@ int CharBigrams::PairCost(char_32 ch1, char_32 ch2) const {
|
||||
return bigram_table_.char_bigram[ch1].bigram[ch2].cost;
|
||||
}
|
||||
|
||||
int CharBigrams::Cost(const char_32 *char_32_ptr, CharSet *char_set,
|
||||
UNICHARSET *unicharset) const {
|
||||
int CharBigrams::Cost(const char_32 *char_32_ptr, CharSet *char_set) const {
|
||||
if (!char_32_ptr || char_32_ptr[0] == 0) {
|
||||
return bigram_table_.worst_cost;
|
||||
}
|
||||
int cost = MeanCostWithSpaces(char_32_ptr);
|
||||
if (CubeUtils::StrLen(char_32_ptr) >= kMinLengthCaseInvariant &&
|
||||
CubeUtils::IsCaseInvariant(char_32_ptr, char_set, unicharset)) {
|
||||
char_32 *lower_32 = CubeUtils::ToLower(char_32_ptr, char_set, unicharset);
|
||||
CubeUtils::IsCaseInvariant(char_32_ptr, char_set)) {
|
||||
char_32 *lower_32 = CubeUtils::ToLower(char_32_ptr, char_set);
|
||||
if (lower_32 && lower_32[0] != 0) {
|
||||
int cost_lower = MeanCostWithSpaces(lower_32);
|
||||
cost = MIN(cost, cost_lower);
|
||||
delete [] lower_32;
|
||||
}
|
||||
char_32 *upper_32 = CubeUtils::ToUpper(char_32_ptr, char_set, unicharset);
|
||||
char_32 *upper_32 = CubeUtils::ToUpper(char_32_ptr, char_set);
|
||||
if (upper_32 && upper_32[0] != 0) {
|
||||
int cost_upper = MeanCostWithSpaces(upper_32);
|
||||
cost = MIN(cost, cost_upper);
|
||||
|
@ -61,13 +61,12 @@ class CharBigrams {
|
||||
static CharBigrams *Create(const string &data_file_path,
|
||||
const string &lang);
|
||||
// Top-level function to return the mean character bigram cost of a
|
||||
// sequence of characters. If char_set and unicharset are not NULL
|
||||
// and cube and tesseract share the same unicharset, use
|
||||
// sequence of characters. If char_set is not NULL, use
|
||||
// tesseract functions to return a case-invariant cost.
|
||||
// This avoids unnecessarily penalizing all-one-case words or
|
||||
// capitalized words (first-letter upper-case and remaining letters
|
||||
// lower-case).
|
||||
int Cost(const char_32 *str, CharSet *char_set, UNICHARSET *unicharset) const;
|
||||
int Cost(const char_32 *str, CharSet *char_set) const;
|
||||
|
||||
protected:
|
||||
// Returns the character bigram cost of two characters.
|
||||
|
@ -78,10 +78,16 @@ CharSet *CharSet::Create(TessdataManager *tessdata_manager,
|
||||
// map its unichars to tesseract's; if only one unicharset exists,
|
||||
// just load it.
|
||||
bool loaded;
|
||||
if (cube_unicharset_exists)
|
||||
loaded = char_set->LoadSupportedCharList(charset_fp, tess_unicharset);
|
||||
else
|
||||
if (cube_unicharset_exists) {
|
||||
char_set->cube_unicharset_.load_from_file(charset_fp);
|
||||
loaded = tessdata_manager->SeekToStart(TESSDATA_CUBE_UNICHARSET);
|
||||
loaded = loaded && char_set->LoadSupportedCharList(
|
||||
tessdata_manager->GetDataFilePtr(), tess_unicharset);
|
||||
char_set->unicharset_ = &char_set->cube_unicharset_;
|
||||
} else {
|
||||
loaded = char_set->LoadSupportedCharList(charset_fp, NULL);
|
||||
char_set->unicharset_ = tess_unicharset;
|
||||
}
|
||||
if (!loaded) {
|
||||
delete char_set;
|
||||
return false;
|
||||
|
@ -116,6 +116,10 @@ class CharSet {
|
||||
static CharSet *Create(TessdataManager *tessdata_manager,
|
||||
UNICHARSET *tess_unicharset);
|
||||
|
||||
// Returns the UNICHARSET that cube uses internally for recognition, i.e. the
|
||||
// stored unicharset_ pointer; ClassID() returns unichar_ids in this unicharset.
|
||||
UNICHARSET *InternalUnicharset() { return unicharset_; }
|
||||
|
||||
private:
|
||||
// Hash table configuration params. Determined empirically on
|
||||
// the supported languages so far (Eng, Ara, Hin). Might need to be
|
||||
@ -155,6 +159,13 @@ class CharSet {
|
||||
string_32 **class_strings_;
|
||||
// map from class id to secondary (tesseract's) unicharset's ids
|
||||
int *unicharset_map_;
|
||||
// A unicharset which is filled in with a Tesseract-style UNICHARSET for
|
||||
// cube's data if our unicharset is different from tesseract's.
|
||||
UNICHARSET cube_unicharset_;
|
||||
// This points to either the tess_unicharset we're passed or cube_unicharset_,
|
||||
// depending upon whether we just have one unicharset or one for each
|
||||
// tesseract and cube, respectively.
|
||||
UNICHARSET *unicharset_;
|
||||
// has the char set been initialized flag
|
||||
bool init_;
|
||||
};
|
||||
|
@ -242,6 +242,16 @@ int CubeObject::WordCost(const char *str) {
|
||||
return cost;
|
||||
}
|
||||
|
||||
// Recognizes a single character and returns the list of results.
|
||||
CharAltList *CubeObject::RecognizeChar() {
|
||||
if (char_samp_ == NULL) return NULL;
|
||||
CharAltList* alt_list = NULL;
|
||||
CharClassifier *char_classifier = cntxt_->Classifier();
|
||||
ASSERT_HOST(char_classifier != NULL);
|
||||
alt_list = char_classifier->Classify(char_samp_);
|
||||
return alt_list;
|
||||
}
|
||||
|
||||
// Normalize the input word bitmap to have a minimum aspect ratio
|
||||
bool CubeObject::Normalize() {
|
||||
// create a cube search object
|
||||
|
@ -64,7 +64,6 @@
|
||||
//
|
||||
// // Call this once you are done recognizing all words
|
||||
// // for the current language
|
||||
// tess_obj->end_tesseract();
|
||||
// delete tess_obj;
|
||||
//
|
||||
// Note that if the language supports "Italics" (see the CubeRecoContext), the
|
||||
@ -110,6 +109,8 @@ class CubeObject {
|
||||
// recognition of a language model that allows only the specified word.
|
||||
// The alternate list(s) will be permanently modified.
|
||||
int WordCost(const char *str);
|
||||
// Recognizes a single character and returns the list of results.
|
||||
CharAltList *RecognizeChar();
|
||||
|
||||
// Returns the BeamSearch object that resulted from the last call to
|
||||
// RecognizeWord
|
||||
|
@ -358,8 +358,7 @@ void CubeUtils::UTF32ToUTF8(const char_32 *utf32_str, string *str) {
|
||||
}
|
||||
}
|
||||
|
||||
bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set,
|
||||
UNICHARSET *unicharset) {
|
||||
bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set) {
|
||||
bool all_one_case = true;
|
||||
bool capitalized;
|
||||
bool prev_upper;
|
||||
@ -370,20 +369,18 @@ bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set,
|
||||
bool cur_lower;
|
||||
|
||||
string str8;
|
||||
if (!char_set || !unicharset || !char_set->SharedUnicharset()) {
|
||||
// If cube char_set or tesseract unicharset are missing, or
|
||||
// unicharset is not shared, then use C-locale-dependent functions
|
||||
if (!char_set) {
|
||||
// If cube char_set is missing, use C-locale-dependent functions
|
||||
// on UTF8 characters to determine case properties.
|
||||
UTF32ToUTF8(str32, &str8);
|
||||
first_upper = isupper(str8[0]);
|
||||
first_lower = islower(str8[0]);
|
||||
first_upper = isupper(str32[0]);
|
||||
first_lower = islower(str32[0]);
|
||||
if (first_upper)
|
||||
capitalized = true;
|
||||
prev_upper = first_upper;
|
||||
prev_lower = islower(str8[0]);
|
||||
for (int c = 1; c < str8.length(); ++c) {
|
||||
cur_upper = isupper(str8[c]);
|
||||
cur_lower = islower(str8[c]);
|
||||
prev_lower = islower(str32[0]);
|
||||
for (int c = 1; str32[c] != 0; ++c) {
|
||||
cur_upper = isupper(str32[c]);
|
||||
cur_lower = islower(str32[c]);
|
||||
if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
|
||||
all_one_case = false;
|
||||
if (cur_upper)
|
||||
@ -392,17 +389,18 @@ bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set,
|
||||
prev_lower = cur_lower;
|
||||
}
|
||||
} else {
|
||||
UNICHARSET *unicharset = char_set->InternalUnicharset();
|
||||
// Use UNICHARSET functions to determine case properties
|
||||
first_upper = unicharset->get_isupper(char_set->UnicharID(str32[0]));
|
||||
first_lower = unicharset->get_islower(char_set->UnicharID(str32[0]));
|
||||
first_upper = unicharset->get_isupper(char_set->ClassID(str32[0]));
|
||||
first_lower = unicharset->get_islower(char_set->ClassID(str32[0]));
|
||||
if (first_upper)
|
||||
capitalized = true;
|
||||
prev_upper = first_upper;
|
||||
prev_lower = unicharset->get_islower(char_set->UnicharID(str32[0]));
|
||||
prev_lower = unicharset->get_islower(char_set->ClassID(str32[0]));
|
||||
|
||||
for (int c = 1; c < StrLen(str32); ++c) {
|
||||
cur_upper = unicharset->get_isupper(char_set->UnicharID(str32[c]));
|
||||
cur_lower = unicharset->get_islower(char_set->UnicharID(str32[c]));
|
||||
cur_upper = unicharset->get_isupper(char_set->ClassID(str32[c]));
|
||||
cur_lower = unicharset->get_islower(char_set->ClassID(str32[c]));
|
||||
if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
|
||||
all_one_case = false;
|
||||
if (cur_upper)
|
||||
@ -414,11 +412,11 @@ bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set,
|
||||
return all_one_case || capitalized;
|
||||
}
|
||||
|
||||
char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set,
|
||||
UNICHARSET *unicharset) {
|
||||
if (!char_set || !unicharset || !char_set->SharedUnicharset()) {
|
||||
char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set) {
|
||||
if (!char_set) {
|
||||
return NULL;
|
||||
}
|
||||
UNICHARSET *unicharset = char_set->InternalUnicharset();
|
||||
int len = StrLen(str32);
|
||||
char_32 *lower = new char_32[len + 1];
|
||||
if (!lower)
|
||||
@ -430,9 +428,8 @@ char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set,
|
||||
return NULL;
|
||||
}
|
||||
// convert upper-case characters to lower-case
|
||||
if (unicharset->get_isupper(char_set->UnicharID(ch))) {
|
||||
UNICHAR_ID uid_lower =
|
||||
unicharset->get_other_case(char_set->UnicharID(ch));
|
||||
if (unicharset->get_isupper(char_set->ClassID(ch))) {
|
||||
UNICHAR_ID uid_lower = unicharset->get_other_case(char_set->ClassID(ch));
|
||||
const char_32 *str32_lower = char_set->ClassString(uid_lower);
|
||||
// expect lower-case version of character to be a single character
|
||||
if (!str32_lower || StrLen(str32_lower) != 1) {
|
||||
@ -448,11 +445,11 @@ char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set,
|
||||
return lower;
|
||||
}
|
||||
|
||||
char_32 *CubeUtils::ToUpper(const char_32 *str32, CharSet *char_set,
|
||||
UNICHARSET *unicharset) {
|
||||
if (!char_set || !unicharset || !char_set->SharedUnicharset()) {
|
||||
char_32 *CubeUtils::ToUpper(const char_32 *str32, CharSet *char_set) {
|
||||
if (!char_set) {
|
||||
return NULL;
|
||||
}
|
||||
UNICHARSET *unicharset = char_set->InternalUnicharset();
|
||||
int len = StrLen(str32);
|
||||
char_32 *upper = new char_32[len + 1];
|
||||
if (!upper)
|
||||
@ -464,9 +461,8 @@ char_32 *CubeUtils::ToUpper(const char_32 *str32, CharSet *char_set,
|
||||
return NULL;
|
||||
}
|
||||
// convert lower-case characters to upper-case
|
||||
if (unicharset->get_islower(char_set->UnicharID(ch))) {
|
||||
UNICHAR_ID uid_upper =
|
||||
unicharset->get_other_case(char_set->UnicharID(ch));
|
||||
if (unicharset->get_islower(char_set->ClassID(ch))) {
|
||||
UNICHAR_ID uid_upper = unicharset->get_other_case(char_set->ClassID(ch));
|
||||
const char_32 *str32_upper = char_set->ClassString(uid_upper);
|
||||
// expect upper-case version of character to be a single character
|
||||
if (!str32_upper || StrLen(str32_upper) != 1) {
|
||||
|
@ -69,24 +69,18 @@ class CubeUtils {
|
||||
static void UTF32ToUTF8(const char_32 *utf32_str, string *str);
|
||||
// Returns true if input word has either 1) all-one-case, or 2)
|
||||
// first character upper-case, and remaining characters lower-case.
|
||||
// If char_set and unicharset are not NULL, uses tesseract's unicharset
|
||||
// functions to determine case properties. Otherwise, uses
|
||||
// C-locale-dependent functions, which may be unreliable on
|
||||
// non-ASCII characters.
|
||||
static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set,
|
||||
UNICHARSET *unicharset);
|
||||
// If char_set is not NULL, uses tesseract's unicharset functions
|
||||
// to determine case properties. Otherwise, uses C-locale-dependent
|
||||
// functions, which may be unreliable on non-ASCII characters.
|
||||
static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set);
|
||||
// Returns char_32 pointer to the lower-case-transformed version of
|
||||
// the input string or NULL on error. If char_set or unicharset are
|
||||
// NULL, or tesseract and cube do not share unicharsets, returns
|
||||
// NULL. Return array must be freed by caller.
|
||||
static char_32 *ToLower(const char_32 *str32, CharSet *char_set,
|
||||
UNICHARSET *unicharset);
|
||||
// the input string or NULL on error. If char_set is NULL returns NULL.
|
||||
// Return array must be freed by caller.
|
||||
static char_32 *ToLower(const char_32 *str32, CharSet *char_set);
|
||||
// Returns char_32 pointer to the upper-case-transformed version of
|
||||
// the input string or NULL on error. If char_set or unicharset are
|
||||
// NULL, or tesseract and cube do not share unicharsets, returns
|
||||
// NULL. Return array must be freed by caller.
|
||||
static char_32 *ToUpper(const char_32 *str32, CharSet *char_set,
|
||||
UNICHARSET *unicharset);
|
||||
// the input string or NULL on error. If char_set is NULL returns NULL.
|
||||
// Return array must be freed by caller.
|
||||
static char_32 *ToUpper(const char_32 *str32, CharSet *char_set);
|
||||
private:
|
||||
static unsigned char *GetImageData(IMAGE *img,
|
||||
int left, int top, int wid, int hgt);
|
||||
|
@ -121,11 +121,11 @@ void WordListLangModel::WordVariants(const CharSet &char_set,
|
||||
string_32 prefix_str32,
|
||||
WERD_CHOICE *word_so_far,
|
||||
string_32 str32,
|
||||
vector<WERD_CHOICE> *word_variants) {
|
||||
vector<WERD_CHOICE *> *word_variants) {
|
||||
int str_len = str32.length();
|
||||
if (str_len == 0) {
|
||||
if (word_so_far->length() > 0) {
|
||||
word_variants->push_back(*word_so_far);
|
||||
word_variants->push_back(new WERD_CHOICE(*word_so_far));
|
||||
}
|
||||
} else {
|
||||
// Try out all the possible prefixes of the str32.
|
||||
@ -151,11 +151,15 @@ void WordListLangModel::WordVariants(const CharSet &char_set,
|
||||
// Compute all the variants of a 32-bit string in terms of the class-ids
|
||||
// This is needed for languages that have ligatures. A word can then have more
|
||||
// than one spelling in terms of the class-ids
|
||||
void WordListLangModel::WordVariants(const CharSet &char_set, string_32 str32,
|
||||
vector<WERD_CHOICE> *word_variants) {
|
||||
void WordListLangModel::WordVariants(const CharSet &char_set,
|
||||
const UNICHARSET *uchset, string_32 str32,
|
||||
vector<WERD_CHOICE *> *word_variants) {
|
||||
for (int i = 0; i < word_variants->size(); i++) {
|
||||
delete (*word_variants)[i];
|
||||
}
|
||||
word_variants->clear();
|
||||
string_32 prefix_str32;
|
||||
WERD_CHOICE word_so_far;
|
||||
WERD_CHOICE word_so_far(uchset);
|
||||
WordVariants(char_set, prefix_str32, &word_so_far, str32, word_variants);
|
||||
}
|
||||
|
||||
@ -179,21 +183,23 @@ bool WordListLangModel::AddString32(const char_32 *char_32_ptr) {
|
||||
return false;
|
||||
}
|
||||
// get all the word variants
|
||||
vector<WERD_CHOICE> word_variants;
|
||||
WordVariants(*(cntxt_->CharacterSet()), char_32_ptr, &word_variants);
|
||||
vector<WERD_CHOICE *> word_variants;
|
||||
WordVariants(*(cntxt_->CharacterSet()), cntxt_->TessUnicharset(),
|
||||
char_32_ptr, &word_variants);
|
||||
|
||||
if (word_variants.size() > 0) {
|
||||
// find the shortest variant
|
||||
int shortest_word = 0;
|
||||
for (int word = 1; word < word_variants.size(); word++) {
|
||||
if (word_variants[shortest_word].length() >
|
||||
word_variants[word].length()) {
|
||||
if (word_variants[shortest_word]->length() >
|
||||
word_variants[word]->length()) {
|
||||
shortest_word = word;
|
||||
}
|
||||
}
|
||||
// only add the shortest grapheme interpretation of string to the word list
|
||||
dawg_->add_word_to_dawg(word_variants[shortest_word]);
|
||||
dawg_->add_word_to_dawg(*word_variants[shortest_word]);
|
||||
}
|
||||
for (int i = 0; i < word_variants.size(); i++) { delete word_variants[i]; }
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -63,8 +63,9 @@ class WordListLangModel : public LangModel {
|
||||
// Compute all the variants of a 32-bit string in terms of the class-ids.
|
||||
// This is needed for languages that have ligatures. A word can then have
|
||||
// more than one spelling in terms of the class-ids.
|
||||
static void WordVariants(const CharSet &char_set, string_32 str32,
|
||||
vector<WERD_CHOICE> *word_variants);
|
||||
static void WordVariants(const CharSet &char_set, const UNICHARSET *uchset,
|
||||
string_32 str32,
|
||||
vector<WERD_CHOICE *> *word_variants);
|
||||
private:
|
||||
// constants needed to configure the language model
|
||||
static const int kMaxEdge = 512;
|
||||
@ -78,9 +79,11 @@ class WordListLangModel : public LangModel {
|
||||
// Cleanup
|
||||
void Cleanup();
|
||||
// Recursive helper function for WordVariants().
|
||||
static void WordVariants(const CharSet &char_set,
|
||||
string_32 prefix_str32, WERD_CHOICE *word_so_far,
|
||||
string_32 str32, vector<WERD_CHOICE> *word_variants);
|
||||
static void WordVariants(
|
||||
const CharSet &char_set,
|
||||
string_32 prefix_str32, WERD_CHOICE *word_so_far,
|
||||
string_32 str32,
|
||||
vector<WERD_CHOICE *> *word_variants);
|
||||
};
|
||||
} // tesseract
|
||||
|
||||
|
@ -25,7 +25,6 @@
|
||||
#include "const.h"
|
||||
#include "cube_utils.h"
|
||||
#include "ndminx.h"
|
||||
#include "unicharset.h"
|
||||
#include "word_unigrams.h"
|
||||
|
||||
namespace tesseract {
|
||||
@ -150,8 +149,7 @@ WordUnigrams *WordUnigrams::Create(const string &data_file_path,
|
||||
// cost.
|
||||
int WordUnigrams::Cost(const char_32 *key_str32,
|
||||
LangModel *lang_mod,
|
||||
CharSet *char_set,
|
||||
UNICHARSET *unicharset) const {
|
||||
CharSet *char_set) const {
|
||||
if (!key_str32)
|
||||
return 0;
|
||||
// convert string to UTF8 to split into space-separated words
|
||||
@ -206,15 +204,15 @@ int WordUnigrams::Cost(const char_32 *key_str32,
|
||||
// if case invariant, get costs of all-upper-case and all-lower-case
|
||||
// versions and return the min cost
|
||||
if (clean_len >= kMinLengthNumOrCaseInvariant &&
|
||||
CubeUtils::IsCaseInvariant(clean_str32, char_set, unicharset)) {
|
||||
char_32 *lower_32 = CubeUtils::ToLower(clean_str32, char_set, unicharset);
|
||||
CubeUtils::IsCaseInvariant(clean_str32, char_set)) {
|
||||
char_32 *lower_32 = CubeUtils::ToLower(clean_str32, char_set);
|
||||
if (lower_32) {
|
||||
string lower_8;
|
||||
CubeUtils::UTF32ToUTF8(lower_32, &lower_8);
|
||||
word_cost = MIN(word_cost, CostInternal(lower_8.c_str()));
|
||||
delete [] lower_32;
|
||||
}
|
||||
char_32 *upper_32 = CubeUtils::ToUpper(clean_str32, char_set, unicharset);
|
||||
char_32 *upper_32 = CubeUtils::ToUpper(clean_str32, char_set);
|
||||
if (upper_32) {
|
||||
string upper_8;
|
||||
CubeUtils::UTF32ToUTF8(upper_32, &upper_8);
|
||||
|
@ -49,7 +49,7 @@ class WordUnigrams {
|
||||
// case-invariant cost is computed in those cases, assuming the word
|
||||
// meets a minimum length.
|
||||
int Cost(const char_32 *str32, LangModel *lang_mod,
|
||||
CharSet *char_set, UNICHARSET *unicharset) const;
|
||||
CharSet *char_set) const;
|
||||
protected:
|
||||
// Compute the word unigram cost of a UTF-8 string with binary
|
||||
// search of sorted words_ array.
|
||||
|
Loading…
Reference in New Issue
Block a user