Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Refactored top-level word recognition module, Added simultaneous multi-language capability.

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@654 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
theraysmith@gmail.com 2012-02-02 03:03:56 +00:00
parent e33ae59f4d
commit 73adf693d5
13 changed files with 104 additions and 84 deletions

View File

@ -245,13 +245,11 @@ WordAltList *BeamSearch::CreateWordAltList(SearchObject *srch_obj) {
int cost = 0;
// char bigram cost
int bigram_cost = !bigrams ? 0 :
bigrams->Cost(ch_buff, cntxt_->CharacterSet(),
&cntxt_->TesseractObject()->unicharset);
bigrams->Cost(ch_buff, cntxt_->CharacterSet());
// word unigram cost
int unigram_cost = !word_unigrams ? 0 :
word_unigrams->Cost(ch_buff, cntxt_->LangMod(),
cntxt_->CharacterSet(),
&cntxt_->TesseractObject()->unicharset);
cntxt_->CharacterSet());
// overall cost
cost = static_cast<int>(
(size_cost * cntxt_->Params()->SizeWgt()) +

View File

@ -25,7 +25,6 @@
#include "char_bigrams.h"
#include "cube_utils.h"
#include "ndminx.h"
#include "unicharset.h"
#include "cube_const.h"
namespace tesseract {
@ -167,21 +166,20 @@ int CharBigrams::PairCost(char_32 ch1, char_32 ch2) const {
return bigram_table_.char_bigram[ch1].bigram[ch2].cost;
}
int CharBigrams::Cost(const char_32 *char_32_ptr, CharSet *char_set,
UNICHARSET *unicharset) const {
int CharBigrams::Cost(const char_32 *char_32_ptr, CharSet *char_set) const {
if (!char_32_ptr || char_32_ptr[0] == 0) {
return bigram_table_.worst_cost;
}
int cost = MeanCostWithSpaces(char_32_ptr);
if (CubeUtils::StrLen(char_32_ptr) >= kMinLengthCaseInvariant &&
CubeUtils::IsCaseInvariant(char_32_ptr, char_set, unicharset)) {
char_32 *lower_32 = CubeUtils::ToLower(char_32_ptr, char_set, unicharset);
CubeUtils::IsCaseInvariant(char_32_ptr, char_set)) {
char_32 *lower_32 = CubeUtils::ToLower(char_32_ptr, char_set);
if (lower_32 && lower_32[0] != 0) {
int cost_lower = MeanCostWithSpaces(lower_32);
cost = MIN(cost, cost_lower);
delete [] lower_32;
}
char_32 *upper_32 = CubeUtils::ToUpper(char_32_ptr, char_set, unicharset);
char_32 *upper_32 = CubeUtils::ToUpper(char_32_ptr, char_set);
if (upper_32 && upper_32[0] != 0) {
int cost_upper = MeanCostWithSpaces(upper_32);
cost = MIN(cost, cost_upper);

View File

@ -61,13 +61,12 @@ class CharBigrams {
static CharBigrams *Create(const string &data_file_path,
const string &lang);
// Top-level function to return the mean character bigram cost of a
// sequence of characters. If char_set and unicharset are not NULL
// and cube and tesseract share the same unicharset, use
// sequence of characters. If char_set is not NULL, use
// tesseract functions to return a case-invariant cost.
// This avoids unnecessarily penalizing all-one-case words or
// capitalized words (first-letter upper-case and remaining letters
// lower-case).
int Cost(const char_32 *str, CharSet *char_set, UNICHARSET *unicharset) const;
int Cost(const char_32 *str, CharSet *char_set) const;
protected:
// Returns the character bigram cost of two characters.

View File

@ -78,10 +78,16 @@ CharSet *CharSet::Create(TessdataManager *tessdata_manager,
// map its unichars to tesseract's; if only one unicharset exists,
// just load it.
bool loaded;
if (cube_unicharset_exists)
loaded = char_set->LoadSupportedCharList(charset_fp, tess_unicharset);
else
if (cube_unicharset_exists) {
char_set->cube_unicharset_.load_from_file(charset_fp);
loaded = tessdata_manager->SeekToStart(TESSDATA_CUBE_UNICHARSET);
loaded = loaded && char_set->LoadSupportedCharList(
tessdata_manager->GetDataFilePtr(), tess_unicharset);
char_set->unicharset_ = &char_set->cube_unicharset_;
} else {
loaded = char_set->LoadSupportedCharList(charset_fp, NULL);
char_set->unicharset_ = tess_unicharset;
}
if (!loaded) {
delete char_set;
return false;

View File

@ -116,6 +116,10 @@ class CharSet {
static CharSet *Create(TessdataManager *tessdata_manager,
UNICHARSET *tess_unicharset);
// Accessor for the UNICHARSET that cube uses internally for recognition.
// The unichar_id's returned by ClassId() refer to entries of this
// unicharset. Simply hands back the stored unicharset_ pointer.
UNICHARSET *InternalUnicharset() {
  return unicharset_;
}
private:
// Hash table configuration params. Determined empirically on
// the supported languages so far (Eng, Ara, Hin). Might need to be
@ -155,6 +159,13 @@ class CharSet {
string_32 **class_strings_;
// map from class id to secondary (tesseract's) unicharset's ids
int *unicharset_map_;
// A unicharset which is filled in with a Tesseract-style UNICHARSET for
// cube's data if our unicharset is different from tesseract's.
UNICHARSET cube_unicharset_;
// This points to either the tess_unicharset we're passed or cube_unicharset_,
// depending upon whether we just have one unicharset or one for each
// tesseract and cube, respectively.
UNICHARSET *unicharset_;
// has the char set been initialized flag
bool init_;
};

View File

@ -242,6 +242,16 @@ int CubeObject::WordCost(const char *str) {
return cost;
}
// Classifies the current character sample with the context's character
// classifier and returns the resulting alternate list.
// Returns NULL if no character sample is set (char_samp_ is NULL).
CharAltList *CubeObject::RecognizeChar() {
  if (char_samp_ != NULL) {
    CharClassifier *classifier = cntxt_->Classifier();
    // A recognition context without a classifier is treated as fatal.
    ASSERT_HOST(classifier != NULL);
    return classifier->Classify(char_samp_);
  }
  return NULL;
}
// Normalize the input word bitmap to have a minimum aspect ratio
bool CubeObject::Normalize() {
// create a cube search object

View File

@ -64,7 +64,6 @@
//
// // Call this once you are done recognizing all words
// // for the current language
// tess_obj->end_tesseract();
// delete tess_obj;
//
// Note that if the language supports "Italics" (see the CubeRecoContext), the
@ -110,6 +109,8 @@ class CubeObject {
// recognition of a language model that allows only the specified word.
// The alternate list(s) will be permanently modified.
int WordCost(const char *str);
// Recognizes a single character and returns the list of results.
CharAltList *RecognizeChar();
// Returns the BeamSearch object that resulted from the last call to
// RecognizeWord

View File

@ -358,8 +358,7 @@ void CubeUtils::UTF32ToUTF8(const char_32 *utf32_str, string *str) {
}
}
bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set,
UNICHARSET *unicharset) {
bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set) {
bool all_one_case = true;
bool capitalized;
bool prev_upper;
@ -370,20 +369,18 @@ bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set,
bool cur_lower;
string str8;
if (!char_set || !unicharset || !char_set->SharedUnicharset()) {
// If cube char_set or tesseract unicharset are missing, or
// unicharset is not shared, then use C-locale-dependent functions
if (!char_set) {
// If cube char_set is missing, use C-locale-dependent functions
// on UTF8 characters to determine case properties.
UTF32ToUTF8(str32, &str8);
first_upper = isupper(str8[0]);
first_lower = islower(str8[0]);
first_upper = isupper(str32[0]);
first_lower = islower(str32[0]);
if (first_upper)
capitalized = true;
prev_upper = first_upper;
prev_lower = islower(str8[0]);
for (int c = 1; c < str8.length(); ++c) {
cur_upper = isupper(str8[c]);
cur_lower = islower(str8[c]);
prev_lower = islower(str32[0]);
for (int c = 1; str32[c] != 0; ++c) {
cur_upper = isupper(str32[c]);
cur_lower = islower(str32[c]);
if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
all_one_case = false;
if (cur_upper)
@ -392,17 +389,18 @@ bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set,
prev_lower = cur_lower;
}
} else {
UNICHARSET *unicharset = char_set->InternalUnicharset();
// Use UNICHARSET functions to determine case properties
first_upper = unicharset->get_isupper(char_set->UnicharID(str32[0]));
first_lower = unicharset->get_islower(char_set->UnicharID(str32[0]));
first_upper = unicharset->get_isupper(char_set->ClassID(str32[0]));
first_lower = unicharset->get_islower(char_set->ClassID(str32[0]));
if (first_upper)
capitalized = true;
prev_upper = first_upper;
prev_lower = unicharset->get_islower(char_set->UnicharID(str32[0]));
prev_lower = unicharset->get_islower(char_set->ClassID(str32[0]));
for (int c = 1; c < StrLen(str32); ++c) {
cur_upper = unicharset->get_isupper(char_set->UnicharID(str32[c]));
cur_lower = unicharset->get_islower(char_set->UnicharID(str32[c]));
cur_upper = unicharset->get_isupper(char_set->ClassID(str32[c]));
cur_lower = unicharset->get_islower(char_set->ClassID(str32[c]));
if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
all_one_case = false;
if (cur_upper)
@ -414,11 +412,11 @@ bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set,
return all_one_case || capitalized;
}
char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set,
UNICHARSET *unicharset) {
if (!char_set || !unicharset || !char_set->SharedUnicharset()) {
char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set) {
if (!char_set) {
return NULL;
}
UNICHARSET *unicharset = char_set->InternalUnicharset();
int len = StrLen(str32);
char_32 *lower = new char_32[len + 1];
if (!lower)
@ -430,9 +428,8 @@ char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set,
return NULL;
}
// convert upper-case characters to lower-case
if (unicharset->get_isupper(char_set->UnicharID(ch))) {
UNICHAR_ID uid_lower =
unicharset->get_other_case(char_set->UnicharID(ch));
if (unicharset->get_isupper(char_set->ClassID(ch))) {
UNICHAR_ID uid_lower = unicharset->get_other_case(char_set->ClassID(ch));
const char_32 *str32_lower = char_set->ClassString(uid_lower);
// expect lower-case version of character to be a single character
if (!str32_lower || StrLen(str32_lower) != 1) {
@ -448,11 +445,11 @@ char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set,
return lower;
}
char_32 *CubeUtils::ToUpper(const char_32 *str32, CharSet *char_set,
UNICHARSET *unicharset) {
if (!char_set || !unicharset || !char_set->SharedUnicharset()) {
char_32 *CubeUtils::ToUpper(const char_32 *str32, CharSet *char_set) {
if (!char_set) {
return NULL;
}
UNICHARSET *unicharset = char_set->InternalUnicharset();
int len = StrLen(str32);
char_32 *upper = new char_32[len + 1];
if (!upper)
@ -464,9 +461,8 @@ char_32 *CubeUtils::ToUpper(const char_32 *str32, CharSet *char_set,
return NULL;
}
// convert lower-case characters to upper-case
if (unicharset->get_islower(char_set->UnicharID(ch))) {
UNICHAR_ID uid_upper =
unicharset->get_other_case(char_set->UnicharID(ch));
if (unicharset->get_islower(char_set->ClassID(ch))) {
UNICHAR_ID uid_upper = unicharset->get_other_case(char_set->ClassID(ch));
const char_32 *str32_upper = char_set->ClassString(uid_upper);
// expect upper-case version of character to be a single character
if (!str32_upper || StrLen(str32_upper) != 1) {

View File

@ -69,24 +69,18 @@ class CubeUtils {
static void UTF32ToUTF8(const char_32 *utf32_str, string *str);
// Returns true if input word has either 1) all-one-case, or 2)
// first character upper-case, and remaining characters lower-case.
// If char_set and unicharset are not NULL, uses tesseract's unicharset
// functions to determine case properties. Otherwise, uses
// C-locale-dependent functions, which may be unreliable on
// non-ASCII characters.
static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set,
UNICHARSET *unicharset);
// If char_set is not NULL, uses tesseract's unicharset functions
// to determine case properties. Otherwise, uses C-locale-dependent
// functions, which may be unreliable on non-ASCII characters.
static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set);
// Returns char_32 pointer to the lower-case-transformed version of
// the input string or NULL on error. If char_set or unicharset are
// NULL, or tesseract and cube do not share unicharsets, returns
// NULL. Return array must be freed by caller.
static char_32 *ToLower(const char_32 *str32, CharSet *char_set,
UNICHARSET *unicharset);
// the input string or NULL on error. If char_set is NULL returns NULL.
// Return array must be freed by caller.
static char_32 *ToLower(const char_32 *str32, CharSet *char_set);
// Returns char_32 pointer to the upper-case-transformed version of
// the input string or NULL on error. If char_set or unicharset are
// NULL, or tesseract and cube do not share unicharsets, returns
// NULL. Return array must be freed by caller.
static char_32 *ToUpper(const char_32 *str32, CharSet *char_set,
UNICHARSET *unicharset);
// the input string or NULL on error. If char_set is NULL returns NULL.
// Return array must be freed by caller.
static char_32 *ToUpper(const char_32 *str32, CharSet *char_set);
private:
static unsigned char *GetImageData(IMAGE *img,
int left, int top, int wid, int hgt);

View File

@ -121,11 +121,11 @@ void WordListLangModel::WordVariants(const CharSet &char_set,
string_32 prefix_str32,
WERD_CHOICE *word_so_far,
string_32 str32,
vector<WERD_CHOICE> *word_variants) {
vector<WERD_CHOICE *> *word_variants) {
int str_len = str32.length();
if (str_len == 0) {
if (word_so_far->length() > 0) {
word_variants->push_back(*word_so_far);
word_variants->push_back(new WERD_CHOICE(*word_so_far));
}
} else {
// Try out all the possible prefixes of the str32.
@ -151,11 +151,15 @@ void WordListLangModel::WordVariants(const CharSet &char_set,
// Compute all the variants of a 32-bit string in terms of the class-ids
// This is needed for languages that have ligatures. A word can then have more
// than one spelling in terms of the class-ids
void WordListLangModel::WordVariants(const CharSet &char_set, string_32 str32,
vector<WERD_CHOICE> *word_variants) {
void WordListLangModel::WordVariants(const CharSet &char_set,
const UNICHARSET *uchset, string_32 str32,
vector<WERD_CHOICE *> *word_variants) {
for (int i = 0; i < word_variants->size(); i++) {
delete (*word_variants)[i];
}
word_variants->clear();
string_32 prefix_str32;
WERD_CHOICE word_so_far;
WERD_CHOICE word_so_far(uchset);
WordVariants(char_set, prefix_str32, &word_so_far, str32, word_variants);
}
@ -179,21 +183,23 @@ bool WordListLangModel::AddString32(const char_32 *char_32_ptr) {
return false;
}
// get all the word variants
vector<WERD_CHOICE> word_variants;
WordVariants(*(cntxt_->CharacterSet()), char_32_ptr, &word_variants);
vector<WERD_CHOICE *> word_variants;
WordVariants(*(cntxt_->CharacterSet()), cntxt_->TessUnicharset(),
char_32_ptr, &word_variants);
if (word_variants.size() > 0) {
// find the shortest variant
int shortest_word = 0;
for (int word = 1; word < word_variants.size(); word++) {
if (word_variants[shortest_word].length() >
word_variants[word].length()) {
if (word_variants[shortest_word]->length() >
word_variants[word]->length()) {
shortest_word = word;
}
}
// only add the shortest grapheme interpretation of string to the word list
dawg_->add_word_to_dawg(word_variants[shortest_word]);
dawg_->add_word_to_dawg(*word_variants[shortest_word]);
}
for (int i = 0; i < word_variants.size(); i++) { delete word_variants[i]; }
return true;
}

View File

@ -63,8 +63,9 @@ class WordListLangModel : public LangModel {
// Compute all the variants of a 32-bit string in terms of the class-ids.
// This is needed for languages that have ligatures. A word can then have
// more than one spelling in terms of the class-ids.
static void WordVariants(const CharSet &char_set, string_32 str32,
vector<WERD_CHOICE> *word_variants);
static void WordVariants(const CharSet &char_set, const UNICHARSET *uchset,
string_32 str32,
vector<WERD_CHOICE *> *word_variants);
private:
// constants needed to configure the language model
static const int kMaxEdge = 512;
@ -78,9 +79,11 @@ class WordListLangModel : public LangModel {
// Cleanup
void Cleanup();
// Recursive helper function for WordVariants().
static void WordVariants(const CharSet &char_set,
string_32 prefix_str32, WERD_CHOICE *word_so_far,
string_32 str32, vector<WERD_CHOICE> *word_variants);
static void WordVariants(
const CharSet &char_set,
string_32 prefix_str32, WERD_CHOICE *word_so_far,
string_32 str32,
vector<WERD_CHOICE *> *word_variants);
};
} // tesseract

View File

@ -25,7 +25,6 @@
#include "const.h"
#include "cube_utils.h"
#include "ndminx.h"
#include "unicharset.h"
#include "word_unigrams.h"
namespace tesseract {
@ -150,8 +149,7 @@ WordUnigrams *WordUnigrams::Create(const string &data_file_path,
// cost.
int WordUnigrams::Cost(const char_32 *key_str32,
LangModel *lang_mod,
CharSet *char_set,
UNICHARSET *unicharset) const {
CharSet *char_set) const {
if (!key_str32)
return 0;
// convert string to UTF8 to split into space-separated words
@ -206,15 +204,15 @@ int WordUnigrams::Cost(const char_32 *key_str32,
// if case invariant, get costs of all-upper-case and all-lower-case
// versions and return the min cost
if (clean_len >= kMinLengthNumOrCaseInvariant &&
CubeUtils::IsCaseInvariant(clean_str32, char_set, unicharset)) {
char_32 *lower_32 = CubeUtils::ToLower(clean_str32, char_set, unicharset);
CubeUtils::IsCaseInvariant(clean_str32, char_set)) {
char_32 *lower_32 = CubeUtils::ToLower(clean_str32, char_set);
if (lower_32) {
string lower_8;
CubeUtils::UTF32ToUTF8(lower_32, &lower_8);
word_cost = MIN(word_cost, CostInternal(lower_8.c_str()));
delete [] lower_32;
}
char_32 *upper_32 = CubeUtils::ToUpper(clean_str32, char_set, unicharset);
char_32 *upper_32 = CubeUtils::ToUpper(clean_str32, char_set);
if (upper_32) {
string upper_8;
CubeUtils::UTF32ToUTF8(upper_32, &upper_8);

View File

@ -49,7 +49,7 @@ class WordUnigrams {
// case-invariant cost is computed in those cases, assuming the word
// meets a minimum length.
int Cost(const char_32 *str32, LangModel *lang_mod,
CharSet *char_set, UNICHARSET *unicharset) const;
CharSet *char_set) const;
protected:
// Compute the word unigram cost of a UTF-8 string with binary
// search of sorted words_ array.