This commit is contained in:
Jim O'Regan 2014-09-12 21:54:57 +01:00
parent 9f42f69782
commit 0fc4d528a3
3 changed files with 55 additions and 21 deletions

View File

@ -31,7 +31,9 @@ CubeUtils::CubeUtils() {
CubeUtils::~CubeUtils() { CubeUtils::~CubeUtils() {
} }
// convert a prob to a cost (-ve log prob) /**
* convert a prob to a cost (-ve log prob)
*/
int CubeUtils::Prob2Cost(double prob_val) { int CubeUtils::Prob2Cost(double prob_val) {
if (prob_val < MIN_PROB) { if (prob_val < MIN_PROB) {
return MIN_PROB_COST; return MIN_PROB_COST;
@ -39,12 +41,16 @@ int CubeUtils::Prob2Cost(double prob_val) {
return static_cast<int>(-log(prob_val) * PROB2COST_SCALE); return static_cast<int>(-log(prob_val) * PROB2COST_SCALE);
} }
// converts a cost to probability /**
* converts a cost to probability
*/
double CubeUtils::Cost2Prob(int cost) { double CubeUtils::Cost2Prob(int cost) {
return exp(-cost / PROB2COST_SCALE); return exp(-cost / PROB2COST_SCALE);
} }
// computes the length of a NULL terminated char_32 string /**
* computes the length of a NULL terminated char_32 string
*/
int CubeUtils::StrLen(const char_32 *char_32_ptr) { int CubeUtils::StrLen(const char_32 *char_32_ptr) {
if (char_32_ptr == NULL) { if (char_32_ptr == NULL) {
return 0; return 0;
@ -54,7 +60,9 @@ int CubeUtils::StrLen(const char_32 *char_32_ptr) {
return len; return len;
} }
// compares two char_32 strings /**
* compares two char_32 strings
*/
int CubeUtils::StrCmp(const char_32 *str1, const char_32 *str2) { int CubeUtils::StrCmp(const char_32 *str1, const char_32 *str2) {
const char_32 *pch1 = str1; const char_32 *pch1 = str1;
const char_32 *pch2 = str2; const char_32 *pch2 = str2;
@ -76,7 +84,9 @@ int CubeUtils::StrCmp(const char_32 *str1, const char_32 *str2) {
} }
} }
// Duplicates a 32-bit char buffer /**
* Duplicates a 32-bit char buffer
*/
char_32 *CubeUtils::StrDup(const char_32 *str32) { char_32 *CubeUtils::StrDup(const char_32 *str32) {
int len = StrLen(str32); int len = StrLen(str32);
char_32 *new_str = new char_32[len + 1]; char_32 *new_str = new char_32[len + 1];
@ -88,7 +98,9 @@ char_32 *CubeUtils::StrDup(const char_32 *str32) {
return new_str; return new_str;
} }
// creates a char samp from a specified portion of the image /**
* creates a char samp from a specified portion of the image
*/
CharSamp *CubeUtils::CharSampleFromPix(Pix *pix, int left, int top, CharSamp *CubeUtils::CharSampleFromPix(Pix *pix, int left, int top,
int wid, int hgt) { int wid, int hgt) {
// get the raw img data from the image // get the raw img data from the image
@ -105,7 +117,9 @@ CharSamp *CubeUtils::CharSampleFromPix(Pix *pix, int left, int top,
return char_samp; return char_samp;
} }
// create a B/W image from a char_sample /**
* create a B/W image from a char_sample
*/
Pix *CubeUtils::PixFromCharSample(CharSamp *char_samp) { Pix *CubeUtils::PixFromCharSample(CharSamp *char_samp) {
// parameter check // parameter check
if (char_samp == NULL) { if (char_samp == NULL) {
@ -137,7 +151,9 @@ Pix *CubeUtils::PixFromCharSample(CharSamp *char_samp) {
return pix; return pix;
} }
// creates a raw buffer from the specified location of the pix /**
* creates a raw buffer from the specified location of the pix
*/
unsigned char *CubeUtils::GetImageData(Pix *pix, int left, int top, unsigned char *CubeUtils::GetImageData(Pix *pix, int left, int top,
int wid, int hgt) { int wid, int hgt) {
// skip invalid dimensions // skip invalid dimensions
@ -173,7 +189,9 @@ unsigned char *CubeUtils::GetImageData(Pix *pix, int left, int top,
return temp_buff; return temp_buff;
} }
// read file contents to a string /**
* read file contents to a string
*/
bool CubeUtils::ReadFileToString(const string &file_name, string *str) { bool CubeUtils::ReadFileToString(const string &file_name, string *str) {
str->clear(); str->clear();
FILE *fp = fopen(file_name.c_str(), "rb"); FILE *fp = fopen(file_name.c_str(), "rb");
@ -206,7 +224,9 @@ bool CubeUtils::ReadFileToString(const string &file_name, string *str) {
return (read_bytes == file_size); return (read_bytes == file_size);
} }
// splits a string into vectors based on specified delimiters /**
* splits a string into vectors based on specified delimiters
*/
void CubeUtils::SplitStringUsing(const string &str, void CubeUtils::SplitStringUsing(const string &str,
const string &delims, const string &delims,
vector<string> *str_vec) { vector<string> *str_vec) {
@ -240,7 +260,9 @@ void CubeUtils::SplitStringUsing(const string &str,
} }
} }
// UTF-8 to UTF-32 conversion functions /**
* UTF-8 to UTF-32 conversion functions
*/
void CubeUtils::UTF8ToUTF32(const char *utf8_str, string_32 *str32) { void CubeUtils::UTF8ToUTF32(const char *utf8_str, string_32 *str32) {
str32->clear(); str32->clear();
int len = strlen(utf8_str); int len = strlen(utf8_str);
@ -254,7 +276,9 @@ void CubeUtils::UTF8ToUTF32(const char *utf8_str, string_32 *str32) {
} }
} }
// UTF-32 to UTF-8 conversion functions /**
* UTF-32 to UTF-8 conversion functions
*/
void CubeUtils::UTF32ToUTF8(const char_32 *utf32_str, string *str) { void CubeUtils::UTF32ToUTF8(const char_32 *utf32_str, string *str) {
str->clear(); str->clear();
for (const char_32 *ch_32 = utf32_str; (*ch_32) != 0; ch_32++) { for (const char_32 *ch_32 = utf32_str; (*ch_32) != 0; ch_32++) {

View File

@ -37,7 +37,9 @@ WordAltList::~WordAltList() {
} }
} }
// insert an alternate word with the specified cost and tag /**
* insert an alternate word with the specified cost and tag
*/
bool WordAltList::Insert(char_32 *word_str, int cost, void *tag) { bool WordAltList::Insert(char_32 *word_str, int cost, void *tag) {
if (word_alt_ == NULL || alt_cost_ == NULL) { if (word_alt_ == NULL || alt_cost_ == NULL) {
word_alt_ = new char_32*[max_alt_]; word_alt_ = new char_32*[max_alt_];
@ -84,7 +86,9 @@ bool WordAltList::Insert(char_32 *word_str, int cost, void *tag) {
return true; return true;
} }
// sort the alternates in descending order based on the cost /**
* sort the alternates in descending order based on the cost
*/
void WordAltList::Sort() { void WordAltList::Sort() {
for (int alt_idx = 0; alt_idx < alt_cnt_; alt_idx++) { for (int alt_idx = 0; alt_idx < alt_cnt_; alt_idx++) {
for (int alt = alt_idx + 1; alt < alt_cnt_; alt++) { for (int alt = alt_idx + 1; alt < alt_cnt_; alt++) {

View File

@ -50,8 +50,10 @@ WordUnigrams::~WordUnigrams() {
} }
} }
// Load the word-list and unigrams from file and create an object /**
// The word list is assumed to be sorted in lexicographic order. * Load the word-list and unigrams from file and create an object
* The word list is assumed to be sorted in lexicographic order.
*/
WordUnigrams *WordUnigrams::Create(const string &data_file_path, WordUnigrams *WordUnigrams::Create(const string &data_file_path,
const string &lang) { const string &lang) {
string file_name; string file_name;
@ -143,10 +145,12 @@ WordUnigrams *WordUnigrams::Create(const string &data_file_path,
return word_unigrams_obj; return word_unigrams_obj;
} }
// Split input into space-separated tokens, strip trailing punctuation /**
// from each, determine case properties, call UTF-8 flavor of cost * Split input into space-separated tokens, strip trailing punctuation
// function on each word, and aggregate all into single mean word * from each, determine case properties, call UTF-8 flavor of cost
// cost. * function on each word, and aggregate all into single mean word
* cost.
*/
int WordUnigrams::Cost(const char_32 *key_str32, int WordUnigrams::Cost(const char_32 *key_str32,
LangModel *lang_mod, LangModel *lang_mod,
CharSet *char_set) const { CharSet *char_set) const {
@ -239,7 +243,9 @@ int WordUnigrams::Cost(const char_32 *key_str32,
return static_cast<int>(cost / static_cast<double>(words.size())); return static_cast<int>(cost / static_cast<double>(words.size()));
} }
// Search for UTF-8 string using binary search of sorted words_ array. /**
* Search for UTF-8 string using binary search of sorted words_ array.
*/
int WordUnigrams::CostInternal(const char *key_str) const { int WordUnigrams::CostInternal(const char *key_str) const {
if (strlen(key_str) == 0) if (strlen(key_str) == 0)
return not_in_list_cost_; return not_in_list_cost_;