tesseract/cube/word_list_lang_model.cpp

/**********************************************************************
 * File:        word_list_lang_model.cpp
 * Description: Implementation of the Word List Language Model Class
 * Author:    Ahmad Abdulkader
 * Created:   2008
 *
 * (C) Copyright 2008, Google Inc.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include <string>
#include <vector>
#include "word_list_lang_model.h"
#include "cube_utils.h"

#include "ratngs.h"
#include "trie.h"

namespace tesseract {
// Binds the language model to the recognition context |cntxt|.
// No trie is allocated here: dawg_ starts out NULL and init_ false, and the
// trie is only created once Init() runs.
WordListLangModel::WordListLangModel(CubeRecoContext *cntxt) {
  init_ = false;
  dawg_ = NULL;
  cntxt_ = cntxt;
}

// Destroys the model; all owned state is released by Cleanup().
WordListLangModel::~WordListLangModel() {
  this->Cleanup();
}

// Releases the word-list trie and puts the model back into its
// uninitialized state (dawg_ == NULL, init_ == false).
void WordListLangModel::Cleanup() {
  // Deleting a NULL pointer is a no-op, so no explicit guard is required.
  delete dawg_;
  dawg_ = NULL;
  init_ = false;
}

// Initializes the language model by creating the word-list trie.
// Does nothing when the model is already initialized.
// Returns true if the model is initialized on exit, false otherwise.
bool WordListLangModel::Init() {
  if (!init_) {
    // The last parameter to the Trie constructor (the debug level) is set to
    // false for now, until Cube has a way to express its preferred debug
    // level.
    dawg_ = new Trie(DAWG_TYPE_WORD, "", NO_PERM,
                     cntxt_->CharacterSet()->ClassCount(), false);
    // Only flag the model as ready when a trie object actually exists.
    init_ = (dawg_ != NULL);
  }
  return init_;
}

// Returns a pointer to the root edge of the language model.
// This implementation always returns NULL; GetEdges() accepts a NULL edge
// and then starts from edge reference 0.
LangModEdge * WordListLangModel::Root() {
  LangModEdge *const no_root_edge = NULL;
  return no_root_edge;
}

// Returns the edges emerging from the state reached through |edge|.
//   alt_list: not consulted by this language model.
//   edge:     the edge to expand. NULL starts the expansion at edge
//             reference 0.
//   edge_cnt: out-parameter receiving the number of edges stored in the
//             returned array. It is zeroed before any early return, so it is
//             always 0 whenever NULL is returned.
// Returns a new[]-allocated array with room for kMaxEdge edge pointers, or
// NULL when the model cannot be initialized or next_node() yields 0 for
// |edge|.
// NOTE(review): the returned array and the edges in it are presumably owned
// by the caller -- confirm against the callers of GetEdges().
LangModEdge **WordListLangModel::GetEdges(CharAltList *alt_list,
                                          LangModEdge *edge,
                                          int *edge_cnt) {
  // Reset the count up front so that every return path, including a failed
  // lazy initialization, leaves the out-parameter well defined.
  (*edge_cnt) = 0;

  // initialize if necessary
  if (!init_ && !Init()) {
    return NULL;
  }

  EDGE_REF edge_ref = 0;
  TessLangModEdge *tess_lm_edge = reinterpret_cast<TessLangModEdge *>(edge);
  if (tess_lm_edge != NULL) {
    // advance to the node that follows the end of the given edge
    edge_ref = dawg_->next_node(tess_lm_edge->EndEdge());
    if (edge_ref == 0) {
      return NULL;
    }
  }

  // allocate memory for edges
  LangModEdge **edge_array = new LangModEdge *[kMaxEdge];
  if (edge_array == NULL) {
    return NULL;
  }

  // now get all the emerging edges
  (*edge_cnt) = TessLangModEdge::CreateChildren(cntxt_, dawg_, edge_ref,
                                                edge_array);

  return edge_array;
}

// Intended to return true if the char_32 sequence is supported by the
// language model.
// TODO(ahmadab) currently not implemented: the arguments are ignored and the
// result is always false.
bool WordListLangModel::IsValidSequence(const char_32 *sequence,
                                        bool terminal, LangModEdge **edges) {
  // Explicitly mark the parameters as unused until this is implemented.
  (void)sequence;
  (void)terminal;
  (void)edges;
  return false;
}

// Recursive helper function for WordVariants().
void WordListLangModel::WordVariants(const CharSet &char_set,
                                     string_32 prefix_str32,
                                     WERD_CHOICE *word_so_far,
                                     string_32 str32,
                                     vector<WERD_CHOICE *> *word_variants) {
  int str_len = str32.length();
  if (str_len == 0) {
    if (word_so_far->length() > 0) {
      word_variants->push_back(new WERD_CHOICE(*word_so_far));
    }
  } else {
    // Try out all the possible prefixes of the str32.
    for (int len = 1; len <= str_len; len++) {
      // Check if prefix is supported in character set.
      string_32 str_pref32 = str32.substr(0, len);
      int class_id = char_set.ClassID(reinterpret_cast<const char_32 *>(
          str_pref32.c_str()));
      if (class_id <= 0) {
        continue;
      } else {
        string_32 new_prefix_str32 = prefix_str32 + str_pref32;
        string_32 new_str32 = str32.substr(len);
        word_so_far->append_unichar_id(class_id, 1, 0.0, 0.0);
        WordVariants(char_set, new_prefix_str32, word_so_far, new_str32,
                     word_variants);
        word_so_far->remove_last_unichar_id();
      }
    }
  }
}

// Compute all the variants of a 32-bit string in terms of the class-ids
// This is needed for languages that have ligatures. A word can then have more
// than one spelling in terms of the class-ids
void WordListLangModel::WordVariants(const CharSet &char_set,
                                     const UNICHARSET *uchset, string_32 str32,
                                     vector<WERD_CHOICE *> *word_variants) {
  for (int i = 0; i < word_variants->size(); i++) {
    delete (*word_variants)[i];
  }
  word_variants->clear();
  string_32 prefix_str32;
  WERD_CHOICE word_so_far(uchset);
  WordVariants(char_set, prefix_str32, &word_so_far, str32, word_variants);
}

// Adds a new UTF-8 string to the language model.
//   char_ptr: the UTF-8 string to add; NULL is rejected, matching the NULL
//             handling of AddString32().
// Returns false if |char_ptr| is NULL, if the model cannot be initialized, or
// if the UTF-32 conversion yields an empty string. Otherwise returns the
// result of AddString32() on the converted string.
bool WordListLangModel::AddString(const char *char_ptr) {
  // Never hand a NULL pointer to the UTF-8 converter.
  if (char_ptr == NULL) {
    return false;
  }
  if (!init_ && !Init()) {  // initialize if necessary
    return false;
  }

  string_32 str32;
  CubeUtils::UTF8ToUTF32(char_ptr, &str32);
  if (str32.empty()) {
    return false;
  }
  return AddString32(str32.c_str());
}

// add a new UTF-32 string to the lang model
bool WordListLangModel::AddString32(const char_32 *char_32_ptr) {
  if (char_32_ptr == NULL) {
    return false;
  }
  // get all the word variants
  vector<WERD_CHOICE *> word_variants;
  WordVariants(*(cntxt_->CharacterSet()), cntxt_->TessUnicharset(),
               char_32_ptr, &word_variants);

  if (word_variants.size() > 0) {
    // find the shortest variant
    int shortest_word = 0;
    for (int word = 1; word < word_variants.size(); word++) {
      if (word_variants[shortest_word]->length() >
          word_variants[word]->length()) {
        shortest_word = word;
      }
    }
    // only add the shortest grapheme interpretation of string to the word list
    dawg_->add_word_to_dawg(*word_variants[shortest_word]);
  }
  for (int i = 0; i < word_variants.size(); i++) { delete word_variants[i]; }
  return true;
}

}
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`/**********************************************************************`
			`* File: word_list_lang_model.cpp`
			`* Description: Implementation of the Word List Language Model Class`
			`* Author: Ahmad Abdulkader`
			`* Created: 2008`
			`*`
			`* (C) Copyright 2008, Google Inc.`
			`** Licensed under the Apache License, Version 2.0 (the "License");`
			`** you may not use this file except in compliance with the License.`
			`** You may obtain a copy of the License at`
			`** http://www.apache.org/licenses/LICENSE-2.0`
			`** Unless required by applicable law or agreed to in writing, software`
			`** distributed under the License is distributed on an "AS IS" BASIS,`
			`** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`** See the License for the specific language governing permissions and`
			`** limitations under the License.`
			`*`
			`**********************************************************************/`

			`#include <string>`
			`#include <vector>`
			`#include "word_list_lang_model.h"`
			`#include "cube_utils.h"`

			`#include "ratngs.h"`
			`#include "trie.h"`

			`namespace tesseract {`
			`WordListLangModel::WordListLangModel(CubeRecoContext *cntxt) {`
			`cntxt_ = cntxt;`
			`dawg_ = NULL;`
			`init_ = false;`
			`}`

			`WordListLangModel::~WordListLangModel() {`
			`Cleanup();`
			`}`

			`// Cleanup`
			`void WordListLangModel::Cleanup() {`
			`if (dawg_ != NULL) {`
			`delete dawg_;`
			`dawg_ = NULL;`
			`}`
			`init_ = false;`
			`}`

			`// Initialize the language model`
			`bool WordListLangModel::Init() {`
			`if (init_ == true) {`
			`return true;`
			`}`
			`// The last parameter to the Trie constructor (the debug level) is set to`
			`// false for now, until Cube has a way to express its preferred debug level.`
			`dawg_ = new Trie(DAWG_TYPE_WORD, "", NO_PERM,`
			`cntxt_->CharacterSet()->ClassCount(), false);`
			`if (dawg_ == NULL) {`
			`return false;`
			`}`
			`init_ = true;`
			`return true;`
			`}`

			`// return a pointer to the root`
			`LangModEdge * WordListLangModel::Root() {`
			`return NULL;`
			`}`

			`// return the edges emerging from the current state`
			`LangModEdge *WordListLangModel::GetEdges(CharAltList alt_list,`
			`LangModEdge *edge,`
			`int *edge_cnt) {`
			`// initialize if necessary`
			`if (init_ == false) {`
			`if (Init() == false) {`
Major refactor of beam search, elimination of dead code, misc bug fixes, updates to Makefile.am, Changelog etc. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@878 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2013-09-23 23:26:50 +08:00			`return NULL;`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`}`
			`}`

			`(*edge_cnt) = 0;`

			`EDGE_REF edge_ref;`

			`TessLangModEdge tess_lm_edge = reinterpret_cast<TessLangModEdge >(edge);`

			`if (tess_lm_edge == NULL) {`
			`edge_ref = 0;`
			`} else {`
			`edge_ref = tess_lm_edge->EndEdge();`

			`// advance node`
			`edge_ref = dawg_->next_node(edge_ref);`
			`if (edge_ref == 0) {`
Major refactor of beam search, elimination of dead code, misc bug fixes, updates to Makefile.am, Changelog etc. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@878 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2013-09-23 23:26:50 +08:00			`return NULL;`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`}`
			`}`

			`// allocate memory for edges`
			`LangModEdge *edge_array = new LangModEdge [kMaxEdge];`
			`if (edge_array == NULL) {`
			`return NULL;`
			`}`

			`// now get all the emerging edges`
			`(*edge_cnt) += TessLangModEdge::CreateChildren(cntxt_, dawg_, edge_ref,`
			`edge_array + (*edge_cnt));`

			`return edge_array;`
			`}`

			`// returns true if the char_32 is supported by the language model`
			`// TODO(ahmadab) currently not implemented`
			`bool WordListLangModel::IsValidSequence(const char_32 *sequence,`
			`bool terminal, LangModEdge **edges) {`
			`return false;`
			`}`

			`// Recursive helper function for WordVariants().`
			`void WordListLangModel::WordVariants(const CharSet &char_set,`
			`string_32 prefix_str32,`
			`WERD_CHOICE *word_so_far,`
			`string_32 str32,`
Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Refactored top-level word recognition module, Added simultaneous multi-language capability. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@654 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-02-02 11:03:56 +08:00			`vector<WERD_CHOICE > word_variants) {`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`int str_len = str32.length();`
			`if (str_len == 0) {`
			`if (word_so_far->length() > 0) {`
Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Refactored top-level word recognition module, Added simultaneous multi-language capability. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@654 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-02-02 11:03:56 +08:00			`word_variants->push_back(new WERD_CHOICE(*word_so_far));`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`}`
			`} else {`
			`// Try out all the possible prefixes of the str32.`
			`for (int len = 1; len <= str_len; len++) {`
			`// Check if prefix is supported in character set.`
			`string_32 str_pref32 = str32.substr(0, len);`
			`int class_id = char_set.ClassID(reinterpret_cast<const char_32 *>(`
			`str_pref32.c_str()));`
			`if (class_id <= 0) {`
			`continue;`
			`} else {`
			`string_32 new_prefix_str32 = prefix_str32 + str_pref32;`
			`string_32 new_str32 = str32.substr(len);`
			`word_so_far->append_unichar_id(class_id, 1, 0.0, 0.0);`
			`WordVariants(char_set, new_prefix_str32, word_so_far, new_str32,`
			`word_variants);`
			`word_so_far->remove_last_unichar_id();`
			`}`
			`}`
			`}`
			`}`

			`// Compute all the variants of a 32-bit string in terms of the class-ids`
			`// This is needed for languages that have ligatures. A word can then have more`
			`// than one spelling in terms of the class-ids`
Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Refactored top-level word recognition module, Added simultaneous multi-language capability. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@654 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-02-02 11:03:56 +08:00			`void WordListLangModel::WordVariants(const CharSet &char_set,`
			`const UNICHARSET *uchset, string_32 str32,`
			`vector<WERD_CHOICE > word_variants) {`
			`for (int i = 0; i < word_variants->size(); i++) {`
			`delete (*word_variants)[i];`
			`}`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`word_variants->clear();`
			`string_32 prefix_str32;`
Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Refactored top-level word recognition module, Added simultaneous multi-language capability. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@654 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-02-02 11:03:56 +08:00			`WERD_CHOICE word_so_far(uchset);`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`WordVariants(char_set, prefix_str32, &word_so_far, str32, word_variants);`
			`}`

			`// add a new UTF-8 string to the lang model`
			`bool WordListLangModel::AddString(const char *char_ptr) {`
			`if (!init_ && !Init()) { // initialize if necessary`
			`return false;`
			`}`

			`string_32 str32;`
			`CubeUtils::UTF8ToUTF32(char_ptr, &str32);`
			`if (str32.length() < 1) {`
			`return false;`
			`}`
			`return AddString32(str32.c_str());`
			`}`

			`// add a new UTF-32 string to the lang model`
			`bool WordListLangModel::AddString32(const char_32 *char_32_ptr) {`
			`if (char_32_ptr == NULL) {`
			`return false;`
			`}`
			`// get all the word variants`
Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Refactored top-level word recognition module, Added simultaneous multi-language capability. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@654 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-02-02 11:03:56 +08:00			`vector<WERD_CHOICE *> word_variants;`
			`WordVariants(*(cntxt_->CharacterSet()), cntxt_->TessUnicharset(),`
			`char_32_ptr, &word_variants);`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00
			`if (word_variants.size() > 0) {`
			`// find the shortest variant`
			`int shortest_word = 0;`
			`for (int word = 1; word < word_variants.size(); word++) {`
Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Refactored top-level word recognition module, Added simultaneous multi-language capability. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@654 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-02-02 11:03:56 +08:00			`if (word_variants[shortest_word]->length() >`
			`word_variants[word]->length()) {`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`shortest_word = word;`
			`}`
			`}`
			`// only add the shortest grapheme interpretation of string to the word list`
Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Refactored top-level word recognition module, Added simultaneous multi-language capability. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@654 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-02-02 11:03:56 +08:00			`dawg_->add_word_to_dawg(*word_variants[shortest_word]);`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`}`
Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic, Refactored top-level word recognition module, Added simultaneous multi-language capability. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@654 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2012-02-02 11:03:56 +08:00			`for (int i = 0; i < word_variants.size(); i++) { delete word_variants[i]; }`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`return true;`
			`}`

			`}`