mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-24 11:09:06 +08:00
3a998fe7ac
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@651 d0cd1f9f-072b-0410-8dd7-cf729c803f20
203 lines
6.6 KiB
C++
203 lines
6.6 KiB
C++
/**********************************************************************
 * File:        cube_reco_context.cpp
 * Description: Implementation of the Cube Recognition Context Class
 * Author:      Ahmad Abdulkader
 * Created:     2007
 *
 * (C) Copyright 2008, Google Inc.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/
|
|
|
|
#include <string>
|
|
#include <limits.h>
|
|
|
|
#include "cube_reco_context.h"
|
|
|
|
#include "classifier_factory.h"
|
|
#include "cube_tuning_params.h"
|
|
#include "dict.h"
|
|
#include "feature_bmp.h"
|
|
#include "tessdatamanager.h"
|
|
#include "tesseractclass.h"
|
|
#include "tess_lang_model.h"
|
|
|
|
namespace tesseract {
|
|
|
|
// Instantiate a CubeRecoContext object using a Tesseract object.
|
|
// CubeRecoContext will not take ownership of tess_obj, but will
|
|
// record the pointer to it and will make use of various Tesseract
|
|
// components (language model, flags, etc). Thus the caller should
|
|
// keep tess_obj alive so long as the instantiated CubeRecoContext is used.
|
|
CubeRecoContext::CubeRecoContext(Tesseract *tess_obj) {
|
|
tess_obj_ = tess_obj;
|
|
lang_ = "";
|
|
loaded_ = false;
|
|
lang_mod_ = NULL;
|
|
params_ = NULL;
|
|
char_classifier_ = NULL;
|
|
char_set_ = NULL;
|
|
word_size_model_ = NULL;
|
|
char_bigrams_ = NULL;
|
|
word_unigrams_ = NULL;
|
|
noisy_input_ = false;
|
|
size_normalization_ = false;
|
|
}
|
|
|
|
// Releases the components held by this context, in the order: char
// classifier, word size model, char set, char bigrams, word unigrams,
// language model, tuning params. Each pointer is reset to NULL right after
// its object is deleted. The Tesseract object recorded at construction is
// not deleted here.
CubeRecoContext::~CubeRecoContext() {
  // Applying delete to a NULL pointer is a no-op, so components that were
  // never created need no explicit guard.
  delete char_classifier_;
  char_classifier_ = NULL;

  delete word_size_model_;
  word_size_model_ = NULL;

  delete char_set_;
  char_set_ = NULL;

  delete char_bigrams_;
  char_bigrams_ = NULL;

  delete word_unigrams_;
  word_unigrams_ = NULL;

  delete lang_mod_;
  lang_mod_ = NULL;

  delete params_;
  params_ = NULL;
}
|
|
|
|
// Returns the path of the data files by looking up the TESSDATA_PREFIX
|
|
// environment variable and appending a "tessdata" directory to it
|
|
bool CubeRecoContext::GetDataFilePath(string *path) const {
|
|
*path = tess_obj_->datadir.string();
|
|
return true;
|
|
}
|
|
|
|
// The object initialization function that loads all the necessary
|
|
// components of a RecoContext. TessdataManager is used to load the
|
|
// data from [lang].traineddata file. If TESSDATA_CUBE_UNICHARSET
|
|
// component is present, Cube will be instantiated with the unicharset
|
|
// specified in this component and the corresponding dictionary
|
|
// (TESSDATA_CUBE_SYSTEM_DAWG), and will map Cube's unicharset to
|
|
// Tesseract's. Otherwise, TessdataManager will assume that Cube will
|
|
// be using Tesseract's unicharset and dawgs, and will load the
|
|
// unicharset from the TESSDATA_UNICHARSET component and will load the
|
|
// dawgs from TESSDATA_*_DAWG components.
|
|
bool CubeRecoContext::Load(TessdataManager *tessdata_manager,
|
|
UNICHARSET *tess_unicharset) {
|
|
ASSERT_HOST(tess_obj_ != NULL);
|
|
tess_unicharset_ = tess_unicharset;
|
|
string data_file_path;
|
|
|
|
// Get the data file path.
|
|
if (GetDataFilePath(&data_file_path) == false) {
|
|
fprintf(stderr, "Unable to get data file path\n");
|
|
return false;
|
|
}
|
|
|
|
// Get the language from the Tesseract object.
|
|
lang_ = tess_obj_->lang.string();
|
|
|
|
// Create the char set.
|
|
if ((char_set_ =
|
|
CharSet::Create(tessdata_manager, tess_unicharset)) == NULL) {
|
|
fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to load "
|
|
"CharSet\n");
|
|
return false;
|
|
}
|
|
// Create the language model.
|
|
string lm_file_name = data_file_path + lang_ + ".cube.lm";
|
|
string lm_params;
|
|
if (!CubeUtils::ReadFileToString(lm_file_name, &lm_params)) {
|
|
fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to read cube "
|
|
"language model params from %s\n", lm_file_name.c_str());
|
|
return false;
|
|
}
|
|
lang_mod_ = new TessLangModel(lm_params, data_file_path,
|
|
tess_obj_->getDict().load_system_dawg,
|
|
tessdata_manager, this);
|
|
if (lang_mod_ == NULL) {
|
|
fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to create "
|
|
"TessLangModel\n");
|
|
return false;
|
|
}
|
|
|
|
// Create the optional char bigrams object.
|
|
char_bigrams_ = CharBigrams::Create(data_file_path, lang_);
|
|
|
|
// Create the optional word unigrams object.
|
|
word_unigrams_ = WordUnigrams::Create(data_file_path, lang_);
|
|
|
|
// Create the optional size model.
|
|
word_size_model_ = WordSizeModel::Create(data_file_path, lang_,
|
|
char_set_, Contextual());
|
|
|
|
// Load tuning params.
|
|
params_ = CubeTuningParams::Create(data_file_path, lang_);
|
|
if (params_ == NULL) {
|
|
fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to read "
|
|
"CubeTuningParams from %s\n", data_file_path.c_str());
|
|
return false;
|
|
}
|
|
|
|
// Create the char classifier.
|
|
char_classifier_ = CharClassifierFactory::Create(data_file_path, lang_,
|
|
lang_mod_, char_set_,
|
|
params_);
|
|
if (char_classifier_ == NULL) {
|
|
fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to load "
|
|
"CharClassifierFactory object from %s\n", data_file_path.c_str());
|
|
return false;
|
|
}
|
|
|
|
loaded_ = true;
|
|
|
|
return true;
|
|
}
|
|
|
|
// Creates a CubeRecoContext object using a tesseract object
|
|
CubeRecoContext * CubeRecoContext::Create(Tesseract *tess_obj,
|
|
TessdataManager *tessdata_manager,
|
|
UNICHARSET *tess_unicharset) {
|
|
// create the object
|
|
CubeRecoContext *cntxt = new CubeRecoContext(tess_obj);
|
|
if (cntxt == NULL) {
|
|
fprintf(stderr, "Cube ERROR (CubeRecoContext::Create): unable to create "
|
|
"CubeRecoContext object\n");
|
|
return NULL;
|
|
}
|
|
// load the necessary components
|
|
if (cntxt->Load(tessdata_manager, tess_unicharset) == false) {
|
|
fprintf(stderr, "Cube ERROR (CubeRecoContext::Create): unable to init "
|
|
"CubeRecoContext object\n");
|
|
delete cntxt;
|
|
return NULL;
|
|
}
|
|
// success
|
|
return cntxt;
|
|
}
|
|
}  // namespace tesseract
|