/********************************************************************** * File: tessedit.cpp (Formerly tessedit.c) * Description: Main program for merge of tess and editor. * Author: Ray Smith * Created: Tue Jan 07 15:21:46 GMT 1992 * * (C) Copyright 1992, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at ** http://www.apache.org/licenses/LICENSE-2.0 ** Unless required by applicable law or agreed to in writing, software ** distributed under the License is distributed on an "AS IS" BASIS, ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** See the License for the specific language governing permissions and ** limitations under the License. * **********************************************************************/ #include "mfcpch.h" //#include //#include //#include //#include #include "tfacep.h" //must be before main.h //#include "fileerr.h" #include "stderr.h" #include "basedir.h" #include "tessvars.h" //#include "debgwin.h" //#include "epapdest.h" #include "control.h" #include "imgs.h" #include "reject.h" #include "pageres.h" //#include "gpapdest.h" #include "nwmain.h" #include "pgedit.h" #include "tprintf.h" //#include "ipeerr.h" //#include "restart.h" #include "tessedit.h" //#include "fontfind.h" #include "permute.h" #include "stopper.h" #include "intmatcher.h" #include "chop.h" #include "efio.h" #include "danerror.h" #include "globals.h" #include "tesseractclass.h" #include "params.h" #include "notdll.h" //phils nn stuff #define VARDIR "configs/" /*variables files */ //config under api #define API_CONFIG "configs/api_config" ETEXT_DESC *global_monitor = NULL; // progress monitor namespace tesseract { // Read a "config" file containing a set of variable, value pairs. // Searches the standard places: tessdata/configs, tessdata/tessconfigs // and also accepts a relative or absolute path name. void Tesseract::read_config_file(const char *filename, bool init_only) { STRING path = datadir; path += "configs/"; path += filename; FILE* fp; if ((fp = fopen(path.string(), "r")) != NULL) { fclose(fp); } else { path = datadir; path += "tessconfigs/"; path += filename; if ((fp = fopen(path.string(), "r")) != NULL) { fclose(fp); } else { path = filename; } } ParamUtils::ReadParamsFile(path.string(), init_only, this->params()); } // Returns false if a unicharset file for the specified language was not found // or was invalid. // This function initializes TessdataManager. After TessdataManager is // no longer needed, TessdataManager::End() should be called. // // This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless // it is OEM_DEFAULT, in which case the value of the variable will be obtained // from the language-specific config file (stored in [lang].traineddata), from // the config files specified on the command line or left as the default // OEM_TESSERACT_ONLY if none of the configs specify this variable. bool Tesseract::init_tesseract_lang_data( const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, bool configs_init_only) { // Set the basename, compute the data directory. main_setup(arg0, textbase); // Set the language data path prefix lang = language != NULL ? language : "eng"; language_data_path_prefix = datadir; language_data_path_prefix += lang; language_data_path_prefix += "."; // Initialize TessdataManager. STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix; tessdata_manager.Init(tessdata_path.string(), tessdata_manager_debug_level); // If a language specific config file (lang.config) exists, load it in. if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) { ParamUtils::ReadParamsFromFp( tessdata_manager.GetDataFilePtr(), tessdata_manager.GetEndOffset(TESSDATA_LANG_CONFIG), false, this->params()); if (tessdata_manager_debug_level) { tprintf("Loaded language config file\n"); } } // Load tesseract variables from config files. This is done after loading // language-specific variables from [lang].traineddata file, so that custom // config files can override values in [lang].traineddata file. for (int i = 0; i < configs_size; ++i) { read_config_file(configs[i], configs_init_only); } if (((STRING &)tessedit_write_params_to_file).length() > 0) { FILE *params_file = fopen(tessedit_write_params_to_file.string(), "w"); if (params_file != NULL) { ParamUtils::PrintParams(params_file, this->params()); fclose(params_file); if (tessdata_manager_debug_level > 0) { tprintf("Wrote parameters to %s\n", tessedit_write_params_to_file.string()); } } else { tprintf("Failed to open %s for writing params.\n", tessedit_write_params_to_file.string()); } } // Determine which ocr engine(s) should be loaded and used for recognition. if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem); if (tessdata_manager_debug_level) { tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n", static_cast(tessedit_ocr_engine_mode)); } // Load the unicharset if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) || !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) { return false; } if (unicharset.size() > MAX_NUM_CLASSES) { tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n"); return false; } right_to_left_ = unicharset.any_right_to_left(); if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n"); if (!tessedit_ambigs_training && tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) { unichar_ambigs.LoadUnicharAmbigs( tessdata_manager.GetDataFilePtr(), tessdata_manager.GetEndOffset(TESSDATA_AMBIGS), ambigs_debug_level, use_ambigs_for_adaption, &unicharset); if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n"); } // Load Cube objects if necessary. if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) { ASSERT_HOST(init_cube_objects(false, &tessdata_manager)); if (tessdata_manager_debug_level) tprintf("Loaded Cube w/out combiner\n"); } else if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) { ASSERT_HOST(init_cube_objects(true, &tessdata_manager)); if (tessdata_manager_debug_level) tprintf("Loaded Cube with combiner\n"); } return true; } int Tesseract::init_tesseract( const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, bool configs_init_only) { if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs, configs_size, configs_init_only)) { return -1; } // If only Cube will be used, skip loading Tesseract classifier's // pre-trained templates. bool init_tesseract_classifier = (tessedit_ocr_engine_mode == OEM_TESSERACT_ONLY || tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED); // If only Cube will be used and if it has its own Unicharset, // skip initializing permuter and loading Tesseract Dawgs. bool init_dict = !(tessedit_ocr_engine_mode == OEM_CUBE_ONLY && tessdata_manager.SeekToStart(TESSDATA_CUBE_UNICHARSET)); program_editup(textbase, init_tesseract_classifier, init_dict); tessdata_manager.End(); return 0; //Normal exit } // init the LM component int Tesseract::init_tesseract_lm(const char *arg0, const char *textbase, const char *language) { if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY, NULL, 0, false)) return -1; getDict().Load(); tessdata_manager.End(); return 0; } void Tesseract::end_tesseract() { end_recog(); } /* Define command type identifiers */ enum CMD_EVENTS { ACTION_1_CMD_EVENT, RECOG_WERDS, RECOG_PSEUDO, ACTION_2_CMD_EVENT }; } // namespace tesseract