tesseract/ccmain/tessedit.cpp
2009-08-20 22:30:21 +00:00

268 lines
8.8 KiB
C++

/**********************************************************************
* File: tessedit.cpp (Formerly tessedit.c)
* Description: Main program for merge of tess and editor.
* Author: Ray Smith
* Created: Tue Jan 07 15:21:46 GMT 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "mfcpch.h"
//#include <osfcn.h>
//#include <signal.h>
//#include <time.h>
//#include <unistd.h>
#include "tfacep.h" //must be before main.h
//#include "fileerr.h"
#include "stderr.h"
#include "basedir.h"
#include "tessvars.h"
//#include "debgwin.h"
//#include "epapdest.h"
#include "control.h"
#include "imgs.h"
#include "reject.h"
#include "pageres.h"
//#include "gpapdest.h"
#include "mainblk.h"
#include "nwmain.h"
#include "pgedit.h"
#include "ocrshell.h"
#include "tprintf.h"
//#include "ipeerr.h"
//#include "restart.h"
#include "tessedit.h"
//#include "fontfind.h"
#include "permute.h"
#include "permdawg.h"
#include "stopper.h"
#include "adaptmatch.h"
#include "intmatcher.h"
#include "chop.h"
#include "efio.h"
#include "danerror.h"
#include "globals.h"
#include "tesseractclass.h"
#include "varable.h"
/*
** Include automatically generated configuration file if running autoconf
*/
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
// Includes libtiff if HAVE_LIBTIFF is defined
#ifdef HAVE_LIBTIFF
#include "tiffio.h"
#endif
#include "notdll.h" //phils nn stuff
#define VARDIR "configs/" /*variables files */
//config under api
#define API_CONFIG "configs/api_config"
#define EXTERN
EXTERN BOOL_EVAR (tessedit_write_vars, FALSE, "Write all vars to file");
ETEXT_DESC *global_monitor = NULL; // progress monitor
namespace tesseract {
// Read a "config" file containing a set of variable, value pairs.
// Searches the standard places: tessdata/configs, tessdata/tessconfigs
// and also accepts a relative or absolute path name.
void Tesseract::read_config_file(const char *filename, bool global_only) {
STRING path = datadir;
path += "configs/";
path += filename;
FILE* fp;
if ((fp = fopen(path.string(), "r")) != NULL) {
fclose(fp);
} else {
path = datadir;
path += "tessconfigs/";
path += filename;
if ((fp = fopen(path.string(), "r")) != NULL) {
fclose(fp);
} else {
path = filename;
}
}
read_variables_file(path.string(), global_only);
}
// Returns false if a unicharset file for the specified language was not found
// or was invalid.
// This function initializes TessdataManager. After TessdataManager is
// no longer needed, TessdataManager::End() should be called.
bool Tesseract::init_tesseract_lang_data(
const char *arg0, const char *textbase, const char *language,
char **configs, int configs_size, bool configs_global_only) {
FILE *var_file;
static char c_path[MAX_PATH]; //path for c code
// Set the basename, compute the data directory.
main_setup(arg0, textbase);
debug_window_on.set_value (FALSE);
if (tessedit_write_vars) {
var_file = fopen ("edited.cfg", "w");
if (var_file != NULL) {
print_variables(var_file);
fclose(var_file);
}
}
strcpy (c_path, datadir.string());
c_path[strlen (c_path) - strlen (m_data_sub_dir.string ())] = '\0';
demodir = c_path;
// Set the language data path prefix
lang = language != NULL ? language : "eng";
language_data_path_prefix = datadir;
language_data_path_prefix += lang;
language_data_path_prefix += ".";
// Load tesseract variables from config files.
for (int i = 0; i < configs_size; ++i) {
read_config_file(configs[i], configs_global_only);
}
// Initialize TessdataManager.
STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
tessdata_manager.Init(tessdata_path.string());
// If a language specific config file (lang.config) exists, load it in.
if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) {
read_variables_from_fp(tessdata_manager.GetDataFilePtr(),
tessdata_manager.GetEndOffset(TESSDATA_LANG_CONFIG),
false);
if (global_tessdata_manager_debug_level) {
tprintf("Loaded language config file\n");
}
}
// Load the unicharset
if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) ||
!unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) {
return false;
}
if (unicharset.size() > MAX_NUM_CLASSES) {
tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
return false;
}
if (global_tessdata_manager_debug_level) tprintf("Loaded unicharset\n");
if (!global_tessedit_ambigs_training &&
tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) {
unichar_ambigs.LoadUnicharAmbigs(
tessdata_manager.GetDataFilePtr(),
tessdata_manager.GetEndOffset(TESSDATA_AMBIGS),
&unicharset);
if (global_tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
}
return true;
}
int Tesseract::init_tesseract(
const char *arg0, const char *textbase, const char *language,
char **configs, int configs_size, bool configs_global_only) {
if (!init_tesseract_lang_data(arg0, textbase, language, configs,
configs_size, configs_global_only)) {
return -1;
}
start_recog(textbase);
tessdata_manager.End();
return 0; //Normal exit
}
// Init everything except the language model
int Tesseract::init_tesseract_classifier(
const char *arg0, const char *textbase, const char *language,
char **configs, int configs_size, bool configs_global_only) {
if (!init_tesseract_lang_data (arg0, textbase, language, configs,
configs_size, configs_global_only)) {
return -1;
}
// Dont initialize the permuter.
program_editup(textbase, false);
tessdata_manager.End();
return 0;
}
// init the LM component
int Tesseract::init_tesseract_lm(const char *arg0,
const char *textbase,
const char *language) {
init_tesseract_lang_data(arg0, textbase, language, NULL, 0, false);
getDict().init_permute();
tessdata_manager.End();
return 0;
}
void Tesseract::end_tesseract() {
end_recog();
}
/* Define command type identifiers */
enum CMD_EVENTS
{
ACTION_1_CMD_EVENT,
RECOG_WERDS,
RECOG_PSEUDO,
ACTION_2_CMD_EVENT
};
} // namespace tesseract
#ifdef _TIFFIO_
void read_tiff_image(TIFF* tif, IMAGE* image) {
tdata_t buf;
uint32 image_width, image_height;
uint16 photometric;
inT16 bpp;
inT16 samples_per_pixel = 0;
TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &image_width);
TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &image_height);
if (!TIFFGetField(tif, TIFFTAG_BITSPERSAMPLE, &bpp))
bpp = 1; // Binary is default if no value provided.
TIFFGetField(tif, TIFFTAG_SAMPLESPERPIXEL, &samples_per_pixel);
TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &photometric);
if (samples_per_pixel > 1)
bpp *= samples_per_pixel;
// Tesseract's internal representation is 0-is-black,
// so if the photometric is 1 (min is black) then high-valued pixels
// are 1 (white), otherwise they are 0 (black).
uinT8 high_value = photometric == 1;
image->create(image_width, image_height, bpp);
IMAGELINE line;
line.init(image_width);
buf = _TIFFmalloc(TIFFScanlineSize(tif));
int bytes_per_line = (image_width*bpp + 7)/8;
uinT8* dest_buf = image->get_buffer();
// This will go badly wrong with one of the more exotic tiff formats,
// but the majority will work OK.
for (int y = 0; y < image_height; ++y) {
TIFFReadScanline(tif, buf, y);
memcpy(dest_buf, buf, bytes_per_line);
dest_buf += bytes_per_line;
}
if (high_value == 0)
invert_image(image);
_TIFFfree(buf);
}
#endif