tesseract/ccmain/tessedit.cpp
theraysmith dd18aea052 Added multi-page tiff capability
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@128 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2008-02-01 00:00:46 +00:00

249 lines
8.7 KiB
C++

/**********************************************************************
* File: tessedit.cpp (Formerly tessedit.c)
* Description: Main program for merge of tess and editor.
* Author: Ray Smith
* Created: Tue Jan 07 15:21:46 GMT 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/
#include "mfcpch.h"
//#include <osfcn.h>
//#include <signal.h>
//#include <time.h>
//#include <unistd.h>
#include "tfacep.h" //must be before main.h
//#include "fileerr.h"
#include "stderr.h"
#include "basedir.h"
#include "tessvars.h"
//#include "debgwin.h"
//#include "epapdest.h"
#include "control.h"
#include "imgs.h"
#include "reject.h"
#include "pageres.h"
//#include "gpapdest.h"
#include "mainblk.h"
#include "nwmain.h"
#include "pgedit.h"
#include "ocrshell.h"
#include "tprintf.h"
//#include "ipeerr.h"
//#include "restart.h"
#include "tessedit.h"
//#include "fontfind.h"
#include "permute.h"
#include "permdawg.h"
#include "permnum.h"
#include "stopper.h"
#include "adaptmatch.h"
#include "intmatcher.h"
#include "chop.h"
#include "efio.h"
#include "danerror.h"
#include "globals.h"
/*
** Include automatically generated configuration file if running autoconf
*/
#ifdef HAVE_CONFIG_H
#include "config_auto.h"
#endif
// Includes libtiff if HAVE_LIBTIFF is defined
#ifdef HAVE_LIBTIFF
#include "tiffio.h"
#endif
//extern "C" {
#include "callnet.h" //phils nn stuff
//}
#include "notdll.h" //phils nn stuff
// Relative paths of configuration files. Neither macro is referenced in the
// portion of this file visible here.
#define VARDIR "configs/" /*variables files */
//config under api
#define API_CONFIG "configs/api_config"
// EXTERN is defined empty, so it contributes nothing to the declarations
// that follow.
#define EXTERN
// Character black/white lists; init_tesseract() hands both strings to
// unicharset.set_black_and_whitelist() once the unicharset is loaded.
EXTERN STRING_VAR (tessedit_char_blacklist, "",
"Blacklist of chars not to recognize");
EXTERN STRING_VAR (tessedit_char_whitelist, "",
"Whitelist of chars to recognize");
// When set, init_tesseract() writes the variables to "edited.cfg" via
// print_variables().
EXTERN BOOL_EVAR (tessedit_write_vars, FALSE, "Write all vars to file");
// Master switch for the tweak_* values below: set_tess_tweak_vars() copies
// each of them into the variable of the same name (minus the "tweak_"
// prefix) only when this flag is set.
EXTERN BOOL_VAR (tessedit_tweaking_tess_vars, FALSE,
"Fiddle tess config values");
EXTERN double_VAR (tweak_garbage, 1.5, "Tess VAR");
EXTERN double_VAR (tweak_ok_word, 1.25, "Tess VAR");
EXTERN double_VAR (tweak_good_word, 1.1, "Tess VAR");
EXTERN double_VAR (tweak_freq_word, 1.0, "Tess VAR");
EXTERN double_VAR (tweak_ok_number, 1.4, "Tess VAR");
EXTERN double_VAR (tweak_good_number, 1.1, "Tess VAR");
EXTERN double_VAR (tweak_non_word, 1.25, "Tess VAR");
EXTERN double_VAR (tweak_CertaintyPerChar, -0.5, "Tess VAR");
EXTERN double_VAR (tweak_NonDictCertainty, -2.5, "Tess VAR");
EXTERN double_VAR (tweak_RejectCertaintyOffset, 1.0, "Tess VAR");
EXTERN double_VAR (tweak_GoodAdaptiveMatch, 0.125, "Tess VAR");
EXTERN double_VAR (tweak_GreatAdaptiveMatch, 0.10, "Tess VAR");
EXTERN INT_VAR (tweak_ReliableConfigThreshold, 2, "Tess VAR");
EXTERN INT_VAR (tweak_AdaptProtoThresh, 230, "Tess VAR");
EXTERN INT_VAR (tweak_AdaptFeatureThresh, 230, "Tess VAR");
EXTERN INT_VAR (tweak_min_outline_points, 6, "Tess VAR");
EXTERN INT_VAR (tweak_min_outline_area, 2000, "Tess VAR");
EXTERN double_VAR (tweak_good_split, 50.0, "Tess VAR");
EXTERN double_VAR (tweak_ok_split, 100.0, "Tess VAR");
// Declared here but defined in another translation unit; not referenced in
// the portion of this file visible here.
extern INT16 XOFFSET;
extern INT16 YOFFSET;
extern int NO_BLOCK;
//progress monitor
// Starts out NULL; nothing in the visible portion of this file assigns it.
ETEXT_DESC *global_monitor = NULL;
int init_tesseract(const char *arg0,
const char *textbase,
const char *language,
const char *configfile,
int configc,
const char *const *configv) {
FILE *var_file;
static char c_path[MAX_PATH]; //path for c code
// Set the basename, compute the data directory and read C++ configs.
main_setup(arg0, textbase, configc, configv);
debug_window_on.set_value (FALSE);
if (tessedit_write_vars) {
var_file = fopen ("edited.cfg", "w");
if (var_file != NULL) {
print_variables(var_file);
fclose(var_file);
}
}
strcpy (c_path, datadir.string ());
c_path[strlen (c_path) - strlen (m_data_sub_dir.string ())] = '\0';
demodir = c_path;
// Set the language data path prefix
language_data_path_prefix = datadir;
if (language != NULL)
language_data_path_prefix += language;
else
language_data_path_prefix += "eng";
language_data_path_prefix += ".";
// Load the unichar set
STRING unicharpath = language_data_path_prefix;
unicharpath += "unicharset";
if (!unicharset.load_from_file(unicharpath.string())) {
cprintf("Unable to load unicharset file %s\n", unicharpath.string());
exit(1);
}
if (unicharset.size() > MAX_NUM_CLASSES) {
cprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
exit(1);
}
// Set the white and blacklists (if any)
unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
tessedit_char_whitelist.string());
start_recog(configfile, textbase);
set_tess_tweak_vars();
if (tessedit_use_nn) //phils nn stuff
init_net();
return 0; //Normal exit
}
/**
 * Shut the recogniser down.
 *
 * Delegates entirely to end_recog(); no other work is done here.
 */
void end_tesseract()
{
  end_recog();
}
#ifdef _TIFFIO_
/**
 * Read the scanlines of an open TIFF into a Tesseract IMAGE.
 *
 * The image is (re)created with the width, height and bits-per-sample
 * reported by the TIFF tags, filled one scanline at a time, and inverted
 * unless the photometric interpretation is "min is black".
 *
 * @param tif   Open TIFF handle to read from.
 * @param image Destination; image->create() is called on it, so any
 *              previous contents are replaced.
 *
 * If the width or height tag is missing, or the scanline buffer cannot be
 * allocated, an error is printed and the function returns early. In the
 * first case image is left untouched; in the second it has been created
 * but not filled.
 */
void read_tiff_image(TIFF* tif, IMAGE* image) {
  uint32 image_width = 0;
  uint32 image_height = 0;
  // TIFFGetField() reports an absent tag by returning 0, so these two start
  // from a defined value instead of being read uninitialized.
  // BitsPerSample is optional in TIFF and defaults to 1.
  short bpp = 1;
  // TIFF defines no default for PhotometricInterpretation; fall back to
  // "min is white" so the result is at least deterministic.
  uint16 photometric = PHOTOMETRIC_MINISWHITE;

  if (!TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &image_width) ||
      !TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &image_height)) {
    cprintf("Error: TIFF image has no width or height tag\n");
    return;
  }
  TIFFGetField(tif, TIFFTAG_BITSPERSAMPLE, &bpp);
  TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &photometric);

  // Tesseract's internal representation is 0-is-black, so a "min is black"
  // TIFF can be copied as is; anything else is inverted after the copy.
  const bool min_is_black = photometric == PHOTOMETRIC_MINISBLACK;

  image->create(image_width, image_height, bpp);

  tdata_t buf = _TIFFmalloc(TIFFScanlineSize(tif));
  if (buf == NULL) {
    cprintf("Error: Unable to allocate TIFF scanline buffer\n");
    return;
  }

  const int bytes_per_line = (image_width*bpp + 7)/8;
  UINT8* dest_buf = image->get_buffer();
  // This will go badly wrong with one of the more exotic tiff formats,
  // but the majority will work OK.
  for (uint32 y = 0; y < image_height; ++y) {
    // NOTE(review): the return value is ignored, so a failed read copies
    // whatever buf currently holds. Left unchanged on purpose; confirm
    // whether damaged files should abort instead.
    TIFFReadScanline(tif, buf, y);
    memcpy(dest_buf, buf, bytes_per_line);
    dest_buf += bytes_per_line;
  }
  _TIFFfree(buf);

  if (!min_is_black)
    invert_image(image);
}
#endif
/*
 * Command type identifiers, numbered consecutively from zero.
 * The values are spelled out so the numbering is visible at a glance.
 */
enum CMD_EVENTS {
  ACTION_1_CMD_EVENT = 0,
  RECOG_WERDS = 1,
  RECOG_PSEUDO = 2,
  ACTION_2_CMD_EVENT = 3
};
/*************************************************************************
 * set_tess_tweak_vars()
 * Copy every tweak_* value into the Tess variable of the same name, but
 * only when tessedit_tweaking_tess_vars is set. This is only really of use
 * while searching the space of Tess configurations; at other times the
 * variables keep the values they already have.
 *
 *************************************************************************/
void set_tess_tweak_vars() {
  // Disabled expiry check, kept as found in the original:
  // if (expiry_day * 24 * 60 * 60 < time(NULL))
  //   err_exit();

  // Nothing to do unless tweaking has been switched on.
  if (!tessedit_tweaking_tess_vars)
    return;

  // Values whose tweak_* source is declared with INT_VAR.
  ReliableConfigThreshold = tweak_ReliableConfigThreshold;
  AdaptProtoThresh = tweak_AdaptProtoThresh;
  AdaptFeatureThresh = tweak_AdaptFeatureThresh;
  min_outline_points = tweak_min_outline_points;
  min_outline_area = tweak_min_outline_area;

  // Values whose tweak_* source is declared with double_VAR.
  garbage = tweak_garbage;
  ok_word = tweak_ok_word;
  good_word = tweak_good_word;
  freq_word = tweak_freq_word;
  ok_number = tweak_ok_number;
  good_number = tweak_good_number;
  non_word = tweak_non_word;

  CertaintyPerChar = tweak_CertaintyPerChar;
  NonDictCertainty = tweak_NonDictCertainty;
  RejectCertaintyOffset = tweak_RejectCertaintyOffset;

  GoodAdaptiveMatch = tweak_GoodAdaptiveMatch;
  GreatAdaptiveMatch = tweak_GreatAdaptiveMatch;

  good_split = tweak_good_split;
  ok_split = tweak_ok_split;
}