mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-11 23:19:04 +08:00
4523ce9f7d
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20
464 lines
14 KiB
C++
464 lines
14 KiB
C++
/**********************************************************************
|
|
* File: tessedit.cpp (Formerly tessedit.c)
|
|
* Description: Main program for merge of tess and editor.
|
|
* Author: Ray Smith
|
|
* Created: Tue Jan 07 15:21:46 GMT 1992
|
|
*
|
|
* (C) Copyright 1992, Hewlett-Packard Ltd.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
*
|
|
**********************************************************************/
|
|
|
|
#include "mfcpch.h"
|
|
//#define USE_VLD //Uncomment for Visual Leak Detector.
|
|
#if (defined _MSC_VER && defined USE_VLD)
|
|
#include <vld.h>
|
|
#endif
|
|
#include <ctype.h>
|
|
#include "control.h"
|
|
#include "tessvars.h"
|
|
#include "tessedit.h"
|
|
#include "baseapi.h"
|
|
#include "thresholder.h"
|
|
#include "pageres.h"
|
|
#include "imgs.h"
|
|
#include "params.h"
|
|
#include "paramsd.h"
|
|
#include "tprintf.h"
|
|
#include "tesseractmain.h"
|
|
#include "stderr.h"
|
|
#include "notdll.h"
|
|
#include "output.h"
|
|
#include "globals.h"
|
|
#include "blread.h"
|
|
#include "tfacep.h"
|
|
|
|
// Include automatically generated configuration file if running autoconf
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "config_auto.h"
|
|
#endif
|
|
#ifdef USING_GETTEXT
|
|
#include <libintl.h>
|
|
#include <locale.h>
|
|
#define _(x) gettext(x)
|
|
#else
|
|
#define _(x) (x)
|
|
#endif
|
|
#ifndef HAVE_LIBLEPT
|
|
#error "Sorry: Tesseract no longer compiles or runs without Leptonica!";
|
|
#endif
|
|
#include "allheaders.h"
|
|
|
|
|
|
#define VARDIR "configs/" /* Relative directory that holds the variables (config) files. */
|
|
// Relative path of the config file intended for use under the API.
|
|
#define API_CONFIG "configs/api_config"
|
|
|
|
const int kMaxIntSize = 22;  // Size in chars of the buffer main() uses to print a page number as text.
|
|
char szAppName[] = "Tessedit"; // Application name; WinMain uses it as the window class/title and as argv[0].
|
|
|
|
// Recognize a single page, given by the (const) image, and output the text,
|
|
// as controlled by global flag variables into the output text_out STRING:
|
|
// tessedit_serial_unlv is the top-level control, and provides 3 ways of
|
|
// treating the UNLV zones with the adaptive classifier:
|
|
// case 0: if there is a unlv zone file present, use it to segment the page
|
|
// and process the zones in parallel (pass 1 on all, then pass2 on all),
|
|
// otherwise, treat the whole page as a single zone.
|
|
// Independently of the existence of the unlv zone file:
|
|
// if tessedit_create_boxfile, output text in ".box" training file format, with
|
|
// one recognizable unit (as UTF8 characters) per line and its bounding box
|
|
// coded in UTF8(equivalent to ascii) for generating training data by hand.
|
|
// else if tessedit_write_unlv, output text in Latin-1, with a few special
|
|
// hacks for the UNLV test environment. Only works for latin!
|
|
// else (default mode) write plain text in UTF-8.
|
|
// case 1:(tessedit_serial_unlv) Read a unlv zone file (and fail if not found)
|
|
// and treat each zone as an independent "page", including resetting the
|
|
// adaptive classifier between zones.
|
|
// case 2: Read a unlv zone file (fail if not found) and treat each zone as
|
|
// a page of a document, i.e. DON'T reset the adaptive classifier between
|
|
// zones.
|
|
// In case 1 and 2, the UNLV zone file name is derived from input_file, by
|
|
// replacing the last 4 characters with ".uzn". In case 0, the unlv zone
|
|
// file name is derived from the 2nd parameter to InitWithLanguage, and
|
|
// the value of input_file is ignored - ugly, but true - a consequence of
|
|
// the way that unlv zone file reading takes the place of a page layout
|
|
// analyzer.
|
|
void TesseractImage(const char* input_file, Pix* pix, int page_index,
|
|
tesseract::TessBaseAPI* api, STRING* text_out) {
|
|
api->SetInputName(input_file);
|
|
api->SetImage(pix);
|
|
int serial_unlv;
|
|
ASSERT_HOST(api->GetIntVariable("tessedit_serial_unlv", &serial_unlv));
|
|
if (serial_unlv == 0) {
|
|
char* text;
|
|
bool bool_value;
|
|
if ((api->GetBoolVariable("tessedit_create_boxfile", &bool_value) &&
|
|
bool_value) ||
|
|
(api->GetBoolVariable("tessedit_make_boxes_from_boxes", &bool_value) &&
|
|
bool_value)) {
|
|
text = api->GetBoxText(page_index);
|
|
} else if (api->GetBoolVariable("tessedit_write_unlv", &bool_value) &&
|
|
bool_value) {
|
|
text = api->GetUNLVText();
|
|
} else if (api->GetBoolVariable("tessedit_create_hocr", &bool_value)
|
|
&& bool_value) {
|
|
text = api->GetHOCRText(page_index);
|
|
} else {
|
|
text = api->GetUTF8Text();
|
|
}
|
|
*text_out += text;
|
|
delete [] text;
|
|
} else {
|
|
BLOCK_LIST blocks;
|
|
STRING filename = input_file;
|
|
const char* lastdot = strrchr(filename.string(), '.');
|
|
if (lastdot != NULL) {
|
|
filename[lastdot - filename.string()] = '\0';
|
|
}
|
|
if (!read_unlv_file(filename, pixGetWidth(pix), pixGetHeight(pix),
|
|
&blocks)) {
|
|
fprintf(stderr, _("Error: Must have a unlv zone file %s to read!\n"),
|
|
filename.string());
|
|
return;
|
|
}
|
|
BLOCK_IT b_it = &blocks;
|
|
for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
|
|
BLOCK* block = b_it.data();
|
|
TBOX box = block->bounding_box();
|
|
api->SetRectangle(box.left(), pixGetHeight(pix) - box.top(),
|
|
box.width(), box.height());
|
|
char* text = api->GetUNLVText();
|
|
*text_out += text;
|
|
delete [] text;
|
|
if (serial_unlv == 1)
|
|
api->ClearAdaptiveClassifier();
|
|
}
|
|
}
|
|
bool bool_value;
|
|
if (api->GetBoolVariable("tessedit_write_images",
|
|
&bool_value) && bool_value) {
|
|
Pix* page_pix = api->GetThresholdedImage();
|
|
pixWrite("tessinput.tif", page_pix, IFF_TIFF_G4);
|
|
}
|
|
}
|
|
|
|
/**********************************************************************
|
|
* main()
|
|
*
|
|
**********************************************************************/
|
|
|
|
int main(int argc, char **argv) {
|
|
STRING outfile; //output file
|
|
|
|
#ifdef USING_GETTEXT
|
|
setlocale (LC_ALL, "");
|
|
bindtextdomain (PACKAGE, LOCALEDIR);
|
|
textdomain (PACKAGE);
|
|
#endif
|
|
|
|
// Detect incorrectly placed -l option.
|
|
for (int arg = 0; arg < argc; ++arg) {
|
|
if (arg != 3 && strcmp(argv[arg], "-l") == 0) {
|
|
fprintf(stderr, _("Error: -l must be arg3, not %d\n"), arg);
|
|
argc = 0;
|
|
}
|
|
}
|
|
#ifdef HAVE_CONFIG_H /* Assume that only Unix users care about -v */
|
|
if (argc == 2 && strcmp(argv[1], "-v") == 0) {
|
|
fprintf(stderr, "tesseract %s\n", PACKAGE_VERSION);
|
|
exit(1);
|
|
}
|
|
#endif
|
|
if (argc < 3) {
|
|
fprintf(stderr, "Usage:%s imagename outputbase [-l lang]"
|
|
" [configfile [[+|-]varfile]...]\n"
|
|
#if !defined(HAVE_LIBLEPT) && !defined(_TIFFIO_)
|
|
"Warning - no liblept or libtiff - cannot read compressed"
|
|
" tiff files.\n"
|
|
#endif
|
|
, argv[0]);
|
|
exit(1);
|
|
}
|
|
// Find the required language.
|
|
const char* lang = "eng";
|
|
int arg = 3;
|
|
if (argc >= 5 && strcmp(argv[3], "-l") == 0) {
|
|
lang = argv[4];
|
|
arg = 5;
|
|
}
|
|
|
|
tesseract::TessBaseAPI api;
|
|
|
|
api.SetOutputName(argv[2]);
|
|
api.Init(argv[0], lang, tesseract::OEM_DEFAULT, &(argv[arg]), argc-arg, false);
|
|
|
|
tprintf (_("Tesseract Open Source OCR Engine with Leptonica\n"));
|
|
|
|
STRING text_out;
|
|
int tessedit_page_number;
|
|
ASSERT_HOST(api.GetIntVariable("tessedit_page_number",
|
|
&tessedit_page_number));
|
|
int page_number = tessedit_page_number;
|
|
if (page_number < 0)
|
|
page_number = 0;
|
|
FILE* fp = fopen(argv[1], "rb");
|
|
if (fp == NULL) {
|
|
tprintf(_("Image file %s cannot be opened!\n"), argv[1]);
|
|
fclose(fp);
|
|
exit(1);
|
|
}
|
|
int page = page_number;
|
|
int npages = 0;
|
|
bool is_tiff = fileFormatIsTiff(fp);
|
|
if (is_tiff)
|
|
{
|
|
int tiffstat = tiffGetCount(fp, &npages);
|
|
if (tiffstat == 1)
|
|
{
|
|
fprintf (stderr, _("Error reading file %s!\n"), argv[1]);
|
|
fclose(fp);
|
|
exit(1);
|
|
}
|
|
else
|
|
fprintf(stderr, _("Number of found pages: %d.\n"), npages);
|
|
}
|
|
fclose(fp);
|
|
fp = NULL;
|
|
|
|
Pix *pix;
|
|
if (is_tiff) {
|
|
for (; page < npages; ++page)
|
|
{
|
|
pix = pixReadTiff(argv[1], page);
|
|
if (!pix)
|
|
continue;
|
|
if (npages > 1)
|
|
tprintf(_("Page %d\n"), page);
|
|
char page_str[kMaxIntSize];
|
|
snprintf(page_str, kMaxIntSize - 1, "%d", page);
|
|
api.SetVariable("applybox_page", page_str);
|
|
|
|
// Run tesseract on the page!
|
|
TesseractImage(argv[1], pix, page, &api, &text_out);
|
|
pixDestroy(&pix);
|
|
if (tessedit_page_number >= 0 || npages == 1)
|
|
{
|
|
break;
|
|
}
|
|
}
|
|
} else
|
|
{
|
|
// The file is not a tiff file, so use the general pixRead function.
|
|
// If the image fails to read, try it as a list of filenames.
|
|
pix = pixRead(argv[1]);
|
|
if (pix == NULL) {
|
|
FILE* fimg = fopen(argv[1], "r");
|
|
if (fimg == NULL) {
|
|
tprintf(_("File %s cannot be opened!\n"), argv[1]);
|
|
fclose(fimg);
|
|
exit(1);
|
|
}
|
|
char filename[MAX_PATH];
|
|
while (fgets(filename, sizeof(filename), fimg) != NULL) {
|
|
chomp_string(filename);
|
|
pix = pixRead(filename);
|
|
if (pix == NULL) {
|
|
tprintf(_("Image file %s cannot be read!\n"), filename);
|
|
fclose(fimg);
|
|
exit(1);
|
|
}
|
|
tprintf(_("Page %d : %s\n"), page, filename);
|
|
TesseractImage(filename, pix, page, &api, &text_out);
|
|
pixDestroy(&pix);
|
|
++page;
|
|
}
|
|
fclose(fimg);
|
|
} else {
|
|
TesseractImage(argv[1], pix, 0, &api, &text_out);
|
|
pixDestroy(&pix);
|
|
}
|
|
}
|
|
|
|
bool output_hocr = false;
|
|
api.GetBoolVariable("tessedit_create_hocr", &output_hocr);
|
|
bool output_box = false;
|
|
api.GetBoolVariable("tessedit_create_boxfile", &output_box);
|
|
outfile = argv[2];
|
|
outfile += output_hocr ? ".html" : output_box ? ".box" : ".txt";
|
|
FILE* fout = fopen(outfile.string(), "w");
|
|
if (fout == NULL) {
|
|
tprintf(_("Cannot create output file %s\n"), outfile.string());
|
|
fclose(fout);
|
|
exit(1);
|
|
}
|
|
if (output_hocr) {
|
|
const char html_header[] =
|
|
"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\""
|
|
" \"http://www.w3.org/TR/html4/loose.dtd\">\n"
|
|
"<html>\n<head>\n<title></title>\n"
|
|
"<meta http-equiv=\"Content-Type\" content=\"text/html;"
|
|
"charset=utf-8\" >\n<meta name='ocr-system' content='tesseract'>\n"
|
|
"</head>\n<body>\n";
|
|
fprintf(fout, "%s", html_header);
|
|
}
|
|
fwrite(text_out.string(), 1, text_out.length(), fout);
|
|
if (output_hocr)
|
|
fprintf(fout, "</body>\n</html>\n");
|
|
fclose(fout);
|
|
|
|
return 0; //Normal exit
|
|
}
|
|
|
|
#ifdef __MSW32__
|
|
int initialized = 0;  // Set to 1 by WndProc on WM_CREATE; WinMain only calls main() once this is non-zero.
|
|
|
|
/**********************************************************************
|
|
* WinMain
|
|
*
|
|
* Main function for a windows program.
|
|
**********************************************************************/
|
|
|
|
int WINAPI WinMain( //main for windows //command line
|
|
HINSTANCE hInstance,
|
|
HINSTANCE hPrevInstance,
|
|
LPSTR lpszCmdLine,
|
|
int nCmdShow) {
|
|
WNDCLASS wc;
|
|
HWND hwnd;
|
|
MSG msg;
|
|
|
|
char **argv;
|
|
char *argsin[2];
|
|
int argc;
|
|
int exit_code;
|
|
|
|
wc.style = CS_NOCLOSE | CS_OWNDC;
|
|
wc.lpfnWndProc = (WNDPROC) WndProc;
|
|
wc.cbClsExtra = 0;
|
|
wc.cbWndExtra = 0;
|
|
wc.hInstance = hInstance;
|
|
wc.hIcon = NULL; //LoadIcon (NULL, IDI_APPLICATION);
|
|
wc.hCursor = NULL; //LoadCursor (NULL, IDC_ARROW);
|
|
wc.hbrBackground = (HBRUSH) (COLOR_WINDOW + 1);
|
|
wc.lpszMenuName = NULL;
|
|
wc.lpszClassName = szAppName;
|
|
|
|
RegisterClass(&wc);
|
|
|
|
hwnd = CreateWindow (szAppName, szAppName,
|
|
WS_OVERLAPPEDWINDOW | WS_DISABLED,
|
|
CW_USEDEFAULT, CW_USEDEFAULT, CW_USEDEFAULT,
|
|
CW_USEDEFAULT, HWND_DESKTOP, NULL, hInstance, NULL);
|
|
|
|
argsin[0] = strdup (szAppName);
|
|
argsin[1] = strdup (lpszCmdLine);
|
|
/*allocate memory for the args. There can never be more than half*/
|
|
/*the total number of characters in the arguments.*/
|
|
argv =
|
|
(char **) malloc (((strlen (argsin[0]) + strlen (argsin[1])) / 2 + 1) *
|
|
sizeof (char *));
|
|
|
|
/*now construct argv as it should be for C.*/
|
|
argc = parse_args (2, argsin, argv);
|
|
|
|
// ShowWindow (hwnd, nCmdShow);
|
|
// UpdateWindow (hwnd);
|
|
|
|
if (initialized) {
|
|
exit_code = main (argc, argv);
|
|
free (argsin[0]);
|
|
free (argsin[1]);
|
|
free(argv);
|
|
return exit_code;
|
|
}
|
|
while (GetMessage (&msg, NULL, 0, 0)) {
|
|
TranslateMessage(&msg);
|
|
DispatchMessage(&msg);
|
|
if (initialized) {
|
|
exit_code = main (argc, argv);
|
|
break;
|
|
}
|
|
else
|
|
exit_code = msg.wParam;
|
|
}
|
|
free (argsin[0]);
|
|
free (argsin[1]);
|
|
free(argv);
|
|
return exit_code;
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* WndProc
|
|
*
|
|
* Function to respond to messages.
|
|
**********************************************************************/
|
|
|
|
LONG WINAPI WndProc( //message handler
|
|
HWND hwnd, //window with message
|
|
UINT msg, //message typ
|
|
WPARAM wParam,
|
|
LPARAM lParam) {
|
|
HDC hdc;
|
|
|
|
if (msg == WM_CREATE) {
|
|
//
|
|
// Create a rendering context.
|
|
//
|
|
hdc = GetDC (hwnd);
|
|
ReleaseDC(hwnd, hdc);
|
|
initialized = 1;
|
|
return 0;
|
|
}
|
|
return DefWindowProc (hwnd, msg, wParam, lParam);
|
|
}
|
|
|
|
|
|
/**********************************************************************
 * parse_args
 *
 * Turn a list of args into a new list of args with each separate
 * whitespace spaced string being an arg.
 *
 * Whitespace is space, newline or tab. The input strings are modified
 * in place: the whitespace character that ends a token is overwritten
 * with '\0', and arglist receives pointers into the input strings.
 * arglist must be large enough for every token found.
 * Returns the number of entries written to arglist.
 **********************************************************************/

int
parse_args (                     /*refine arg list */
            int argc,            /*no of input args */
            char *argv[],        /*input args */
            char *arglist[]      /*output args */
           ) {
  int found = 0;                 /*tokens emitted so far */

  for (int index = 0; index < argc; ++index) {
    char *cursor = argv[index];  /*scan position in this arg */

    while (*cursor != '\0') {
      /*step over leading white space */
      while (*cursor == ' ' || *cursor == '\n' || *cursor == '\t')
        ++cursor;
      if (*cursor == '\0')
        break;                   /*only white space was left */

      /*a token starts here */
      arglist[found] = cursor;
      ++found;

      /*advance to the end of the token */
      ++cursor;
      while (*cursor != '\0' && *cursor != ' ' && *cursor != '\n'
             && *cursor != '\t')
        ++cursor;

      /*terminate the token unless the string ended */
      if (*cursor != '\0') {
        *cursor = '\0';
        ++cursor;
      }
    }
  }
  return found;                  /*new number of args */
}
|
|
#endif
|