mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-23 19:13:00 +08:00
164 lines
4.6 KiB
C++
164 lines
4.6 KiB
C++
/**********************************************************************
|
|
* File: dlltest.cpp
|
|
* Description: Main program to test the tessdll interface.
|
|
* Author: Ray Smith
|
|
* Created: Wed May 16 15:17:46 PDT 2007
|
|
*
|
|
* (C) Copyright 2007, Google Inc.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
*
|
|
**********************************************************************/
|
|
#define _UNICODE
|
|
|
|
#include "stdafx.h"
|
|
#include "imgs.h"
|
|
#include "unichar.h"
|
|
#include "tessdll.h"
|
|
|
|
/**********************************************************************
|
|
* main()
|
|
*
|
|
**********************************************************************/
|
|
|
|
|
|
|
|
|
|
static wchar_t *make_unicode_string(const char *utf8)
|
|
{
|
|
int size = 0, out_index = 0;
|
|
wchar_t *out;
|
|
|
|
/* first calculate the size of the target string */
|
|
int used = 0;
|
|
int utf8_len = strlen(utf8);
|
|
while (used < utf8_len) {
|
|
int step = UNICHAR::utf8_step(utf8 + used);
|
|
if (step == 0)
|
|
break;
|
|
used += step;
|
|
++size;
|
|
}
|
|
|
|
out = (wchar_t *) malloc((size + 1) * sizeof(wchar_t));
|
|
if (out == NULL)
|
|
return NULL;
|
|
|
|
/* now convert to Unicode */
|
|
used = 0;
|
|
while (used < utf8_len) {
|
|
int step = UNICHAR::utf8_step(utf8 + used);
|
|
if (step == 0)
|
|
break;
|
|
UNICHAR ch(utf8 + used, step);
|
|
out[out_index++] = ch.first_uni();
|
|
used += step;
|
|
}
|
|
out[out_index] = 0;
|
|
|
|
return out;
|
|
}
|
|
|
|
|
|
int main(int argc, char **argv) {
|
|
if (argc < 3 || argc > 4) {
|
|
fprintf(stderr, "Usage:%s imagename outputname [lang]\n", argv[0]);
|
|
exit(1);
|
|
}
|
|
|
|
|
|
IMAGE image;
|
|
if (image.read_header(argv[1]) < 0) {
|
|
fprintf(stderr, "Can't open %s\n", argv[1]);
|
|
exit(1);
|
|
}
|
|
if (image.read(image.get_ysize ()) < 0) {
|
|
fprintf(stderr, "Can't read %s\n", argv[1]);
|
|
exit(1);
|
|
}
|
|
|
|
|
|
|
|
TessDllAPI api(argc > 3 ? argv[3] : "eng");
|
|
|
|
|
|
|
|
api.BeginPageUpright(image.get_xsize(), image.get_ysize(), image.get_buffer(),
|
|
image.get_bpp());
|
|
|
|
ETEXT_DESC* output = api.Recognize_all_Words();
|
|
|
|
|
|
|
|
|
|
FILE* fp = fopen(argv[2],"w");
|
|
if (fp == NULL) {
|
|
fprintf(stderr, "Can't create %s\n", argv[2]);
|
|
exit(1);
|
|
}
|
|
|
|
// It should be noted that the format for char_code for version 2.0 and beyond is UTF8
|
|
// which means that ASCII characters will come out as one structure but other characters
|
|
// will be returned in two or more instances of this structure with a single byte of the
|
|
// UTF8 code in each, but each will have the same bounding box.
|
|
// Programs which want to handle languagues with different characters sets will need to
|
|
// handle extended characters appropriately, but *all* code needs to be prepared to
|
|
// receive UTF8 coded characters for characters such as bullet and fancy quotes.
|
|
int j;
|
|
for (int i = 0; i < output->count; i = j) {
|
|
const EANYCODE_CHAR* ch = &output->text[i];
|
|
unsigned char unistr[UNICHAR_LEN];
|
|
|
|
for (int b = 0; b < ch->blanks; ++b)
|
|
fprintf(fp, "\n");
|
|
|
|
for (j = i; j < output->count; j++)
|
|
{
|
|
const EANYCODE_CHAR* unich = &output->text[j];
|
|
|
|
if (ch->left != unich->left || ch->right != unich->right ||
|
|
ch->top != unich->top || ch->bottom != unich->bottom)
|
|
break;
|
|
unistr[j - i] = static_cast<unsigned char>(unich->char_code);
|
|
}
|
|
unistr[j - i] = '\0';
|
|
|
|
wchar_t *utf16ch=make_unicode_string(reinterpret_cast<const char*>(unistr));
|
|
#ifndef _UNICODE
|
|
// If we aren't in _UNICODE mode, print string only if ascii.
|
|
if (ch->char_code <= 0x7f) {
|
|
fprintf(fp, "%s", unistr);
|
|
#else
|
|
// %S is a microsoft-special. Attempts to translate the Unicode
|
|
// back to the current locale to print in 8 bit
|
|
fprintf(fp, "%S", utf16ch);
|
|
#endif
|
|
// Print the hex codes of the utf8 code.
|
|
for (int x = 0; unistr[x] != '\0'; ++x)
|
|
fprintf(fp, "[%x]", unistr[x]);
|
|
fprintf(fp, "->");
|
|
// Print the hex codes of the unicode.
|
|
for (int y = 0; utf16ch[y] != 0; ++y)
|
|
fprintf(fp, "[%x]", utf16ch[y]);
|
|
// Print the coords.
|
|
fprintf(fp, "(%d,%d)->(%d,%d)\n",
|
|
ch->left, ch->bottom, ch->right, ch->top);
|
|
if (ch->formatting & 64)
|
|
fprintf(fp, "<nl>\n\n");
|
|
if (ch->formatting & 128)
|
|
fprintf(fp, "<para>\n\n");
|
|
free(utf16ch);
|
|
}
|
|
|
|
fclose(fp);
|
|
|
|
return 0;
|
|
}
|