/********************************************************************** * File: dlltest.cpp * Description: Main program to test the tessdll interface. * Author: Ray Smith * Created: Wed May 16 15:17:46 PDT 2007 * * (C) Copyright 2007, Google Inc. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at ** http://www.apache.org/licenses/LICENSE-2.0 ** Unless required by applicable law or agreed to in writing, software ** distributed under the License is distributed on an "AS IS" BASIS, ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** See the License for the specific language governing permissions and ** limitations under the License. * **********************************************************************/ #define _UNICODE #include "stdafx.h" #include "imgs.h" #include "unichar.h" #include "tessdll.h" /********************************************************************** * main() * **********************************************************************/ static wchar_t *make_unicode_string(const char *utf8) { int size = 0, out_index = 0; wchar_t *out; /* first calculate the size of the target string */ int used = 0; int utf8_len = strlen(utf8); while (used < utf8_len) { int step = UNICHAR::utf8_step(utf8 + used); if (step == 0) break; used += step; ++size; } out = (wchar_t *) malloc((size + 1) * sizeof(wchar_t)); if (out == NULL) return NULL; /* now convert to Unicode */ used = 0; while (used < utf8_len) { int step = UNICHAR::utf8_step(utf8 + used); if (step == 0) break; UNICHAR ch(utf8 + used, step); out[out_index++] = ch.first_uni(); used += step; } out[out_index] = 0; return out; } int main(int argc, char **argv) { if (argc < 3 || argc > 4) { fprintf(stderr, "Usage:%s imagename outputname [lang]\n", argv[0]); exit(1); } IMAGE image; if (image.read_header(argv[1]) < 0) { fprintf(stderr, "Can't open %s\n", argv[1]); exit(1); } if (image.read(image.get_ysize ()) < 0) { fprintf(stderr, "Can't read %s\n", argv[1]); exit(1); } TessDllAPI api(argc > 3 ? argv[3] : "eng"); api.BeginPageUpright(image.get_xsize(), image.get_ysize(), image.get_buffer(), image.get_bpp()); ETEXT_DESC* output = api.Recognize_all_Words(); FILE* fp = fopen(argv[2],"w"); if (fp == NULL) { fprintf(stderr, "Can't create %s\n", argv[2]); exit(1); } // It should be noted that the format for char_code for version 2.0 and beyond is UTF8 // which means that ASCII characters will come out as one structure but other characters // will be returned in two or more instances of this structure with a single byte of the // UTF8 code in each, but each will have the same bounding box. // Programs which want to handle languagues with different characters sets will need to // handle extended characters appropriately, but *all* code needs to be prepared to // receive UTF8 coded characters for characters such as bullet and fancy quotes. int j; for (int i = 0; i < output->count; i = j) { const EANYCODE_CHAR* ch = &output->text[i]; unsigned char unistr[UNICHAR_LEN]; for (int b = 0; b < ch->blanks; ++b) fprintf(fp, "\n"); for (j = i; j < output->count; j++) { const EANYCODE_CHAR* unich = &output->text[j]; if (ch->left != unich->left || ch->right != unich->right || ch->top != unich->top || ch->bottom != unich->bottom) break; unistr[j - i] = static_cast<unsigned char>(unich->char_code); } unistr[j - i] = '\0'; wchar_t *utf16ch=make_unicode_string(reinterpret_cast<const char*>(unistr)); #ifndef _UNICODE // If we aren't in _UNICODE mode, print string only if ascii. if (ch->char_code <= 0x7f) { fprintf(fp, "%s", unistr); #else // %S is a microsoft-special. Attempts to translate the Unicode // back to the current locale to print in 8 bit fprintf(fp, "%S", utf16ch); #endif // Print the hex codes of the utf8 code. for (int x = 0; unistr[x] != '\0'; ++x) fprintf(fp, "[%x]", unistr[x]); fprintf(fp, "->"); // Print the hex codes of the unicode. for (int y = 0; utf16ch[y] != 0; ++y) fprintf(fp, "[%x]", utf16ch[y]); // Print the coords. fprintf(fp, "(%d,%d)->(%d,%d)\n", ch->left, ch->bottom, ch->right, ch->top); if (ch->formatting & 64) fprintf(fp, "<nl>\n\n"); if (ch->formatting & 128) fprintf(fp, "<para>\n\n"); free(utf16ch); } fclose(fp); return 0; }