tesseract/dlltest/dlltest.cpp

/**********************************************************************
 * File:        dlltest.cpp
 * Description: Main program to test the tessdll interface.
 * Author:      Ray Smith
 * Created:     Wed May 16 15:17:46 PDT 2007
 *
 * (C) Copyright 2007, Google Inc.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/
#define _UNICODE

#include "stdafx.h"
#include "imgs.h"
#include "unichar.h"
#include "tessdll.h"

/**********************************************************************
 *  main()
 *
 **********************************************************************/


/**
 * Converts a NUL-terminated UTF-8 string into a newly allocated,
 * NUL-terminated wide-character string.
 *
 * Conversion stops at the first byte for which UNICHAR::utf8_step() reports
 * no valid step, or at a multi-byte sequence that would run past the end of
 * the input; whatever was converted up to that point is returned.
 *
 * Where wchar_t is 16 bits wide, code points above U+FFFF are written as a
 * UTF-16 surrogate pair instead of being truncated.
 *
 * @param utf8  UTF-8 encoded input string; may be NULL.
 * @return  A buffer obtained from malloc() that the caller must free(), or
 *          NULL if utf8 is NULL or the allocation fails.
 */
static wchar_t *make_unicode_string(const char *utf8)
{
  if (utf8 == NULL)
    return NULL;

  const size_t utf8_len = strlen(utf8);

  // One wide unit per input byte is a safe upper bound: sequences of up to
  // three bytes yield one unit, and a four-byte sequence yields at most two.
  wchar_t *out =
      static_cast<wchar_t *>(malloc((utf8_len + 1) * sizeof(wchar_t)));
  if (out == NULL)
    return NULL;

  size_t used = 0;
  size_t out_index = 0;
  while (used < utf8_len) {
    const int step = UNICHAR::utf8_step(utf8 + used);
    // Stop on an illegal lead byte, and also on a sequence that claims more
    // bytes than remain, so that UNICHAR never reads past the terminator.
    if (step <= 0 || static_cast<size_t>(step) > utf8_len - used)
      break;

    UNICHAR ch(utf8 + used, step);
    // NOTE(review): first_uni() is taken to return the decoded code point
    // of the sequence as an int - confirm against unichar.h.
    const int code = ch.first_uni();

    if (sizeof(wchar_t) == 2 && code > 0xFFFF) {
      if (code <= 0x10FFFF) {
        // Encode as a UTF-16 surrogate pair.
        const int offset = code - 0x10000;
        out[out_index++] = static_cast<wchar_t>(0xD800 + (offset >> 10));
        out[out_index++] = static_cast<wchar_t>(0xDC00 + (offset & 0x3FF));
      } else {
        // Not representable in UTF-16: substitute U+FFFD.
        out[out_index++] = static_cast<wchar_t>(0xFFFD);
      }
    } else {
      out[out_index++] = static_cast<wchar_t>(code);
    }
    used += static_cast<size_t>(step);
  }
  out[out_index] = 0;

  return out;
}


/**
 * Test driver for the tessdll interface.
 *
 * Usage: dlltest imagename outputname [lang]
 *
 * Reads the image named by argv[1], runs recognition through TessDllAPI
 * (language argv[3], or "eng" when omitted) and writes one line per
 * recognized character to the file named by argv[2]. Each line holds the
 * character, the hex values of its UTF-8 bytes, the hex values of its wide
 * character units, and its bounding box.
 *
 * @return 0 on success; exits with status 1 on a usage, input, recognition
 *         or output error.
 */
int main(int argc, char **argv) {
  if (argc < 3 || argc > 4) {
    fprintf(stderr, "Usage:%s imagename outputname [lang]\n", argv[0]);
    exit(1);
  }

  IMAGE image;
  if (image.read_header(argv[1]) < 0) {
    fprintf(stderr, "Can't open %s\n", argv[1]);
    exit(1);
  }
  if (image.read(image.get_ysize ()) < 0) {
    fprintf(stderr, "Can't read %s\n", argv[1]);
    exit(1);
  }

  TessDllAPI api(argc > 3 ? argv[3] : "eng");

  api.BeginPageUpright(image.get_xsize(), image.get_ysize(), image.get_buffer(),
                       image.get_bpp());

  ETEXT_DESC* output = api.Recognize_all_Words();
  // NOTE(review): guard against a null result before dereferencing it below;
  // confirm in tessdll.h whether Recognize_all_Words() can actually fail.
  if (output == NULL) {
    fprintf(stderr, "Recognition failed for %s\n", argv[1]);
    exit(1);
  }

  FILE* fp = fopen(argv[2],"w");
  if (fp == NULL) {
    fprintf(stderr, "Can't create %s\n", argv[2]);
    exit(1);
  }

  // It should be noted that the format for char_code for version 2.0 and beyond is UTF8
  // which means that ASCII characters will come out as one structure but other characters
  // will be returned in two or more instances of this structure with a single byte of the
  // UTF8 code in each, but each will have the same bounding box.
  // Programs which want to handle languages with different character sets will need to
  // handle extended characters appropriately, but *all* code needs to be prepared to
  // receive UTF8 coded characters for characters such as bullet and fancy quotes.
  int j;
  for (int i = 0; i < output->count; i = j) {
    const EANYCODE_CHAR* ch = &output->text[i];
    // One extra byte so the terminating NUL always fits.
    unsigned char unistr[UNICHAR_LEN + 1];

    for (int b = 0; b < ch->blanks; ++b)
      fprintf(fp, "\n");

    // Gather the consecutive entries that share this bounding box: together
    // they carry the bytes of one UTF-8 character. The run is capped at
    // UNICHAR_LEN bytes so that unistr cannot overflow; the first entry
    // always matches itself, so j advances past i on every outer iteration.
    for (j = i; j < output->count && j - i < UNICHAR_LEN; j++) {
      const EANYCODE_CHAR* unich = &output->text[j];

      if (ch->left != unich->left || ch->right != unich->right ||
          ch->top != unich->top || ch->bottom != unich->bottom)
        break;
      unistr[j - i] = static_cast<unsigned char>(unich->char_code);
    }
    unistr[j - i] = '\0';

    // May be NULL if the allocation failed; the wide-character output is
    // skipped in that case and the rest of the line is still written.
    wchar_t *utf16ch = make_unicode_string(reinterpret_cast<const char*>(unistr));
#ifndef _UNICODE
    // If we aren't in _UNICODE mode, print string only if ascii.
    if (ch->char_code <= 0x7f) {
      fprintf(fp, "%s", reinterpret_cast<const char*>(unistr));
    }
#else
    // %S is a microsoft-special. Attempts to translate the Unicode
    // back to the current locale to print in 8 bit
    if (utf16ch != NULL)
      fprintf(fp, "%S", utf16ch);
#endif
    // Print the hex codes of the utf8 code.
    for (int x = 0; unistr[x] != '\0'; ++x)
      fprintf(fp, "[%x]", static_cast<unsigned int>(unistr[x]));
    fprintf(fp, "->");
    // Print the hex codes of the unicode.
    if (utf16ch != NULL) {
      for (int y = 0; utf16ch[y] != 0; ++y)
        fprintf(fp, "[%x]", static_cast<unsigned int>(utf16ch[y]));
    }
    // Print the coords.
    fprintf(fp, "(%d,%d)->(%d,%d)\n",
            ch->left, ch->bottom, ch->right, ch->top);
    // Formatting bit 64 is reported as a new line, bit 128 as a paragraph.
    if (ch->formatting & 64)
      fprintf(fp, "<nl>\n\n");
    if (ch->formatting & 128)
      fprintf(fp, "<para>\n\n");
    free(utf16ch);
  }

  // fclose() flushes buffered output, so a failure here means data was lost.
  if (fclose(fp) != 0) {
    fprintf(stderr, "Error writing %s\n", argv[2]);
    exit(1);
  }

  return 0;
}
Adding dlltest git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@60 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-05-17 08:46:08 +08:00			`/**********************************************************************`
			`* File: dlltest.cpp`
			`* Description: Main program to test the tessdll interface.`
Devstudio changes for v2.00. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@81 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 08:59:35 +08:00			`* Author: Ray Smith`
			`* Created: Wed May 16 15:17:46 PDT 2007`
Adding dlltest git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@60 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-05-17 08:46:08 +08:00			`*`
			`* (C) Copyright 2007, Google Inc.`
			`** Licensed under the Apache License, Version 2.0 (the "License");`
			`** you may not use this file except in compliance with the License.`
			`** You may obtain a copy of the License at`
			`** http://www.apache.org/licenses/LICENSE-2.0`
			`** Unless required by applicable law or agreed to in writing, software`
			`** distributed under the License is distributed on an "AS IS" BASIS,`
			`** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`** See the License for the specific language governing permissions and`
			`** limitations under the License.`
			`*`
			`**********************************************************************/`
Windows changes for 2.01 includingimprovements to dll api and test program git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@108 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-08-31 02:22:12 +08:00			`#define _UNICODE`
Adding dlltest git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@60 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-05-17 08:46:08 +08:00
			`#include "stdafx.h"`
Devstudio changes for v2.00. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@81 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 08:59:35 +08:00			`#include "imgs.h"`
Windows changes for 2.01 includingimprovements to dll api and test program git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@108 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-08-31 02:22:12 +08:00			`#include "unichar.h"`
Devstudio changes for v2.00. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@81 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 08:59:35 +08:00			`#include "tessdll.h"`
Adding dlltest git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@60 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-05-17 08:46:08 +08:00
			`/**********************************************************************`
			`* main()`
			`*`
			`**********************************************************************/`

Windows changes for 2.01 includingimprovements to dll api and test program git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@108 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-08-31 02:22:12 +08:00


			`static wchar_t make_unicode_string(const char utf8)`
			`{`
			`int size = 0, out_index = 0;`
			`wchar_t *out;`

			`/* first calculate the size of the target string */`
			`int used = 0;`
			`int utf8_len = strlen(utf8);`
			`while (used < utf8_len) {`
			`int step = UNICHAR::utf8_step(utf8 + used);`
			`if (step == 0)`
			`break;`
			`used += step;`
			`++size;`
			`}`

			`out = (wchar_t ) malloc((size + 1) sizeof(wchar_t));`
			`if (out == NULL)`
			`return NULL;`

			`/* now convert to Unicode */`
			`used = 0;`
			`while (used < utf8_len) {`
			`int step = UNICHAR::utf8_step(utf8 + used);`
			`if (step == 0)`
			`break;`
			`UNICHAR ch(utf8 + used, step);`
			`out[out_index++] = ch.first_uni();`
			`used += step;`
			`}`
			`out[out_index] = 0;`

			`return out;`
			`}`


Adding dlltest git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@60 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-05-17 08:46:08 +08:00			`int main(int argc, char **argv) {`
Windows changes for 2.01 includingimprovements to dll api and test program git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@108 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-08-31 02:22:12 +08:00			`if (argc < 3 \|\| argc > 4) {`
			`fprintf(stderr, "Usage:%s imagename outputname [lang]\n", argv[0]);`
Adding dlltest git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@60 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-05-17 08:46:08 +08:00			`exit(1);`
			`}`

Devstudio changes for v2.00. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@81 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 08:59:35 +08:00
Adding dlltest git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@60 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-05-17 08:46:08 +08:00			`IMAGE image;`
			`if (image.read_header(argv[1]) < 0) {`
Devstudio changes for v2.00. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@81 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 08:59:35 +08:00			`fprintf(stderr, "Can't open %s\n", argv[1]);`
			`exit(1);`
Adding dlltest git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@60 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-05-17 08:46:08 +08:00			`}`
			`if (image.read(image.get_ysize ()) < 0) {`
Devstudio changes for v2.00. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@81 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 08:59:35 +08:00			`fprintf(stderr, "Can't read %s\n", argv[1]);`
			`exit(1);`
			`}`



Windows changes for 2.01 includingimprovements to dll api and test program git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@108 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-08-31 02:22:12 +08:00			`TessDllAPI api(argc > 3 ? argv[3] : "eng");`
Devstudio changes for v2.00. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@81 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 08:59:35 +08:00


			`api.BeginPageUpright(image.get_xsize(), image.get_ysize(), image.get_buffer(),`
			`image.get_bpp());`

			`ETEXT_DESC* output = api.Recognize_all_Words();`




			`FILE* fp = fopen(argv[2],"w");`
			`if (fp == NULL) {`
			`fprintf(stderr, "Can't create %s\n", argv[2]);`
			`exit(1);`
			`}`

Windows changes for 2.01 includingimprovements to dll api and test program git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@108 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-08-31 02:22:12 +08:00			`// It should be noted that the format for char_code for version 2.0 and beyond is UTF8`
			`// which means that ASCII characters will come out as one structure but other characters`
			`// will be returned in two or more instances of this structure with a single byte of the`
			`// UTF8 code in each, but each will have the same bounding box.`
			`// Programs which want to handle languagues with different characters sets will need to`
			`// handle extended characters appropriately, but all code needs to be prepared to`
			`// receive UTF8 coded characters for characters such as bullet and fancy quotes.`
			`int j;`
			`for (int i = 0; i < output->count; i = j) {`
Devstudio changes for v2.00. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@81 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 08:59:35 +08:00			`const EANYCODE_CHAR* ch = &output->text[i];`
Windows changes for 2.01 includingimprovements to dll api and test program git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@108 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-08-31 02:22:12 +08:00			`unsigned char unistr[UNICHAR_LEN];`

Devstudio changes for v2.00. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@81 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 08:59:35 +08:00			`for (int b = 0; b < ch->blanks; ++b)`
			`fprintf(fp, "\n");`
Windows changes for 2.01 includingimprovements to dll api and test program git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@108 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-08-31 02:22:12 +08:00
			`for (j = i; j < output->count; j++)`
			`{`
			`const EANYCODE_CHAR* unich = &output->text[j];`

			`if (ch->left != unich->left \|\| ch->right != unich->right \|\|`
			`ch->top != unich->top \|\| ch->bottom != unich->bottom)`
			`break;`
			`unistr[j - i] = static_cast<unsigned char>(unich->char_code);`
			`}`
			`unistr[j - i] = '\0';`

			`wchar_t utf16ch=make_unicode_string(reinterpret_cast<const char>(unistr));`
			`#ifndef _UNICODE`
			`// If we aren't in _UNICODE mode, print string only if ascii.`
			`if (ch->char_code <= 0x7f) {`
			`fprintf(fp, "%s", unistr);`
			`#else`
			`// %S is a microsoft-special. Attempts to translate the Unicode`
			`// back to the current locale to print in 8 bit`
			`fprintf(fp, "%S", utf16ch);`
			`#endif`
			`// Print the hex codes of the utf8 code.`
			`for (int x = 0; unistr[x] != '\0'; ++x)`
			`fprintf(fp, "[%x]", unistr[x]);`
			`fprintf(fp, "->");`
			`// Print the hex codes of the unicode.`
			`for (int y = 0; utf16ch[y] != 0; ++y)`
			`fprintf(fp, "[%x]", utf16ch[y]);`
			`// Print the coords.`
			`fprintf(fp, "(%d,%d)->(%d,%d)\n",`
Devstudio changes for v2.00. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@81 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 08:59:35 +08:00			`ch->left, ch->bottom, ch->right, ch->top);`
			`if (ch->formatting & 64)`
			`fprintf(fp, "<nl>\n\n");`
			`if (ch->formatting & 128)`
			`fprintf(fp, "<para>\n\n");`
Windows changes for 2.01 includingimprovements to dll api and test program git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@108 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-08-31 02:22:12 +08:00			`free(utf16ch);`
Devstudio changes for v2.00. git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@81 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-07-18 08:59:35 +08:00			`}`

			`fclose(fp);`

			`return 0;`
Adding dlltest git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@60 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-05-17 08:46:08 +08:00			`}`