2007-05-16 09:24:06 +08:00
|
|
|
///////////////////////////////////////////////////////////////////////
|
|
|
|
// File: unicharset_extractor.cpp
|
|
|
|
// Description: Unicode character/ligature set extractor.
|
|
|
|
// Author: Thomas Kielbus
|
|
|
|
// Created: Wed Jun 28 17:05:01 PDT 2006
|
|
|
|
//
|
|
|
|
// (C) Copyright 2006, Google Inc.
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
//
|
|
|
|
///////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
// Given a list of box files on the command line, this program generates a file
|
|
|
|
// containing a unicharset, a list of all the characters used by Tesseract.
|
|
|
|
//
|
|
|
|
// The file contains the size of the set on the first line, and then one
|
|
|
|
// unichar per line.
|
|
|
|
|
|
|
|
#include <stdio.h>
|
2012-02-20 05:48:45 +08:00
|
|
|
#if defined(HAVE_WCHAR_T) || defined(_WIN32) || defined(GOOGLE3)
|
2007-08-31 02:18:35 +08:00
|
|
|
#include <wchar.h>
|
|
|
|
#include <wctype.h>
|
|
|
|
#define USING_WCTYPE
|
|
|
|
#endif
|
2008-12-31 05:28:08 +08:00
|
|
|
#include <locale.h>
|
2007-05-16 09:24:06 +08:00
|
|
|
|
2007-08-31 02:18:35 +08:00
|
|
|
#include "boxread.h"
|
2012-02-02 11:02:16 +08:00
|
|
|
#include "rect.h"
|
|
|
|
#include "strngs.h"
|
2007-05-16 09:24:06 +08:00
|
|
|
#include "tessopt.h"
|
2012-02-02 11:02:16 +08:00
|
|
|
#include "unichar.h"
|
|
|
|
#include "unicharset.h"
|
2007-05-16 09:24:06 +08:00
|
|
|
|
2017-07-15 00:30:14 +08:00
|
|
|
using tesseract::UNICHAR;
|
|
|
|
|
2007-05-16 09:24:06 +08:00
|
|
|
static const char* const kUnicharsetFileName = "unicharset";
|
|
|
|
|
2009-07-11 10:44:07 +08:00
|
|
|
// Looks up the unichar id of a single Unicode code point.
//   unicharset: the set to search.
//   wc:         the code point to look up.
// Returns whatever unicharset.unichar_to_id() yields for the UTF-8 encoding
// of wc.
UNICHAR_ID wc_to_unichar_id(const UNICHARSET &unicharset, int wc) {
  UNICHAR code_point(wc);
  // utf8_str() hands back an array that this function owns and must
  // release with delete[] once the lookup is done.
  char *utf8 = code_point.utf8_str();
  const UNICHAR_ID found_id = unicharset.unichar_to_id(utf8);
  delete[] utf8;
  return found_id;
}
|
|
|
|
|
2007-08-31 02:18:35 +08:00
|
|
|
// Set character properties using wctype if we have it.
|
|
|
|
// Contributed by piggy@gmail.com.
|
|
|
|
// Modified by Ray to use UNICHAR for unicode conversion
|
|
|
|
// and to check for wctype using autoconf/presence of windows.
|
|
|
|
// Sets the character-class properties (alpha, lower, upper, digit,
// punctuation) and the other_case link of one unichar, using the wctype
// classification functions. Compiles to a no-op when USING_WCTYPE is not
// defined.
//   unicharset: the set holding the unichar; its entry is modified in place.
//   c_string:   UTF-8 representation of the unichar. Only its first code
//               point is classified.
// The other_case of the unichar defaults to the unichar itself and is only
// replaced when the opposite-case code point is itself present in the set,
// so an invalid id is never stored as other_case.
void set_properties(UNICHARSET *unicharset, const char* const c_string) {
#ifdef USING_WCTYPE
  // Convert the string to a unichar id.
  const UNICHAR_ID id = unicharset->unichar_to_id(c_string);
  // NOTE(review): assumes unichar_to_id() reports an unknown string as
  // INVALID_UNICHAR_ID (unichar.h) - confirm. Without an entry there is
  // nothing to annotate, and the setters must not be given an invalid id.
  if (id == INVALID_UNICHAR_ID)
    return;

  // Set the other_case property to be this unichar id by default.
  unicharset->set_other_case(id, id);

  const int step = UNICHAR::utf8_step(c_string);
  if (step == 0)
    return;  // Invalid utf-8.

  // Get the next Unicode code point in the string.
  UNICHAR ch(c_string, step);
  const int wc = ch.first_uni();

  /* Copy the properties. */
  if (iswalpha(wc)) {
    unicharset->set_isalpha(id, 1);
    if (iswlower(wc)) {
      unicharset->set_islower(id, 1);
      // Link to the upper-case form only if it resolves to a valid id;
      // otherwise keep the "self" default set above.
      const UNICHAR_ID upper_id = wc_to_unichar_id(*unicharset, towupper(wc));
      if (upper_id != INVALID_UNICHAR_ID)
        unicharset->set_other_case(id, upper_id);
    }
    if (iswupper(wc)) {
      unicharset->set_isupper(id, 1);
      // Same rule for the lower-case form.
      const UNICHAR_ID lower_id = wc_to_unichar_id(*unicharset, towlower(wc));
      if (lower_id != INVALID_UNICHAR_ID)
        unicharset->set_other_case(id, lower_id);
    }
  }
  if (iswdigit(wc))
    unicharset->set_isdigit(id, 1);
  if (iswpunct(wc))
    unicharset->set_ispunctuation(id, 1);
#endif
}
|
|
|
|
|
2007-05-16 09:24:06 +08:00
|
|
|
int main(int argc, char** argv) {
|
|
|
|
int option;
|
|
|
|
const char* output_directory = ".";
|
|
|
|
STRING unicharset_file_name;
|
2013-09-23 23:26:50 +08:00
|
|
|
// Special characters are now included by default.
|
2007-05-16 09:24:06 +08:00
|
|
|
UNICHARSET unicharset;
|
|
|
|
|
2008-12-31 05:28:08 +08:00
|
|
|
setlocale(LC_ALL, "");
|
2007-05-16 09:24:06 +08:00
|
|
|
|
|
|
|
// Print usage
|
|
|
|
if (argc <= 1) {
|
|
|
|
printf("Usage: %s [-D DIRECTORY] FILE...\n", argv[0]);
|
2015-10-05 18:54:24 +08:00
|
|
|
#ifdef USING_WCTYPE
|
|
|
|
printf("Character properties using wctype is enabled\n");
|
|
|
|
#else
|
|
|
|
printf("WARNING: Character properties using wctype is DISABLED\n");
|
|
|
|
#endif
|
2007-05-16 09:24:06 +08:00
|
|
|
exit(1);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
// Parse arguments
|
|
|
|
while ((option = tessopt(argc, argv, "D" )) != EOF) {
|
|
|
|
switch (option) {
|
|
|
|
case 'D':
|
2007-07-18 09:00:54 +08:00
|
|
|
output_directory = tessoptarg;
|
|
|
|
++tessoptind;
|
2007-05-16 09:24:06 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Save file name
|
|
|
|
unicharset_file_name = output_directory;
|
|
|
|
unicharset_file_name += "/";
|
|
|
|
unicharset_file_name += kUnicharsetFileName;
|
|
|
|
|
|
|
|
// Load box files
|
2007-07-18 09:00:54 +08:00
|
|
|
for (; tessoptind < argc; ++tessoptind) {
|
|
|
|
printf("Extracting unicharset from %s\n", argv[tessoptind]);
|
2007-05-16 09:24:06 +08:00
|
|
|
|
2011-08-12 05:42:13 +08:00
|
|
|
FILE* box_file = fopen(argv[tessoptind], "rb");
|
2016-12-13 15:08:01 +08:00
|
|
|
if (box_file == nullptr) {
|
2007-07-18 09:00:54 +08:00
|
|
|
printf("Cannot open box file %s\n", argv[tessoptind]);
|
2007-05-16 09:24:06 +08:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2012-02-02 11:02:16 +08:00
|
|
|
TBOX box;
|
|
|
|
STRING unichar_string;
|
2010-11-24 02:34:14 +08:00
|
|
|
int line_number = 0;
|
2012-02-02 11:02:16 +08:00
|
|
|
while (ReadNextBox(&line_number, box_file, &unichar_string, &box)) {
|
|
|
|
unicharset.unichar_insert(unichar_string.string());
|
|
|
|
set_properties(&unicharset, unichar_string.string());
|
2007-05-16 09:24:06 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Write unicharset file
|
|
|
|
if (unicharset.save_to_file(unicharset_file_name.string())) {
|
|
|
|
printf("Wrote unicharset file %s.\n", unicharset_file_name.string());
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
printf("Cannot save unicharset file %s.\n", unicharset_file_name.string());
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|