mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-22 01:30:49 +08:00
dfb81e163e
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@484 d0cd1f9f-072b-0410-8dd7-cf729c803f20
64 lines
3.1 KiB
XML
64 lines
3.1 KiB
XML
<?xml version="1.0" encoding="UTF-8"?>
|
|
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
|
<?asciidoc-toc?>
|
|
<?asciidoc-numbered?>
|
|
<refentry lang="en">
|
|
<refmeta>
|
|
<refentrytitle>unicharset</refentrytitle>
|
|
<manvolnum>5</manvolnum>
|
|
<refmiscinfo class="source"> </refmiscinfo>
|
|
<refmiscinfo class="manual"> </refmiscinfo>
|
|
</refmeta>
|
|
<refnamediv>
|
|
<refname>unicharset</refname>
|
|
<refpurpose>character properties for use by Tesseract</refpurpose>
|
|
</refnamediv>
|
|
<refsect1 id="_description">
|
|
<title>DESCRIPTION</title>
|
|
<simpara>Tesseract needs to have access to the character properties isalpha,
|
|
isdigit, isupper, islower, and ispunctuation. This data must be encoded
|
|
in the unicharset data file. Each line of this file corresponds to
|
|
one character. The character in UTF-8 is followed by a hexadecimal
|
|
number representing a binary mask that encodes the properties.
|
|
Each bit corresponds to a property. If the bit is set to 1, it
|
|
means that the property is true. The bit ordering is (from least
|
|
significant bit to most significant bit): isalpha, islower, isupper,
|
|
isdigit, ispunctuation.</simpara>
|
|
<simpara>Each line in the unicharset file has four space-separated fields:</simpara>
|
|
<literallayout class="monospaced">[character] [properties] [script] [id]</literallayout>
|
|
</refsect1>
|
|
<refsect1 id="_example">
|
|
<title>EXAMPLE</title>
|
|
<literallayout class="monospaced">; 10 Common 46
|
|
b 3 Latin 59
|
|
W 5 Latin 40
|
|
7 8 Common 66
|
|
= 0 Common 93</literallayout>
|
|
<simpara>";" is a punctuation character. Its properties are thus represented by the binary number
|
|
10000 (10 in hexadecimal).</simpara>
|
|
<simpara>"b" is an alphabetic character and a lower case character. Its properties are thus
|
|
represented by the binary number 00011 (3 in hexadecimal).</simpara>
|
|
<simpara>"W" is an alphabetic character and an upper case character. Its properties are thus
|
|
represented by the binary number 00101 (5 in hexadecimal).</simpara>
|
|
<simpara>"7" is just a digit. Its properties are thus represented by the binary number 01000
|
|
(8 in hexadecimal).</simpara>
|
|
<simpara>"=" is neither punctuation, nor a digit, nor an alphabetic character. Its properties are
|
|
thus represented by the binary number 00000 (0 in hexadecimal).</simpara>
|
|
<simpara>Japanese or Chinese alphabetic character properties are represented by the binary number
|
|
00001 (1 in hexadecimal): they are alphabetic, but neither upper nor lower case.</simpara>
|
|
<simpara>The last two columns represent the type of script (Latin, Common, Greek, Cyrillic, Han,
|
|
null) and the id code of the character.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_history">
|
|
<title>HISTORY</title>
|
|
<simpara>The unicharset format first appeared with Tesseract 2.00, which was the first version
|
|
to support languages other than English. The unicharset file contained only the first
|
|
two fields, and the "ispunctuation" property was absent (punctuation was regarded as
|
|
"0", as "=" is in the above example).</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_see_also">
|
|
<title>SEE ALSO</title>
|
|
<simpara>tesseract(1), unicharset_extractor(1)</simpara>
|
|
</refsect1>
|
|
</refentry>
|