mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-05 02:47:00 +08:00
58e06c8c45
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@670 d0cd1f9f-072b-0410-8dd7-cf729c803f20
124 lines
4.0 KiB
XML
124 lines
4.0 KiB
XML
<?xml version="1.0" encoding="UTF-8"?>
|
|
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
|
<?asciidoc-toc?>
|
|
<?asciidoc-numbered?>
|
|
<refentry lang="en">
|
|
<refmeta>
|
|
<refentrytitle>unicharambigs</refentrytitle>
|
|
<manvolnum>5</manvolnum>
|
|
<refmiscinfo class="source"> </refmiscinfo>
|
|
<refmiscinfo class="manual"> </refmiscinfo>
|
|
</refmeta>
|
|
<refnamediv>
|
|
<refname>unicharambigs</refname>
|
|
<refpurpose>Tesseract unicharset ambiguities</refpurpose>
|
|
</refnamediv>
|
|
<refsect1 id="_description">
|
|
<title>DESCRIPTION</title>
|
|
<simpara>The unicharambigs file (a component of traineddata, see combine_tessdata(1))
|
|
is used by Tesseract to represent possible ambiguities between characters,
|
|
or groups of characters.</simpara>
|
|
<simpara>The file contains a number of lines, laid out as follows:</simpara>
|
|
<literallayout class="monospaced">[num] &lt;TAB&gt; [char(s)] &lt;TAB&gt; [num] &lt;TAB&gt; [char(s)] &lt;TAB&gt; [num]</literallayout>
|
|
<informaltable tabstyle="horizontal" frame="none" colsep="0" rowsep="0"><tgroup cols="2"><colspec colwidth="15*"/><colspec colwidth="85*"/><tbody valign="top">
|
|
<row>
|
|
<entry>
|
|
<simpara>
|
|
Field one
|
|
</simpara>
|
|
</entry>
|
|
<entry>
|
|
<simpara>
|
|
the number of characters contained in field two
|
|
</simpara>
|
|
</entry>
|
|
</row>
|
|
<row>
|
|
<entry>
|
|
<simpara>
|
|
Field two
|
|
</simpara>
|
|
</entry>
|
|
<entry>
|
|
<simpara>
|
|
the character sequence to be replaced
|
|
</simpara>
|
|
</entry>
|
|
</row>
|
|
<row>
|
|
<entry>
|
|
<simpara>
|
|
Field three
|
|
</simpara>
|
|
</entry>
|
|
<entry>
|
|
<simpara>
|
|
the number of characters contained in field four
|
|
</simpara>
|
|
</entry>
|
|
</row>
|
|
<row>
|
|
<entry>
|
|
<simpara>
|
|
Field four
|
|
</simpara>
|
|
</entry>
|
|
<entry>
|
|
<simpara>
|
|
the character sequence used to replace field two
|
|
</simpara>
|
|
</entry>
|
|
</row>
|
|
<row>
|
|
<entry>
|
|
<simpara>
|
|
Field five
|
|
</simpara>
|
|
</entry>
|
|
<entry>
|
|
<simpara>
|
|
contains either 1 or 0. 1 denotes a mandatory
|
|
replacement, 0 denotes an optional replacement.
|
|
</simpara>
|
|
</entry>
|
|
</row>
|
|
</tbody></tgroup></informaltable>
|
|
<simpara>Characters appearing in fields two and four should appear in
|
|
unicharset. The numbers in fields one and three refer to the
|
|
number of unichars (not bytes).</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_example">
|
|
<title>EXAMPLE</title>
|
|
<literallayout class="monospaced">2 ' ' 1 " 1
|
|
1 m 2 r n 0
|
|
3 i i i 1 m 0</literallayout>
|
|
<simpara>In this example, all instances of the <emphasis>2</emphasis> character sequence <emphasis>''</emphasis> will
|
|
<emphasis role="strong">always</emphasis> be replaced by the <emphasis>1</emphasis> character sequence <emphasis>"</emphasis>; a <emphasis>1</emphasis> character
|
|
sequence <emphasis>m</emphasis> <emphasis role="strong">may</emphasis> be replaced by the <emphasis>2</emphasis> character sequence <emphasis>rn</emphasis>, and
|
|
the <emphasis>3</emphasis> character sequence <emphasis>iii</emphasis> <emphasis role="strong">may</emphasis> be replaced by the <emphasis>1</emphasis> character
|
|
sequence <emphasis>m</emphasis>.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_history">
|
|
<title>HISTORY</title>
|
|
<simpara>The unicharambigs file first appeared in Tesseract 3.00; prior to that, a
|
|
similar format, called DangAmbigs (<emphasis>dangerous ambiguities</emphasis>) was used: the
|
|
format was almost identical, except only mandatory replacements could be
|
|
specified, and field 5 was absent.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_bugs">
|
|
<title>BUGS</title>
|
|
<simpara>This is a documentation "bug": it’s not currently clear what should be done
|
|
in the case of ligatures (such as <emphasis>fi</emphasis>) which may also appear as regular
|
|
letters in the unicharset.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_see_also">
|
|
<title>SEE ALSO</title>
|
|
<simpara>tesseract(1), unicharset(5)</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_author">
|
|
<title>AUTHOR</title>
|
|
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
|
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
|
</refsect1>
|
|
</refentry>
|