mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-27 20:59:36 +08:00
doc: Fix line endings
Remove spaces at line endings and replace CRLF by LF. Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
parent
798d79aaa5
commit
61d0e8f0ff
File diff suppressed because it is too large
Load Diff
@ -1,43 +1,43 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
||||
<?asciidoc-toc?>
|
||||
<?asciidoc-numbered?>
|
||||
<refentry lang="en">
|
||||
<refentryinfo>
|
||||
<title>AMBIGUOUS_WORDS(1)</title>
|
||||
</refentryinfo>
|
||||
<refmeta>
|
||||
<refentrytitle>ambiguous_words</refentrytitle>
|
||||
<manvolnum>1</manvolnum>
|
||||
<refmiscinfo class="source"> </refmiscinfo>
|
||||
<refmiscinfo class="manual"> </refmiscinfo>
|
||||
</refmeta>
|
||||
<refnamediv>
|
||||
<refname>ambiguous_words</refname>
|
||||
<refpurpose>generate sets of words Tesseract is likely to find ambiguous</refpurpose>
|
||||
</refnamediv>
|
||||
<refsynopsisdiv id="_synopsis">
|
||||
<simpara><emphasis role="strong">ambiguous_words</emphasis> [-l lang] <emphasis>TESSDATADIR</emphasis> <emphasis>WORDLIST</emphasis> <emphasis>AMBIGUOUSFILE</emphasis></simpara>
|
||||
</refsynopsisdiv>
|
||||
<refsect1 id="_description">
|
||||
<title>DESCRIPTION</title>
|
||||
<simpara>ambiguous_words(1) runs Tesseract in a special mode, and for each word
|
||||
in the word list, produces a set of words which Tesseract thinks might be
|
||||
ambiguous with it. <emphasis>TESSDATADIR</emphasis> must be set to the absolute path of
|
||||
a directory containing <emphasis>tessdata/lang.traineddata</emphasis>.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_see_also">
|
||||
<title>SEE ALSO</title>
|
||||
<simpara>tesseract(1)</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_copying">
|
||||
<title>COPYING</title>
|
||||
<simpara>Copyright (C) 2012 Google, Inc.
|
||||
Licensed under the Apache License, Version 2.0</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_author">
|
||||
<title>AUTHOR</title>
|
||||
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
||||
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
||||
<?asciidoc-toc?>
|
||||
<?asciidoc-numbered?>
|
||||
<refentry lang="en">
|
||||
<refentryinfo>
|
||||
<title>AMBIGUOUS_WORDS(1)</title>
|
||||
</refentryinfo>
|
||||
<refmeta>
|
||||
<refentrytitle>ambiguous_words</refentrytitle>
|
||||
<manvolnum>1</manvolnum>
|
||||
<refmiscinfo class="source"> </refmiscinfo>
|
||||
<refmiscinfo class="manual"> </refmiscinfo>
|
||||
</refmeta>
|
||||
<refnamediv>
|
||||
<refname>ambiguous_words</refname>
|
||||
<refpurpose>generate sets of words Tesseract is likely to find ambiguous</refpurpose>
|
||||
</refnamediv>
|
||||
<refsynopsisdiv id="_synopsis">
|
||||
<simpara><emphasis role="strong">ambiguous_words</emphasis> [-l lang] <emphasis>TESSDATADIR</emphasis> <emphasis>WORDLIST</emphasis> <emphasis>AMBIGUOUSFILE</emphasis></simpara>
|
||||
</refsynopsisdiv>
|
||||
<refsect1 id="_description">
|
||||
<title>DESCRIPTION</title>
|
||||
<simpara>ambiguous_words(1) runs Tesseract in a special mode, and for each word
|
||||
in the word list, produces a set of words which Tesseract thinks might be
|
||||
ambiguous with it. <emphasis>TESSDATADIR</emphasis> must be set to the absolute path of
|
||||
a directory containing <emphasis>tessdata/lang.traineddata</emphasis>.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_see_also">
|
||||
<title>SEE ALSO</title>
|
||||
<simpara>tesseract(1)</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_copying">
|
||||
<title>COPYING</title>
|
||||
<simpara>Copyright (C) 2012 Google, Inc.
|
||||
Licensed under the Apache License, Version 2.0</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_author">
|
||||
<title>AUTHOR</title>
|
||||
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
||||
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,58 +1,58 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
||||
<?asciidoc-toc?>
|
||||
<?asciidoc-numbered?>
|
||||
<refentry lang="en">
|
||||
<refentryinfo>
|
||||
<title>CNTRAINING(1)</title>
|
||||
</refentryinfo>
|
||||
<refmeta>
|
||||
<refentrytitle>cntraining</refentrytitle>
|
||||
<manvolnum>1</manvolnum>
|
||||
<refmiscinfo class="source"> </refmiscinfo>
|
||||
<refmiscinfo class="manual"> </refmiscinfo>
|
||||
</refmeta>
|
||||
<refnamediv>
|
||||
<refname>cntraining</refname>
|
||||
<refpurpose>character normalization training for Tesseract</refpurpose>
|
||||
</refnamediv>
|
||||
<refsynopsisdiv id="_synopsis">
|
||||
<simpara><emphasis role="strong">cntraining</emphasis> [-D <emphasis>dir</emphasis>] <emphasis>FILE</emphasis>…</simpara>
|
||||
</refsynopsisdiv>
|
||||
<refsect1 id="_description">
|
||||
<title>DESCRIPTION</title>
|
||||
<simpara>cntraining takes a list of .tr files, from which it generates the
|
||||
<emphasis role="strong">normproto</emphasis> data file (the character normalization sensitivity
|
||||
prototypes).</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_options">
|
||||
<title>OPTIONS</title>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>
|
||||
-D <emphasis>dir</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Directory to write output files to.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
<refsect1 id="_see_also">
|
||||
<title>SEE ALSO</title>
|
||||
<simpara>tesseract(1), shapeclustering(1), mftraining(1)</simpara>
|
||||
<simpara><ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract</ulink></simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_copying">
|
||||
<title>COPYING</title>
|
||||
<simpara>Copyright (c) Hewlett-Packard Company, 1988
|
||||
Licensed under the Apache License, Version 2.0</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_author">
|
||||
<title>AUTHOR</title>
|
||||
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
||||
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
||||
<?asciidoc-toc?>
|
||||
<?asciidoc-numbered?>
|
||||
<refentry lang="en">
|
||||
<refentryinfo>
|
||||
<title>CNTRAINING(1)</title>
|
||||
</refentryinfo>
|
||||
<refmeta>
|
||||
<refentrytitle>cntraining</refentrytitle>
|
||||
<manvolnum>1</manvolnum>
|
||||
<refmiscinfo class="source"> </refmiscinfo>
|
||||
<refmiscinfo class="manual"> </refmiscinfo>
|
||||
</refmeta>
|
||||
<refnamediv>
|
||||
<refname>cntraining</refname>
|
||||
<refpurpose>character normalization training for Tesseract</refpurpose>
|
||||
</refnamediv>
|
||||
<refsynopsisdiv id="_synopsis">
|
||||
<simpara><emphasis role="strong">cntraining</emphasis> [-D <emphasis>dir</emphasis>] <emphasis>FILE</emphasis>…</simpara>
|
||||
</refsynopsisdiv>
|
||||
<refsect1 id="_description">
|
||||
<title>DESCRIPTION</title>
|
||||
<simpara>cntraining takes a list of .tr files, from which it generates the
|
||||
<emphasis role="strong">normproto</emphasis> data file (the character normalization sensitivity
|
||||
prototypes).</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_options">
|
||||
<title>OPTIONS</title>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>
|
||||
-D <emphasis>dir</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Directory to write output files to.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
<refsect1 id="_see_also">
|
||||
<title>SEE ALSO</title>
|
||||
<simpara>tesseract(1), shapeclustering(1), mftraining(1)</simpara>
|
||||
<simpara><ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract</ulink></simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_copying">
|
||||
<title>COPYING</title>
|
||||
<simpara>Copyright (c) Hewlett-Packard Company, 1988
|
||||
Licensed under the Apache License, Version 2.0</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_author">
|
||||
<title>AUTHOR</title>
|
||||
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
||||
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
|
@ -11,7 +11,7 @@ SYNOPSIS
|
||||
|
||||
DESCRIPTION
|
||||
-----------
|
||||
combine_tessdata(1) is the main program to combine/extract/overwrite
|
||||
combine_tessdata(1) is the main program to combine/extract/overwrite
|
||||
tessdata components in [lang].traineddata files.
|
||||
|
||||
To combine all the individual tessdata components (unicharset, DAWGs,
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,281 +1,281 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
||||
<?asciidoc-toc?>
|
||||
<?asciidoc-numbered?>
|
||||
<refentry lang="en">
|
||||
<refentryinfo>
|
||||
<title>COMBINE_TESSDATA(1)</title>
|
||||
</refentryinfo>
|
||||
<refmeta>
|
||||
<refentrytitle>combine_tessdata</refentrytitle>
|
||||
<manvolnum>1</manvolnum>
|
||||
<refmiscinfo class="source"> </refmiscinfo>
|
||||
<refmiscinfo class="manual"> </refmiscinfo>
|
||||
</refmeta>
|
||||
<refnamediv>
|
||||
<refname>combine_tessdata</refname>
|
||||
<refpurpose>combine/extract/overwrite Tesseract data</refpurpose>
|
||||
</refnamediv>
|
||||
<refsynopsisdiv id="_synopsis">
|
||||
<simpara><emphasis role="strong">combine_tessdata</emphasis> [<emphasis>OPTION</emphasis>] <emphasis>FILE</emphasis>…</simpara>
|
||||
</refsynopsisdiv>
|
||||
<refsect1 id="_description">
|
||||
<title>DESCRIPTION</title>
|
||||
<simpara>combine_tessdata(1) is the main program to combine/extract/overwrite
|
||||
tessdata components in [lang].traineddata files.</simpara>
|
||||
<simpara>To combine all the individual tessdata components (unicharset, DAWGs,
|
||||
classifier templates, ambiguities, language configs) located at, say,
|
||||
/home/$USER/temp/eng.* run:</simpara>
|
||||
<literallayout class="monospaced">combine_tessdata /home/$USER/temp/eng.</literallayout>
|
||||
<simpara>The result will be a combined tessdata file /home/$USER/temp/eng.traineddata</simpara>
|
||||
<simpara>Specify option -e if you would like to extract individual components
|
||||
from a combined traineddata file. For example, to extract language config
|
||||
file and the unicharset from tessdata/eng.traineddata run:</simpara>
|
||||
<literallayout class="monospaced">combine_tessdata -e tessdata/eng.traineddata \
|
||||
/home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset</literallayout>
|
||||
<simpara>The desired config file and unicharset will be written to
|
||||
/home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset</simpara>
|
||||
<simpara>Specify option -o to overwrite individual components of the given
|
||||
[lang].traineddata file. For example, to overwrite language config
|
||||
and unichar ambiguities files in tessdata/eng.traineddata use:</simpara>
|
||||
<literallayout class="monospaced">combine_tessdata -o tessdata/eng.traineddata \
|
||||
/home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs</literallayout>
|
||||
<simpara>As a result, tessdata/eng.traineddata will contain the new language config
|
||||
and unichar ambigs, plus all the original DAWGs, classifier templates, etc.</simpara>
|
||||
<simpara>Note: the file names of the files to extract to and to overwrite from should
|
||||
have the appropriate file suffixes (extensions) indicating their tessdata
|
||||
component type (.unicharset for the unicharset, .unicharambigs for unichar
|
||||
ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h.</simpara>
|
||||
<simpara>Specify option -u to unpack all the components to the specified path:</simpara>
|
||||
<literallayout class="monospaced">combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.</literallayout>
|
||||
<simpara>This will create /home/$USER/temp/eng.* files with individual tessdata
|
||||
components from tessdata/eng.traineddata.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_options">
|
||||
<title>OPTIONS</title>
|
||||
<simpara><emphasis role="strong">-e</emphasis> <emphasis>.traineddata</emphasis> <emphasis>FILE</emphasis>…:
|
||||
Extracts the specified components from the .traineddata file</simpara>
|
||||
<simpara><emphasis role="strong">-o</emphasis> <emphasis>.traineddata</emphasis> <emphasis>FILE</emphasis>…:
|
||||
Overwrites the specified components of the .traineddata file
|
||||
with those provided on the command line.</simpara>
|
||||
<simpara><emphasis role="strong">-u</emphasis> <emphasis>.traineddata</emphasis> <emphasis>PATHPREFIX</emphasis>:
|
||||
Unpacks the .traineddata using the provided prefix.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_caveats">
|
||||
<title>CAVEATS</title>
|
||||
<simpara><emphasis>Prefix</emphasis> refers to the full file prefix, including period (.)</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_components">
|
||||
<title>COMPONENTS</title>
|
||||
<simpara>The components in a Tesseract lang.traineddata file as of
|
||||
Tesseract 3.02 are briefly described below. For more information on
|
||||
many of these files, see
|
||||
<ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract</ulink></simpara>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.config
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) Language-specific overrides to default config variables.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.unicharset
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Required) The list of symbols that Tesseract recognizes, with properties.
|
||||
See unicharset(5).
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.unicharambigs
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) This file contains information on pairs of recognized symbols
|
||||
which are often confused. For example, <emphasis>rn</emphasis> and <emphasis>m</emphasis>.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.inttemp
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Required) Character shape templates for each unichar. Produced by
|
||||
mftraining(1).
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.pffmtable
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Required) The number of features expected for each unichar.
|
||||
Produced by mftraining(1) from <emphasis role="strong">.tr</emphasis> files.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.normproto
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Required) Character normalization prototypes generated by cntraining(1)
|
||||
from <emphasis role="strong">.tr</emphasis> files.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.punc-dawg
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) A dawg made from punctuation patterns found around words.
|
||||
The "word" part is replaced by a single space.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.word-dawg
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) A dawg made from dictionary words from the language.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.number-dawg
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) A dawg made from tokens which originally contained digits.
|
||||
Each digit is replaced by a space character.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.freq-dawg
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) A dawg made from the most frequent words which would have
|
||||
gone into word-dawg.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.fixed-length-dawgs
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) Several dawgs of different fixed lengths — useful for
|
||||
languages like Chinese.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.cube-unicharset
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) A unicharset for cube, if cube was trained on a different set
|
||||
of symbols.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.cube-word-dawg
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) A word dawg for cube’s alternate unicharset. Not needed if Cube
|
||||
was trained with Tesseract’s unicharset.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.shapetable
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) When present, a shapetable is an extra layer between the character
|
||||
classifier and the word recognizer that allows the character classifier to
|
||||
return a collection of unichar ids and fonts instead of a single unichar-id
|
||||
and font.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.bigram-dawg
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) A dawg of word bigrams where the words are separated by a space
|
||||
and each digit is replaced by a <emphasis>?</emphasis>.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.unambig-dawg
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) TODO: Describe.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.params-training-model
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) TODO: Describe.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
<refsect1 id="_history">
|
||||
<title>HISTORY</title>
|
||||
<simpara>combine_tessdata(1) first appeared in version 3.00 of Tesseract</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_see_also">
|
||||
<title>SEE ALSO</title>
|
||||
<simpara>tesseract(1), wordlist2dawg(1), cntraining(1), mftraining(1), unicharset(5),
|
||||
unicharambigs(5)</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_copying">
|
||||
<title>COPYING</title>
|
||||
<simpara>Copyright (C) 2009, Google Inc.
|
||||
Licensed under the Apache License, Version 2.0</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_author">
|
||||
<title>AUTHOR</title>
|
||||
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
||||
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
||||
<?asciidoc-toc?>
|
||||
<?asciidoc-numbered?>
|
||||
<refentry lang="en">
|
||||
<refentryinfo>
|
||||
<title>COMBINE_TESSDATA(1)</title>
|
||||
</refentryinfo>
|
||||
<refmeta>
|
||||
<refentrytitle>combine_tessdata</refentrytitle>
|
||||
<manvolnum>1</manvolnum>
|
||||
<refmiscinfo class="source"> </refmiscinfo>
|
||||
<refmiscinfo class="manual"> </refmiscinfo>
|
||||
</refmeta>
|
||||
<refnamediv>
|
||||
<refname>combine_tessdata</refname>
|
||||
<refpurpose>combine/extract/overwrite Tesseract data</refpurpose>
|
||||
</refnamediv>
|
||||
<refsynopsisdiv id="_synopsis">
|
||||
<simpara><emphasis role="strong">combine_tessdata</emphasis> [<emphasis>OPTION</emphasis>] <emphasis>FILE</emphasis>…</simpara>
|
||||
</refsynopsisdiv>
|
||||
<refsect1 id="_description">
|
||||
<title>DESCRIPTION</title>
|
||||
<simpara>combine_tessdata(1) is the main program to combine/extract/overwrite
|
||||
tessdata components in [lang].traineddata files.</simpara>
|
||||
<simpara>To combine all the individual tessdata components (unicharset, DAWGs,
|
||||
classifier templates, ambiguities, language configs) located at, say,
|
||||
/home/$USER/temp/eng.* run:</simpara>
|
||||
<literallayout class="monospaced">combine_tessdata /home/$USER/temp/eng.</literallayout>
|
||||
<simpara>The result will be a combined tessdata file /home/$USER/temp/eng.traineddata</simpara>
|
||||
<simpara>Specify option -e if you would like to extract individual components
|
||||
from a combined traineddata file. For example, to extract language config
|
||||
file and the unicharset from tessdata/eng.traineddata run:</simpara>
|
||||
<literallayout class="monospaced">combine_tessdata -e tessdata/eng.traineddata \
|
||||
/home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset</literallayout>
|
||||
<simpara>The desired config file and unicharset will be written to
|
||||
/home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset</simpara>
|
||||
<simpara>Specify option -o to overwrite individual components of the given
|
||||
[lang].traineddata file. For example, to overwrite language config
|
||||
and unichar ambiguities files in tessdata/eng.traineddata use:</simpara>
|
||||
<literallayout class="monospaced">combine_tessdata -o tessdata/eng.traineddata \
|
||||
/home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs</literallayout>
|
||||
<simpara>As a result, tessdata/eng.traineddata will contain the new language config
|
||||
and unichar ambigs, plus all the original DAWGs, classifier templates, etc.</simpara>
|
||||
<simpara>Note: the file names of the files to extract to and to overwrite from should
|
||||
have the appropriate file suffixes (extensions) indicating their tessdata
|
||||
component type (.unicharset for the unicharset, .unicharambigs for unichar
|
||||
ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h.</simpara>
|
||||
<simpara>Specify option -u to unpack all the components to the specified path:</simpara>
|
||||
<literallayout class="monospaced">combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.</literallayout>
|
||||
<simpara>This will create /home/$USER/temp/eng.* files with individual tessdata
|
||||
components from tessdata/eng.traineddata.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_options">
|
||||
<title>OPTIONS</title>
|
||||
<simpara><emphasis role="strong">-e</emphasis> <emphasis>.traineddata</emphasis> <emphasis>FILE</emphasis>…:
|
||||
Extracts the specified components from the .traineddata file</simpara>
|
||||
<simpara><emphasis role="strong">-o</emphasis> <emphasis>.traineddata</emphasis> <emphasis>FILE</emphasis>…:
|
||||
Overwrites the specified components of the .traineddata file
|
||||
with those provided on the command line.</simpara>
|
||||
<simpara><emphasis role="strong">-u</emphasis> <emphasis>.traineddata</emphasis> <emphasis>PATHPREFIX</emphasis>:
|
||||
Unpacks the .traineddata using the provided prefix.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_caveats">
|
||||
<title>CAVEATS</title>
|
||||
<simpara><emphasis>Prefix</emphasis> refers to the full file prefix, including period (.)</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_components">
|
||||
<title>COMPONENTS</title>
|
||||
<simpara>The components in a Tesseract lang.traineddata file as of
|
||||
Tesseract 3.02 are briefly described below. For more information on
|
||||
many of these files, see
|
||||
<ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract</ulink></simpara>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.config
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) Language-specific overrides to default config variables.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.unicharset
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Required) The list of symbols that Tesseract recognizes, with properties.
|
||||
See unicharset(5).
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.unicharambigs
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) This file contains information on pairs of recognized symbols
|
||||
which are often confused. For example, <emphasis>rn</emphasis> and <emphasis>m</emphasis>.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.inttemp
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Required) Character shape templates for each unichar. Produced by
|
||||
mftraining(1).
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.pffmtable
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Required) The number of features expected for each unichar.
|
||||
Produced by mftraining(1) from <emphasis role="strong">.tr</emphasis> files.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.normproto
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Required) Character normalization prototypes generated by cntraining(1)
|
||||
from <emphasis role="strong">.tr</emphasis> files.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.punc-dawg
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) A dawg made from punctuation patterns found around words.
|
||||
The "word" part is replaced by a single space.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.word-dawg
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) A dawg made from dictionary words from the language.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.number-dawg
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) A dawg made from tokens which originally contained digits.
|
||||
Each digit is replaced by a space character.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.freq-dawg
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) A dawg made from the most frequent words which would have
|
||||
gone into word-dawg.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.fixed-length-dawgs
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) Several dawgs of different fixed lengths — useful for
|
||||
languages like Chinese.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.cube-unicharset
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) A unicharset for cube, if cube was trained on a different set
|
||||
of symbols.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.cube-word-dawg
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) A word dawg for cube’s alternate unicharset. Not needed if Cube
|
||||
was trained with Tesseract’s unicharset.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.shapetable
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) When present, a shapetable is an extra layer between the character
|
||||
classifier and the word recognizer that allows the character classifier to
|
||||
return a collection of unichar ids and fonts instead of a single unichar-id
|
||||
and font.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.bigram-dawg
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) A dawg of word bigrams where the words are separated by a space
|
||||
and each digit is replaced by a <emphasis>?</emphasis>.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.unambig-dawg
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) TODO: Describe.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
lang.params-training-model
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Optional) TODO: Describe.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
<refsect1 id="_history">
|
||||
<title>HISTORY</title>
|
||||
<simpara>combine_tessdata(1) first appeared in version 3.00 of Tesseract</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_see_also">
|
||||
<title>SEE ALSO</title>
|
||||
<simpara>tesseract(1), wordlist2dawg(1), cntraining(1), mftraining(1), unicharset(5),
|
||||
unicharambigs(5)</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_copying">
|
||||
<title>COPYING</title>
|
||||
<simpara>Copyright (C) 2009, Google Inc.
|
||||
Licensed under the Apache License, Version 2.0</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_author">
|
||||
<title>AUTHOR</title>
|
||||
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
||||
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,53 +1,53 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
||||
<?asciidoc-toc?>
|
||||
<?asciidoc-numbered?>
|
||||
<refentry lang="en">
|
||||
<refentryinfo>
|
||||
<title>DAWG2WORDLIST(1)</title>
|
||||
</refentryinfo>
|
||||
<refmeta>
|
||||
<refentrytitle>dawg2wordlist</refentrytitle>
|
||||
<manvolnum>1</manvolnum>
|
||||
<refmiscinfo class="source"> </refmiscinfo>
|
||||
<refmiscinfo class="manual"> </refmiscinfo>
|
||||
</refmeta>
|
||||
<refnamediv>
|
||||
<refname>dawg2wordlist</refname>
|
||||
<refpurpose>convert a Tesseract DAWG to a wordlist</refpurpose>
|
||||
</refnamediv>
|
||||
<refsynopsisdiv id="_synopsis">
|
||||
<simpara><emphasis role="strong">dawg2wordlist</emphasis> <emphasis>UNICHARSET</emphasis> <emphasis>DAWG</emphasis> <emphasis>WORDLIST</emphasis></simpara>
|
||||
</refsynopsisdiv>
|
||||
<refsect1 id="_description">
|
||||
<title>DESCRIPTION</title>
|
||||
<simpara>dawg2wordlist(1) converts a Tesseract Directed Acyclic Word
|
||||
Graph (DAWG) to a list of words using a unicharset as key.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_options">
|
||||
<title>OPTIONS</title>
|
||||
<simpara><emphasis>UNICHARSET</emphasis>
|
||||
The unicharset of the language. This is the unicharset
|
||||
generated by mftraining(1).</simpara>
|
||||
<simpara><emphasis>DAWG</emphasis>
|
||||
The input DAWG, created by wordlist2dawg(1)</simpara>
|
||||
<simpara><emphasis>WORDLIST</emphasis>
|
||||
Plain text (output) file in UTF-8, one word per line</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_see_also">
|
||||
<title>SEE ALSO</title>
|
||||
<simpara>tesseract(1), mftraining(1), wordlist2dawg(1), unicharset(5),
|
||||
combine_tessdata(1)</simpara>
|
||||
<simpara><ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract</ulink></simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_copying">
|
||||
<title>COPYING</title>
|
||||
<simpara>Copyright (C) 2012 Google, Inc.
|
||||
Licensed under the Apache License, Version 2.0</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_author">
|
||||
<title>AUTHOR</title>
|
||||
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
||||
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
||||
<?asciidoc-toc?>
|
||||
<?asciidoc-numbered?>
|
||||
<refentry lang="en">
|
||||
<refentryinfo>
|
||||
<title>DAWG2WORDLIST(1)</title>
|
||||
</refentryinfo>
|
||||
<refmeta>
|
||||
<refentrytitle>dawg2wordlist</refentrytitle>
|
||||
<manvolnum>1</manvolnum>
|
||||
<refmiscinfo class="source"> </refmiscinfo>
|
||||
<refmiscinfo class="manual"> </refmiscinfo>
|
||||
</refmeta>
|
||||
<refnamediv>
|
||||
<refname>dawg2wordlist</refname>
|
||||
<refpurpose>convert a Tesseract DAWG to a wordlist</refpurpose>
|
||||
</refnamediv>
|
||||
<refsynopsisdiv id="_synopsis">
|
||||
<simpara><emphasis role="strong">dawg2wordlist</emphasis> <emphasis>UNICHARSET</emphasis> <emphasis>DAWG</emphasis> <emphasis>WORDLIST</emphasis></simpara>
|
||||
</refsynopsisdiv>
|
||||
<refsect1 id="_description">
|
||||
<title>DESCRIPTION</title>
|
||||
<simpara>dawg2wordlist(1) converts a Tesseract Directed Acyclic Word
|
||||
Graph (DAWG) to a list of words using a unicharset as key.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_options">
|
||||
<title>OPTIONS</title>
|
||||
<simpara><emphasis>UNICHARSET</emphasis>
|
||||
The unicharset of the language. This is the unicharset
|
||||
generated by mftraining(1).</simpara>
|
||||
<simpara><emphasis>DAWG</emphasis>
|
||||
The input DAWG, created by wordlist2dawg(1)</simpara>
|
||||
<simpara><emphasis>WORDLIST</emphasis>
|
||||
Plain text (output) file in UTF-8, one word per line</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_see_also">
|
||||
<title>SEE ALSO</title>
|
||||
<simpara>tesseract(1), mftraining(1), wordlist2dawg(1), unicharset(5),
|
||||
combine_tessdata(1)</simpara>
|
||||
<simpara><ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract</ulink></simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_copying">
|
||||
<title>COPYING</title>
|
||||
<simpara>Copyright (C) 2012 Google, Inc.
|
||||
Licensed under the Apache License, Version 2.0</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_author">
|
||||
<title>AUTHOR</title>
|
||||
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
||||
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
|
@ -24,12 +24,12 @@ OPTIONS
|
||||
|
||||
-F 'font_properties_file'::
|
||||
(Input) font properties file, each line is of the following form, where each field other than the font name is 0 or 1:
|
||||
|
||||
|
||||
*font_name* *italic* *bold* *fixed_pitch* *serif* *fraktur*
|
||||
|
||||
-X 'xheights_file'::
|
||||
(Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ]
|
||||
|
||||
|
||||
*font_name* *xheight*
|
||||
|
||||
-D 'dir'::
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,102 +1,102 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
||||
<?asciidoc-toc?>
|
||||
<?asciidoc-numbered?>
|
||||
<refentry lang="en">
|
||||
<refentryinfo>
|
||||
<title>MFTRAINING(1)</title>
|
||||
</refentryinfo>
|
||||
<refmeta>
|
||||
<refentrytitle>mftraining</refentrytitle>
|
||||
<manvolnum>1</manvolnum>
|
||||
<refmiscinfo class="source"> </refmiscinfo>
|
||||
<refmiscinfo class="manual"> </refmiscinfo>
|
||||
</refmeta>
|
||||
<refnamediv>
|
||||
<refname>mftraining</refname>
|
||||
<refpurpose>feature training for Tesseract</refpurpose>
|
||||
</refnamediv>
|
||||
<refsynopsisdiv id="_synopsis">
|
||||
<simpara>mftraining -U <emphasis>unicharset</emphasis> -O <emphasis>lang.unicharset</emphasis> <emphasis>FILE</emphasis>…</simpara>
|
||||
</refsynopsisdiv>
|
||||
<refsect1 id="_description">
|
||||
<title>DESCRIPTION</title>
|
||||
<simpara>mftraining takes a list of .tr files, from which it generates the
|
||||
files <emphasis role="strong">inttemp</emphasis> (the shape prototypes), <emphasis role="strong">shapetable</emphasis>, and <emphasis role="strong">pffmtable</emphasis>
|
||||
(the number of expected features for each character). (A fourth file
|
||||
called Microfeat is also written by this program, but it is not used.)</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_options">
|
||||
<title>OPTIONS</title>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>
|
||||
-U <emphasis>FILE</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Input) The unicharset generated by unicharset_extractor(1)
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
-F <emphasis>font_properties_file</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Input) font properties file, each line is of the following form, where each field other than the font name is 0 or 1:
|
||||
</simpara>
|
||||
<literallayout class="monospaced">*font_name* *italic* *bold* *fixed_pitch* *serif* *fraktur*</literallayout>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
-X <emphasis>xheights_file</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ]
|
||||
</simpara>
|
||||
<literallayout class="monospaced">*font_name* *xheight*</literallayout>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
-D <emphasis>dir</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Directory to write output files to.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
-O <emphasis>FILE</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Output) The output unicharset that will be given to combine_tessdata(1)
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
<refsect1 id="_see_also">
|
||||
<title>SEE ALSO</title>
|
||||
<simpara>tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1),
|
||||
shapeclustering(1), unicharset(5)</simpara>
|
||||
<simpara><ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract</ulink></simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_copying">
|
||||
<title>COPYING</title>
|
||||
<simpara>Copyright (C) Hewlett-Packard Company, 1988
|
||||
Licensed under the Apache License, Version 2.0</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_author">
|
||||
<title>AUTHOR</title>
|
||||
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
||||
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
||||
<?asciidoc-toc?>
|
||||
<?asciidoc-numbered?>
|
||||
<refentry lang="en">
|
||||
<refentryinfo>
|
||||
<title>MFTRAINING(1)</title>
|
||||
</refentryinfo>
|
||||
<refmeta>
|
||||
<refentrytitle>mftraining</refentrytitle>
|
||||
<manvolnum>1</manvolnum>
|
||||
<refmiscinfo class="source"> </refmiscinfo>
|
||||
<refmiscinfo class="manual"> </refmiscinfo>
|
||||
</refmeta>
|
||||
<refnamediv>
|
||||
<refname>mftraining</refname>
|
||||
<refpurpose>feature training for Tesseract</refpurpose>
|
||||
</refnamediv>
|
||||
<refsynopsisdiv id="_synopsis">
|
||||
<simpara>mftraining -U <emphasis>unicharset</emphasis> -O <emphasis>lang.unicharset</emphasis> <emphasis>FILE</emphasis>…</simpara>
|
||||
</refsynopsisdiv>
|
||||
<refsect1 id="_description">
|
||||
<title>DESCRIPTION</title>
|
||||
<simpara>mftraining takes a list of .tr files, from which it generates the
|
||||
files <emphasis role="strong">inttemp</emphasis> (the shape prototypes), <emphasis role="strong">shapetable</emphasis>, and <emphasis role="strong">pffmtable</emphasis>
|
||||
(the number of expected features for each character). (A fourth file
|
||||
called Microfeat is also written by this program, but it is not used.)</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_options">
|
||||
<title>OPTIONS</title>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>
|
||||
-U <emphasis>FILE</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Input) The unicharset generated by unicharset_extractor(1)
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
-F <emphasis>font_properties_file</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Input) font properties file, each line is of the following form, where each field other than the font name is 0 or 1:
|
||||
</simpara>
|
||||
<literallayout class="monospaced">*font_name* *italic* *bold* *fixed_pitch* *serif* *fraktur*</literallayout>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
-X <emphasis>xheights_file</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ]
|
||||
</simpara>
|
||||
<literallayout class="monospaced">*font_name* *xheight*</literallayout>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
-D <emphasis>dir</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Directory to write output files to.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
-O <emphasis>FILE</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Output) The output unicharset that will be given to combine_tessdata(1)
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
<refsect1 id="_see_also">
|
||||
<title>SEE ALSO</title>
|
||||
<simpara>tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1),
|
||||
shapeclustering(1), unicharset(5)</simpara>
|
||||
<simpara><ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract</ulink></simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_copying">
|
||||
<title>COPYING</title>
|
||||
<simpara>Copyright (C) Hewlett-Packard Company, 1988
|
||||
Licensed under the Apache License, Version 2.0</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_author">
|
||||
<title>AUTHOR</title>
|
||||
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
||||
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
|
@ -35,7 +35,7 @@ OPTIONS
|
||||
|
||||
-X 'xheights_file'::
|
||||
(Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ]
|
||||
|
||||
|
||||
'font_name' 'xheight'
|
||||
|
||||
-O 'FILE'::
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,105 +1,105 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
||||
<?asciidoc-toc?>
|
||||
<?asciidoc-numbered?>
|
||||
<refentry lang="en">
|
||||
<refentryinfo>
|
||||
<title>SHAPECLUSTERING(1)</title>
|
||||
</refentryinfo>
|
||||
<refmeta>
|
||||
<refentrytitle>shapeclustering</refentrytitle>
|
||||
<manvolnum>1</manvolnum>
|
||||
<refmiscinfo class="source"> </refmiscinfo>
|
||||
<refmiscinfo class="manual"> </refmiscinfo>
|
||||
</refmeta>
|
||||
<refnamediv>
|
||||
<refname>shapeclustering</refname>
|
||||
<refpurpose>shape clustering training for Tesseract</refpurpose>
|
||||
</refnamediv>
|
||||
<refsynopsisdiv id="_synopsis">
|
||||
<simpara>shapeclustering -D <emphasis>output_dir</emphasis>
|
||||
-U <emphasis>unicharset</emphasis> -O <emphasis>mfunicharset</emphasis>
|
||||
-F <emphasis>font_props</emphasis> -X <emphasis>xheights</emphasis>
|
||||
<emphasis>FILE</emphasis>…</simpara>
|
||||
</refsynopsisdiv>
|
||||
<refsect1 id="_description">
|
||||
<title>DESCRIPTION</title>
|
||||
<simpara>shapeclustering(1) takes extracted feature .tr files (generated by
|
||||
tesseract(1) run in a special mode from box files) and produces a
|
||||
file <emphasis role="strong">shapetable</emphasis> and an enhanced unicharset. This program is still
|
||||
experimental, and is not required (yet) for training Tesseract.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_options">
|
||||
<title>OPTIONS</title>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>
|
||||
-U <emphasis>FILE</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
The unicharset generated by unicharset_extractor(1).
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
-D <emphasis>dir</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Directory to write output files to.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
-F <emphasis>font_properties_file</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Input) font properties file, in which each line is of the following form and each field other than the font name is 0 or 1:
|
||||
</simpara>
|
||||
<literallayout class="monospaced">'font_name' 'italic' 'bold' 'fixed_pitch' 'serif' 'fraktur'</literallayout>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
-X <emphasis>xheights_file</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ]
|
||||
</simpara>
|
||||
<literallayout class="monospaced">'font_name' 'xheight'</literallayout>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
-O <emphasis>FILE</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
The output unicharset that will be given to combine_tessdata(1).
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
<refsect1 id="_see_also">
|
||||
<title>SEE ALSO</title>
|
||||
<simpara>tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1),
|
||||
unicharset(5)</simpara>
|
||||
<simpara><ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract</ulink></simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_copying">
|
||||
<title>COPYING</title>
|
||||
<simpara>Copyright (C) Google, 2011
|
||||
Licensed under the Apache License, Version 2.0</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_author">
|
||||
<title>AUTHOR</title>
|
||||
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
||||
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
||||
<?asciidoc-toc?>
|
||||
<?asciidoc-numbered?>
|
||||
<refentry lang="en">
|
||||
<refentryinfo>
|
||||
<title>SHAPECLUSTERING(1)</title>
|
||||
</refentryinfo>
|
||||
<refmeta>
|
||||
<refentrytitle>shapeclustering</refentrytitle>
|
||||
<manvolnum>1</manvolnum>
|
||||
<refmiscinfo class="source"> </refmiscinfo>
|
||||
<refmiscinfo class="manual"> </refmiscinfo>
|
||||
</refmeta>
|
||||
<refnamediv>
|
||||
<refname>shapeclustering</refname>
|
||||
<refpurpose>shape clustering training for Tesseract</refpurpose>
|
||||
</refnamediv>
|
||||
<refsynopsisdiv id="_synopsis">
|
||||
<simpara>shapeclustering -D <emphasis>output_dir</emphasis>
|
||||
-U <emphasis>unicharset</emphasis> -O <emphasis>mfunicharset</emphasis>
|
||||
-F <emphasis>font_props</emphasis> -X <emphasis>xheights</emphasis>
|
||||
<emphasis>FILE</emphasis>…</simpara>
|
||||
</refsynopsisdiv>
|
||||
<refsect1 id="_description">
|
||||
<title>DESCRIPTION</title>
|
||||
<simpara>shapeclustering(1) takes extracted feature .tr files (generated by
|
||||
tesseract(1) run in a special mode from box files) and produces a
|
||||
file <emphasis role="strong">shapetable</emphasis> and an enhanced unicharset. This program is still
|
||||
experimental, and is not required (yet) for training Tesseract.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_options">
|
||||
<title>OPTIONS</title>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>
|
||||
-U <emphasis>FILE</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
The unicharset generated by unicharset_extractor(1).
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
-D <emphasis>dir</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Directory to write output files to.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
-F <emphasis>font_properties_file</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Input) font properties file, in which each line is of the following form and each field other than the font name is 0 or 1:
|
||||
</simpara>
|
||||
<literallayout class="monospaced">'font_name' 'italic' 'bold' 'fixed_pitch' 'serif' 'fraktur'</literallayout>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
-X <emphasis>xheights_file</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
(Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ]
|
||||
</simpara>
|
||||
<literallayout class="monospaced">'font_name' 'xheight'</literallayout>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
-O <emphasis>FILE</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
The output unicharset that will be given to combine_tessdata(1).
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
<refsect1 id="_see_also">
|
||||
<title>SEE ALSO</title>
|
||||
<simpara>tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1),
|
||||
unicharset(5)</simpara>
|
||||
<simpara><ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract</ulink></simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_copying">
|
||||
<title>COPYING</title>
|
||||
<simpara>Copyright (C) Google, 2011
|
||||
Licensed under the Apache License, Version 2.0</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_author">
|
||||
<title>AUTHOR</title>
|
||||
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
||||
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
|
@ -67,7 +67,7 @@ OPTIONS
|
||||
6 = Assume a single uniform block of text.
|
||||
7 = Treat the image as a single text line.
|
||||
8 = Treat the image as a single word.
|
||||
9 = Treat the image as a single word in a circle.
|
||||
9 = Treat the image as a single word in a circle.
|
||||
10 = Treat the image as a single character.
|
||||
|
||||
'configfile'::
|
||||
@ -264,10 +264,10 @@ on read_pattern_list().
|
||||
|
||||
HISTORY
|
||||
-------
|
||||
The engine was developed at Hewlett Packard Laboratories Bristol and at
|
||||
Hewlett Packard Co, Greeley Colorado between 1985 and 1994, with some more
|
||||
changes made in 1996 to port to Windows, and some C\+\+izing in 1998. A
|
||||
lot of the code was written in C, and then some more was written in C\+\+.
|
||||
The engine was developed at Hewlett Packard Laboratories Bristol and at
|
||||
Hewlett Packard Co, Greeley Colorado between 1985 and 1994, with some more
|
||||
changes made in 1996 to port to Windows, and some C\+\+izing in 1998. A
|
||||
lot of the code was written in C, and then some more was written in C\+\+.
|
||||
The C\+\+ code makes heavy use of a list system using macros. This predates
|
||||
stl, was portable before stl, and is more efficient than stl lists, but has
|
||||
the big negative that if you do get a segmentation violation, it is hard to
|
||||
@ -276,18 +276,18 @@ debug.
|
||||
Version 2.00 brought Unicode (UTF-8) support, six languages, and the ability
|
||||
to train Tesseract.
|
||||
|
||||
Tesseract was included in UNLV's Fourth Annual Test of OCR Accuracy.
|
||||
Tesseract was included in UNLV's Fourth Annual Test of OCR Accuracy.
|
||||
See <https://github.com/tesseract-ocr/docs/blob/master/AT-1995.pdf>. With Tesseract 2.00,
|
||||
scripts are now included to allow anyone to reproduce some of these tests.
|
||||
See <https://github.com/tesseract-ocr/tesseract/wiki/TestingTesseract> for more
|
||||
scripts are now included to allow anyone to reproduce some of these tests.
|
||||
See <https://github.com/tesseract-ocr/tesseract/wiki/TestingTesseract> for more
|
||||
details.
|
||||
|
||||
Tesseract 3.00 adds a number of new languages, including Chinese, Japanese,
|
||||
and Korean. It also introduces a new, single-file based system of managing
|
||||
Tesseract 3.00 adds a number of new languages, including Chinese, Japanese,
|
||||
and Korean. It also introduces a new, single-file based system of managing
|
||||
language data.
|
||||
|
||||
Tesseract 3.02 adds BiDirectional text support, the ability to recognize
|
||||
multiple languages in a single image, and improved layout analysis.
|
||||
Tesseract 3.02 adds BiDirectional text support, the ability to recognize
|
||||
multiple languages in a single image, and improved layout analysis.
|
||||
|
||||
For further details, see the file ReleaseNotes included with the distribution.
|
||||
|
||||
|
2326
doc/tesseract.1.html
2326
doc/tesseract.1.html
File diff suppressed because it is too large
Load Diff
@ -1,424 +1,424 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
||||
<?asciidoc-toc?>
|
||||
<?asciidoc-numbered?>
|
||||
<refentry lang="en">
|
||||
<refentryinfo>
|
||||
<title>TESSERACT(1)</title>
|
||||
</refentryinfo>
|
||||
<refmeta>
|
||||
<refentrytitle>tesseract</refentrytitle>
|
||||
<manvolnum>1</manvolnum>
|
||||
<refmiscinfo class="source"> </refmiscinfo>
|
||||
<refmiscinfo class="manual"> </refmiscinfo>
|
||||
</refmeta>
|
||||
<refnamediv>
|
||||
<refname>tesseract</refname>
|
||||
<refpurpose>command-line OCR engine</refpurpose>
|
||||
</refnamediv>
|
||||
<refsynopsisdiv id="_synopsis">
|
||||
<simpara><emphasis role="strong">tesseract</emphasis> <emphasis>imagename</emphasis>|<emphasis>stdin</emphasis> <emphasis>outputbase</emphasis>|<emphasis>stdout</emphasis> [options…] [configfile…]</simpara>
|
||||
</refsynopsisdiv>
|
||||
<refsect1 id="_description">
|
||||
<title>DESCRIPTION</title>
|
||||
<simpara>tesseract(1) is a commercial quality OCR engine originally developed at HP
|
||||
between 1985 and 1995. In 1995, this engine was among the top 3 evaluated by
|
||||
UNLV. It was open-sourced by HP and UNLV in 2005, and has been developed
|
||||
at Google since then.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_in_out_arguments">
|
||||
<title>IN/OUT ARGUMENTS</title>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>imagename</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
The name of the input image. Most image file formats (anything
|
||||
readable by Leptonica) are supported.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>stdin</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Instruction to read data from standard input
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>outputbase</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
The basename of the output file (to which the appropriate extension
|
||||
will be appended). By default the output will be named <emphasis>outputbase.txt</emphasis>.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>stdout</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Instruction to send output data to standard output
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
<refsect1 id="_options">
|
||||
<title>OPTIONS</title>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>--tessdata-dir /path</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Specify the location of tessdata path
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>--user-words /path/to/file</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Specify the location of user words file
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>--user-patterns /path/to/file</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Specify the location of user patterns file
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>-c configvar=value</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Set value for control parameter. Multiple -c arguments are allowed.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>-l lang</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
The language to use. If none is specified, English is assumed.
|
||||
Multiple languages may be specified, separated by plus characters.
|
||||
Tesseract uses 3-character ISO 639-2 language codes. (See LANGUAGES)
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>--psm N</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Set Tesseract to only run a subset of layout analysis and assume
|
||||
a certain form of image. The options for <emphasis role="strong">N</emphasis> are:
|
||||
</simpara>
|
||||
<literallayout class="monospaced">0 = Orientation and script detection (OSD) only.
|
||||
1 = Automatic page segmentation with OSD.
|
||||
2 = Automatic page segmentation, but no OSD, or OCR.
|
||||
3 = Fully automatic page segmentation, but no OSD. (Default)
|
||||
4 = Assume a single column of text of variable sizes.
|
||||
5 = Assume a single uniform block of vertically aligned text.
|
||||
6 = Assume a single uniform block of text.
|
||||
7 = Treat the image as a single text line.
|
||||
8 = Treat the image as a single word.
|
||||
9 = Treat the image as a single word in a circle.
|
||||
10 = Treat the image as a single character.</literallayout>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>configfile</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
The name of a config to use. A config is a plaintext file which
|
||||
contains a list of variables and their values, one per line, with a
|
||||
space separating variable from value. Interesting config files
|
||||
include:<?asciidoc-br?>
|
||||
</simpara>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<simpara>
|
||||
hocr - Output in hOCR format instead of as a text file.
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
pdf - Output in pdf instead of a text file.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
<simpara><emphasis role="strong">Nota Bene:</emphasis> The options <emphasis>-l lang</emphasis> and <emphasis>--psm N</emphasis> must occur
|
||||
before any <emphasis>configfile</emphasis>.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_single_options">
|
||||
<title>SINGLE OPTIONS</title>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>-v</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Returns the current version of the tesseract(1) executable.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>--list-langs</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
List available languages for the tesseract engine. Can be used with --tessdata-dir.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>--print-parameters</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Print tesseract parameters to stdout.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
<refsect1 id="_languages">
|
||||
<title>LANGUAGES</title>
|
||||
<simpara>There are currently language packs available for the following languages
|
||||
(in <ulink url="https://github.com/tesseract-ocr/tessdata">https://github.com/tesseract-ocr/tessdata</ulink>):</simpara>
|
||||
<simpara><emphasis role="strong">afr</emphasis> (Afrikaans)
|
||||
<emphasis role="strong">amh</emphasis> (Amharic)
|
||||
<emphasis role="strong">ara</emphasis> (Arabic)
|
||||
<emphasis role="strong">asm</emphasis> (Assamese)
|
||||
<emphasis role="strong">aze</emphasis> (Azerbaijani)
|
||||
<emphasis role="strong">aze_cyrl</emphasis> (Azerbaijani - Cyrillic)
|
||||
<emphasis role="strong">bel</emphasis> (Belarusian)
|
||||
<emphasis role="strong">ben</emphasis> (Bengali)
|
||||
<emphasis role="strong">bod</emphasis> (Tibetan)
|
||||
<emphasis role="strong">bos</emphasis> (Bosnian)
|
||||
<emphasis role="strong">bul</emphasis> (Bulgarian)
|
||||
<emphasis role="strong">cat</emphasis> (Catalan; Valencian)
|
||||
<emphasis role="strong">ceb</emphasis> (Cebuano)
|
||||
<emphasis role="strong">ces</emphasis> (Czech)
|
||||
<emphasis role="strong">chi_sim</emphasis> (Chinese - Simplified)
|
||||
<emphasis role="strong">chi_tra</emphasis> (Chinese - Traditional)
|
||||
<emphasis role="strong">chr</emphasis> (Cherokee)
|
||||
<emphasis role="strong">cym</emphasis> (Welsh)
|
||||
<emphasis role="strong">dan</emphasis> (Danish)
|
||||
<emphasis role="strong">dan_frak</emphasis> (Danish - Fraktur)
|
||||
<emphasis role="strong">deu</emphasis> (German)
|
||||
<emphasis role="strong">deu_frak</emphasis> (German - Fraktur)
|
||||
<emphasis role="strong">dzo</emphasis> (Dzongkha)
|
||||
<emphasis role="strong">ell</emphasis> (Greek, Modern (1453-))
|
||||
<emphasis role="strong">eng</emphasis> (English)
|
||||
<emphasis role="strong">enm</emphasis> (English, Middle (1100-1500))
|
||||
<emphasis role="strong">epo</emphasis> (Esperanto)
|
||||
<emphasis role="strong">equ</emphasis> (Math / equation detection module)
|
||||
<emphasis role="strong">est</emphasis> (Estonian)
|
||||
<emphasis role="strong">eus</emphasis> (Basque)
|
||||
<emphasis role="strong">fas</emphasis> (Persian)
|
||||
<emphasis role="strong">fin</emphasis> (Finnish)
|
||||
<emphasis role="strong">fra</emphasis> (French)
|
||||
<emphasis role="strong">frk</emphasis> (Frankish)
|
||||
<emphasis role="strong">frm</emphasis> (French, Middle (ca.1400-1600))
|
||||
<emphasis role="strong">gle</emphasis> (Irish)
|
||||
<emphasis role="strong">glg</emphasis> (Galician)
|
||||
<emphasis role="strong">grc</emphasis> (Greek, Ancient (to 1453))
|
||||
<emphasis role="strong">guj</emphasis> (Gujarati)
|
||||
<emphasis role="strong">hat</emphasis> (Haitian; Haitian Creole)
|
||||
<emphasis role="strong">heb</emphasis> (Hebrew)
|
||||
<emphasis role="strong">hin</emphasis> (Hindi)
|
||||
<emphasis role="strong">hrv</emphasis> (Croatian)
|
||||
<emphasis role="strong">hun</emphasis> (Hungarian)
|
||||
<emphasis role="strong">iku</emphasis> (Inuktitut)
|
||||
<emphasis role="strong">ind</emphasis> (Indonesian)
|
||||
<emphasis role="strong">isl</emphasis> (Icelandic)
|
||||
<emphasis role="strong">ita</emphasis> (Italian)
|
||||
<emphasis role="strong">ita_old</emphasis> (Italian - Old)
|
||||
<emphasis role="strong">jav</emphasis> (Javanese)
|
||||
<emphasis role="strong">jpn</emphasis> (Japanese)
|
||||
<emphasis role="strong">kan</emphasis> (Kannada)
|
||||
<emphasis role="strong">kat</emphasis> (Georgian)
|
||||
<emphasis role="strong">kat_old</emphasis> (Georgian - Old)
|
||||
<emphasis role="strong">kaz</emphasis> (Kazakh)
|
||||
<emphasis role="strong">khm</emphasis> (Central Khmer)
|
||||
<emphasis role="strong">kir</emphasis> (Kirghiz; Kyrgyz)
|
||||
<emphasis role="strong">kor</emphasis> (Korean)
|
||||
<emphasis role="strong">kur</emphasis> (Kurdish)
|
||||
<emphasis role="strong">lao</emphasis> (Lao)
|
||||
<emphasis role="strong">lat</emphasis> (Latin)
|
||||
<emphasis role="strong">lav</emphasis> (Latvian)
|
||||
<emphasis role="strong">lit</emphasis> (Lithuanian)
|
||||
<emphasis role="strong">mal</emphasis> (Malayalam)
|
||||
<emphasis role="strong">mar</emphasis> (Marathi)
|
||||
<emphasis role="strong">mkd</emphasis> (Macedonian)
|
||||
<emphasis role="strong">mlt</emphasis> (Maltese)
|
||||
<emphasis role="strong">msa</emphasis> (Malay)
|
||||
<emphasis role="strong">mya</emphasis> (Burmese)
|
||||
<emphasis role="strong">nep</emphasis> (Nepali)
|
||||
<emphasis role="strong">nld</emphasis> (Dutch; Flemish)
|
||||
<emphasis role="strong">nor</emphasis> (Norwegian)
|
||||
<emphasis role="strong">ori</emphasis> (Oriya)
|
||||
<emphasis role="strong">osd</emphasis> (Orientation and script detection module)
|
||||
<emphasis role="strong">pan</emphasis> (Panjabi; Punjabi)
|
||||
<emphasis role="strong">pol</emphasis> (Polish)
|
||||
<emphasis role="strong">por</emphasis> (Portuguese)
|
||||
<emphasis role="strong">pus</emphasis> (Pushto; Pashto)
|
||||
<emphasis role="strong">ron</emphasis> (Romanian; Moldavian; Moldovan)
|
||||
<emphasis role="strong">rus</emphasis> (Russian)
|
||||
<emphasis role="strong">san</emphasis> (Sanskrit)
|
||||
<emphasis role="strong">sin</emphasis> (Sinhala; Sinhalese)
|
||||
<emphasis role="strong">slk</emphasis> (Slovak)
|
||||
<emphasis role="strong">slk_frak</emphasis> (Slovak - Fraktur)
|
||||
<emphasis role="strong">slv</emphasis> (Slovenian)
|
||||
<emphasis role="strong">spa</emphasis> (Spanish; Castilian)
|
||||
<emphasis role="strong">spa_old</emphasis> (Spanish; Castilian - Old)
|
||||
<emphasis role="strong">sqi</emphasis> (Albanian)
|
||||
<emphasis role="strong">srp</emphasis> (Serbian)
|
||||
<emphasis role="strong">srp_latn</emphasis> (Serbian - Latin)
|
||||
<emphasis role="strong">swa</emphasis> (Swahili)
|
||||
<emphasis role="strong">swe</emphasis> (Swedish)
|
||||
<emphasis role="strong">syr</emphasis> (Syriac)
|
||||
<emphasis role="strong">tam</emphasis> (Tamil)
|
||||
<emphasis role="strong">tel</emphasis> (Telugu)
|
||||
<emphasis role="strong">tgk</emphasis> (Tajik)
|
||||
<emphasis role="strong">tgl</emphasis> (Tagalog)
|
||||
<emphasis role="strong">tha</emphasis> (Thai)
|
||||
<emphasis role="strong">tir</emphasis> (Tigrinya)
|
||||
<emphasis role="strong">tur</emphasis> (Turkish)
|
||||
<emphasis role="strong">uig</emphasis> (Uighur; Uyghur)
|
||||
<emphasis role="strong">ukr</emphasis> (Ukrainian)
|
||||
<emphasis role="strong">urd</emphasis> (Urdu)
|
||||
<emphasis role="strong">uzb</emphasis> (Uzbek)
|
||||
<emphasis role="strong">uzb_cyrl</emphasis> (Uzbek - Cyrillic)
|
||||
<emphasis role="strong">vie</emphasis> (Vietnamese)
|
||||
<emphasis role="strong">yid</emphasis> (Yiddish)</simpara>
|
||||
<simpara>To use a non-standard language pack named <emphasis role="strong">foo.traineddata</emphasis>, set the
|
||||
<emphasis role="strong">TESSDATA_PREFIX</emphasis> environment variable so the file can be found at
|
||||
<emphasis role="strong">TESSDATA_PREFIX</emphasis>/tessdata/<emphasis role="strong">foo</emphasis>.traineddata and give Tesseract the
|
||||
argument <emphasis>-l foo</emphasis>.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_config_files_and_augmenting_with_user_data">
|
||||
<title>CONFIG FILES AND AUGMENTING WITH USER DATA</title>
|
||||
<simpara>Tesseract config files consist of lines with variable-value pairs (space
|
||||
separated). The variables are documented as flags in the source code like
|
||||
the following one in tesseractclass.h:</simpara>
|
||||
<simpara>STRING_VAR_H(tessedit_char_blacklist, "",
|
||||
"Blacklist of chars not to recognize");</simpara>
|
||||
<simpara>These variables may enable or disable various features of the engine, and
|
||||
may cause it to load (or not load) various data. For instance, let’s suppose
|
||||
you want to OCR in English, but suppress the normal dictionary and load an
|
||||
alternative word list and an alternative list of patterns — these two files
|
||||
are the most commonly used extra data files.</simpara>
|
||||
<simpara>If your language pack is in /path/to/eng.traineddata and the hocr config
|
||||
is in /path/to/configs/hocr then create three new files:</simpara>
|
||||
<simpara>/path/to/eng.user-words:</simpara>
|
||||
<blockquote>
|
||||
<literallayout>the
|
||||
quick
|
||||
brown
|
||||
fox
|
||||
jumped</literallayout>
|
||||
</blockquote>
|
||||
<simpara>/path/to/eng.user-patterns:</simpara>
|
||||
<blockquote>
|
||||
<literallayout>1-\d\d\d-GOOG-411
|
||||
www.\n\\\*.com</literallayout>
|
||||
</blockquote>
|
||||
<simpara>/path/to/configs/bazaar:</simpara>
|
||||
<blockquote>
|
||||
<literallayout>load_system_dawg F
|
||||
load_freq_dawg F
|
||||
user_words_suffix user-words
|
||||
user_patterns_suffix user-patterns</literallayout>
|
||||
</blockquote>
|
||||
<simpara>Now, if you pass the word <emphasis>bazaar</emphasis> as a trailing command line parameter
|
||||
to Tesseract, Tesseract will not bother loading the system dictionary nor
|
||||
the dictionary of frequent words and will load and use the eng.user-words
|
||||
and eng.user-patterns files you provided. The former is a simple word list,
|
||||
one per line. The format of the latter is documented in dict/trie.h
|
||||
on read_pattern_list().</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_history">
|
||||
<title>HISTORY</title>
|
||||
<simpara>The engine was developed at Hewlett Packard Laboratories Bristol and at
|
||||
Hewlett Packard Co, Greeley Colorado between 1985 and 1994, with some more
|
||||
changes made in 1996 to port to Windows, and some C++izing in 1998. A
|
||||
lot of the code was written in C, and then some more was written in C++.
|
||||
The C++ code makes heavy use of a list system using macros. This predates
|
||||
stl, was portable before stl, and is more efficient than stl lists, but has
|
||||
the big negative that if you do get a segmentation violation, it is hard to
|
||||
debug.</simpara>
|
||||
<simpara>Version 2.00 brought Unicode (UTF-8) support, six languages, and the ability
|
||||
to train Tesseract.</simpara>
|
||||
<simpara>Tesseract was included in UNLV’s Fourth Annual Test of OCR Accuracy.
|
||||
See <ulink url="https://github.com/tesseract-ocr/docs/blob/master/AT-1995.pdf">https://github.com/tesseract-ocr/docs/blob/master/AT-1995.pdf</ulink>. With Tesseract 2.00,
|
||||
scripts are now included to allow anyone to reproduce some of these tests.
|
||||
See <ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TestingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TestingTesseract</ulink> for more
|
||||
details.</simpara>
|
||||
<simpara>Tesseract 3.00 adds a number of new languages, including Chinese, Japanese,
|
||||
and Korean. It also introduces a new, single-file based system of managing
|
||||
language data.</simpara>
|
||||
<simpara>Tesseract 3.02 adds BiDirectional text support, the ability to recognize
|
||||
multiple languages in a single image, and improved layout analysis.</simpara>
|
||||
<simpara>For further details, see the file ReleaseNotes included with the distribution.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_resources">
|
||||
<title>RESOURCES</title>
|
||||
<simpara>Main web site: <ulink url="https://github.com/tesseract-ocr">https://github.com/tesseract-ocr</ulink><?asciidoc-br?>
|
||||
Information on training: <ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract</ulink></simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_see_also">
|
||||
<title>SEE ALSO</title>
|
||||
<simpara>ambiguous_words(1), cntraining(1), combine_tessdata(1), dawg2wordlist(1),
|
||||
shape_training(1), mftraining(1), unicharambigs(5), unicharset(5),
|
||||
unicharset_extractor(1), wordlist2dawg(1)</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_author">
|
||||
<title>AUTHOR</title>
|
||||
<simpara>Tesseract development was led at Hewlett-Packard and Google by Ray Smith.
|
||||
The development team has included:</simpara>
|
||||
<simpara>Ahmad Abdulkader, Chris Newton, Dan Johnson, Dar-Shyang Lee, David Eger,
|
||||
Eric Wiseblatt, Faisal Shafait, Hiroshi Takenaka, Joe Liu, Joern Wanke,
|
||||
Mark Seaman, Mickey Namiki, Nicholas Beato, Oded Fuhrmann, Phil Cheatle,
|
||||
Pingping Xiu, Pong Eksombatchai (Chantat), Ranjith Unnikrishnan, Raquel
|
||||
Romano, Ray Smith, Rika Antonova, Robert Moss, Samuel Charron, Sheelagh
|
||||
Lloyd, Shobhit Saxena, and Thomas Kielbus.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_copying">
|
||||
<title>COPYING</title>
|
||||
<simpara>Licensed under the Apache License, Version 2.0</simpara>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
||||
<?asciidoc-toc?>
|
||||
<?asciidoc-numbered?>
|
||||
<refentry lang="en">
|
||||
<refentryinfo>
|
||||
<title>TESSERACT(1)</title>
|
||||
</refentryinfo>
|
||||
<refmeta>
|
||||
<refentrytitle>tesseract</refentrytitle>
|
||||
<manvolnum>1</manvolnum>
|
||||
<refmiscinfo class="source"> </refmiscinfo>
|
||||
<refmiscinfo class="manual"> </refmiscinfo>
|
||||
</refmeta>
|
||||
<refnamediv>
|
||||
<refname>tesseract</refname>
|
||||
<refpurpose>command-line OCR engine</refpurpose>
|
||||
</refnamediv>
|
||||
<refsynopsisdiv id="_synopsis">
|
||||
<simpara><emphasis role="strong">tesseract</emphasis> <emphasis>imagename</emphasis>|<emphasis>stdin</emphasis> <emphasis>outputbase</emphasis>|<emphasis>stdout</emphasis> [options…] [configfile…]</simpara>
|
||||
</refsynopsisdiv>
|
||||
<refsect1 id="_description">
|
||||
<title>DESCRIPTION</title>
|
||||
<simpara>tesseract(1) is a commercial quality OCR engine originally developed at HP
|
||||
between 1985 and 1995. In 1995, this engine was among the top 3 evaluated by
|
||||
UNLV. It was open-sourced by HP and UNLV in 2005, and has been developed
|
||||
at Google since then.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_in_out_arguments">
|
||||
<title>IN/OUT ARGUMENTS</title>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>imagename</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
The name of the input image. Most image file formats (anything
|
||||
readable by Leptonica) are supported.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>stdin</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Instruction to read data from standard input
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>outputbase</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
The basename of the output file (to which the appropriate extension
|
||||
will be appended). By default the output will be named <emphasis>outputbase.txt</emphasis>.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>stdout</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Instruction to send output data to standard output
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
<refsect1 id="_options">
|
||||
<title>OPTIONS</title>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>--tessdata-dir /path</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Specify the location of tessdata path
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>--user-words /path/to/file</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Specify the location of user words file
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>--user-patterns /path/to/file</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Specify the location of user patterns file
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>-c configvar=value</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Set value for control parameter. Multiple -c arguments are allowed.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>-l lang</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
The language to use. If none is specified, English is assumed.
|
||||
Multiple languages may be specified, separated by plus characters.
|
||||
Tesseract uses 3-character ISO 639-2 language codes. (See LANGUAGES)
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>--psm N</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Set Tesseract to only run a subset of layout analysis and assume
|
||||
a certain form of image. The options for <emphasis role="strong">N</emphasis> are:
|
||||
</simpara>
|
||||
<literallayout class="monospaced">0 = Orientation and script detection (OSD) only.
|
||||
1 = Automatic page segmentation with OSD.
|
||||
2 = Automatic page segmentation, but no OSD, or OCR.
|
||||
3 = Fully automatic page segmentation, but no OSD. (Default)
|
||||
4 = Assume a single column of text of variable sizes.
|
||||
5 = Assume a single uniform block of vertically aligned text.
|
||||
6 = Assume a single uniform block of text.
|
||||
7 = Treat the image as a single text line.
|
||||
8 = Treat the image as a single word.
|
||||
9 = Treat the image as a single word in a circle.
|
||||
10 = Treat the image as a single character.</literallayout>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>configfile</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
The name of a config to use. A config is a plaintext file which
|
||||
contains a list of variables and their values, one per line, with a
|
||||
space separating variable from value. Interesting config files
|
||||
include:<?asciidoc-br?>
|
||||
</simpara>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<simpara>
|
||||
hocr - Output in hOCR format instead of as a text file.
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
pdf - Output in pdf instead of a text file.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
<simpara><emphasis role="strong">Nota Bene:</emphasis> The options <emphasis>-l lang</emphasis> and <emphasis>--psm N</emphasis> must occur
|
||||
before any <emphasis>configfile</emphasis>.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_single_options">
|
||||
<title>SINGLE OPTIONS</title>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>-v</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Returns the current version of the tesseract(1) executable.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>--list-langs</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
List available languages for the tesseract engine. Can be used with --tessdata-dir.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>--print-parameters</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Print tesseract parameters to stdout.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
<refsect1 id="_languages">
|
||||
<title>LANGUAGES</title>
|
||||
<simpara>There are currently language packs available for the following languages
|
||||
(in <ulink url="https://github.com/tesseract-ocr/tessdata">https://github.com/tesseract-ocr/tessdata</ulink>):</simpara>
|
||||
<simpara><emphasis role="strong">afr</emphasis> (Afrikaans)
|
||||
<emphasis role="strong">amh</emphasis> (Amharic)
|
||||
<emphasis role="strong">ara</emphasis> (Arabic)
|
||||
<emphasis role="strong">asm</emphasis> (Assamese)
|
||||
<emphasis role="strong">aze</emphasis> (Azerbaijani)
|
||||
<emphasis role="strong">aze_cyrl</emphasis> (Azerbaijani - Cyrillic)
|
||||
<emphasis role="strong">bel</emphasis> (Belarusian)
|
||||
<emphasis role="strong">ben</emphasis> (Bengali)
|
||||
<emphasis role="strong">bod</emphasis> (Tibetan)
|
||||
<emphasis role="strong">bos</emphasis> (Bosnian)
|
||||
<emphasis role="strong">bul</emphasis> (Bulgarian)
|
||||
<emphasis role="strong">cat</emphasis> (Catalan; Valencian)
|
||||
<emphasis role="strong">ceb</emphasis> (Cebuano)
|
||||
<emphasis role="strong">ces</emphasis> (Czech)
|
||||
<emphasis role="strong">chi_sim</emphasis> (Chinese - Simplified)
|
||||
<emphasis role="strong">chi_tra</emphasis> (Chinese - Traditional)
|
||||
<emphasis role="strong">chr</emphasis> (Cherokee)
|
||||
<emphasis role="strong">cym</emphasis> (Welsh)
|
||||
<emphasis role="strong">dan</emphasis> (Danish)
|
||||
<emphasis role="strong">dan_frak</emphasis> (Danish - Fraktur)
|
||||
<emphasis role="strong">deu</emphasis> (German)
|
||||
<emphasis role="strong">deu_frak</emphasis> (German - Fraktur)
|
||||
<emphasis role="strong">dzo</emphasis> (Dzongkha)
|
||||
<emphasis role="strong">ell</emphasis> (Greek, Modern (1453-))
|
||||
<emphasis role="strong">eng</emphasis> (English)
|
||||
<emphasis role="strong">enm</emphasis> (English, Middle (1100-1500))
|
||||
<emphasis role="strong">epo</emphasis> (Esperanto)
|
||||
<emphasis role="strong">equ</emphasis> (Math / equation detection module)
|
||||
<emphasis role="strong">est</emphasis> (Estonian)
|
||||
<emphasis role="strong">eus</emphasis> (Basque)
|
||||
<emphasis role="strong">fas</emphasis> (Persian)
|
||||
<emphasis role="strong">fin</emphasis> (Finnish)
|
||||
<emphasis role="strong">fra</emphasis> (French)
|
||||
<emphasis role="strong">frk</emphasis> (Frankish)
|
||||
<emphasis role="strong">frm</emphasis> (French, Middle (ca.1400-1600))
|
||||
<emphasis role="strong">gle</emphasis> (Irish)
|
||||
<emphasis role="strong">glg</emphasis> (Galician)
|
||||
<emphasis role="strong">grc</emphasis> (Greek, Ancient (to 1453))
|
||||
<emphasis role="strong">guj</emphasis> (Gujarati)
|
||||
<emphasis role="strong">hat</emphasis> (Haitian; Haitian Creole)
|
||||
<emphasis role="strong">heb</emphasis> (Hebrew)
|
||||
<emphasis role="strong">hin</emphasis> (Hindi)
|
||||
<emphasis role="strong">hrv</emphasis> (Croatian)
|
||||
<emphasis role="strong">hun</emphasis> (Hungarian)
|
||||
<emphasis role="strong">iku</emphasis> (Inuktitut)
|
||||
<emphasis role="strong">ind</emphasis> (Indonesian)
|
||||
<emphasis role="strong">isl</emphasis> (Icelandic)
|
||||
<emphasis role="strong">ita</emphasis> (Italian)
|
||||
<emphasis role="strong">ita_old</emphasis> (Italian - Old)
|
||||
<emphasis role="strong">jav</emphasis> (Javanese)
|
||||
<emphasis role="strong">jpn</emphasis> (Japanese)
|
||||
<emphasis role="strong">kan</emphasis> (Kannada)
|
||||
<emphasis role="strong">kat</emphasis> (Georgian)
|
||||
<emphasis role="strong">kat_old</emphasis> (Georgian - Old)
|
||||
<emphasis role="strong">kaz</emphasis> (Kazakh)
|
||||
<emphasis role="strong">khm</emphasis> (Central Khmer)
|
||||
<emphasis role="strong">kir</emphasis> (Kirghiz; Kyrgyz)
|
||||
<emphasis role="strong">kor</emphasis> (Korean)
|
||||
<emphasis role="strong">kur</emphasis> (Kurdish)
|
||||
<emphasis role="strong">lao</emphasis> (Lao)
|
||||
<emphasis role="strong">lat</emphasis> (Latin)
|
||||
<emphasis role="strong">lav</emphasis> (Latvian)
|
||||
<emphasis role="strong">lit</emphasis> (Lithuanian)
|
||||
<emphasis role="strong">mal</emphasis> (Malayalam)
|
||||
<emphasis role="strong">mar</emphasis> (Marathi)
|
||||
<emphasis role="strong">mkd</emphasis> (Macedonian)
|
||||
<emphasis role="strong">mlt</emphasis> (Maltese)
|
||||
<emphasis role="strong">msa</emphasis> (Malay)
|
||||
<emphasis role="strong">mya</emphasis> (Burmese)
|
||||
<emphasis role="strong">nep</emphasis> (Nepali)
|
||||
<emphasis role="strong">nld</emphasis> (Dutch; Flemish)
|
||||
<emphasis role="strong">nor</emphasis> (Norwegian)
|
||||
<emphasis role="strong">ori</emphasis> (Oriya)
|
||||
<emphasis role="strong">osd</emphasis> (Orientation and script detection module)
|
||||
<emphasis role="strong">pan</emphasis> (Panjabi; Punjabi)
|
||||
<emphasis role="strong">pol</emphasis> (Polish)
|
||||
<emphasis role="strong">por</emphasis> (Portuguese)
|
||||
<emphasis role="strong">pus</emphasis> (Pushto; Pashto)
|
||||
<emphasis role="strong">ron</emphasis> (Romanian; Moldavian; Moldovan)
|
||||
<emphasis role="strong">rus</emphasis> (Russian)
|
||||
<emphasis role="strong">san</emphasis> (Sanskrit)
|
||||
<emphasis role="strong">sin</emphasis> (Sinhala; Sinhalese)
|
||||
<emphasis role="strong">slk</emphasis> (Slovak)
|
||||
<emphasis role="strong">slk_frak</emphasis> (Slovak - Fraktur)
|
||||
<emphasis role="strong">slv</emphasis> (Slovenian)
|
||||
<emphasis role="strong">spa</emphasis> (Spanish; Castilian)
|
||||
<emphasis role="strong">spa_old</emphasis> (Spanish; Castilian - Old)
|
||||
<emphasis role="strong">sqi</emphasis> (Albanian)
|
||||
<emphasis role="strong">srp</emphasis> (Serbian)
|
||||
<emphasis role="strong">srp_latn</emphasis> (Serbian - Latin)
|
||||
<emphasis role="strong">swa</emphasis> (Swahili)
|
||||
<emphasis role="strong">swe</emphasis> (Swedish)
|
||||
<emphasis role="strong">syr</emphasis> (Syriac)
|
||||
<emphasis role="strong">tam</emphasis> (Tamil)
|
||||
<emphasis role="strong">tel</emphasis> (Telugu)
|
||||
<emphasis role="strong">tgk</emphasis> (Tajik)
|
||||
<emphasis role="strong">tgl</emphasis> (Tagalog)
|
||||
<emphasis role="strong">tha</emphasis> (Thai)
|
||||
<emphasis role="strong">tir</emphasis> (Tigrinya)
|
||||
<emphasis role="strong">tur</emphasis> (Turkish)
|
||||
<emphasis role="strong">uig</emphasis> (Uighur; Uyghur)
|
||||
<emphasis role="strong">ukr</emphasis> (Ukrainian)
|
||||
<emphasis role="strong">urd</emphasis> (Urdu)
|
||||
<emphasis role="strong">uzb</emphasis> (Uzbek)
|
||||
<emphasis role="strong">uzb_cyrl</emphasis> (Uzbek - Cyrillic)
|
||||
<emphasis role="strong">vie</emphasis> (Vietnamese)
|
||||
<emphasis role="strong">yid</emphasis> (Yiddish)</simpara>
|
||||
<simpara>To use a non-standard language pack named <emphasis role="strong">foo.traineddata</emphasis>, set the
|
||||
<emphasis role="strong">TESSDATA_PREFIX</emphasis> environment variable so the file can be found at
|
||||
<emphasis role="strong">TESSDATA_PREFIX</emphasis>/tessdata/<emphasis role="strong">foo</emphasis>.traineddata and give Tesseract the
|
||||
argument <emphasis>-l foo</emphasis>.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_config_files_and_augmenting_with_user_data">
|
||||
<title>CONFIG FILES AND AUGMENTING WITH USER DATA</title>
|
||||
<simpara>Tesseract config files consist of lines with variable-value pairs (space
|
||||
separated). The variables are documented as flags in the source code like
|
||||
the following one in tesseractclass.h:</simpara>
|
||||
<simpara>STRING_VAR_H(tessedit_char_blacklist, "",
|
||||
"Blacklist of chars not to recognize");</simpara>
|
||||
<simpara>These variables may enable or disable various features of the engine, and
|
||||
may cause it to load (or not load) various data. For instance, let’s suppose
|
||||
you want to OCR in English, but suppress the normal dictionary and load an
|
||||
alternative word list and an alternative list of patterns — these two files
|
||||
are the most commonly used extra data files.</simpara>
|
||||
<simpara>If your language pack is in /path/to/eng.traineddata and the hocr config
|
||||
is in /path/to/configs/hocr then create three new files:</simpara>
|
||||
<simpara>/path/to/eng.user-words:</simpara>
|
||||
<blockquote>
|
||||
<literallayout>the
|
||||
quick
|
||||
brown
|
||||
fox
|
||||
jumped</literallayout>
|
||||
</blockquote>
|
||||
<simpara>/path/to/eng.user-patterns:</simpara>
|
||||
<blockquote>
|
||||
<literallayout>1-\d\d\d-GOOG-411
|
||||
www.\n\\\*.com</literallayout>
|
||||
</blockquote>
|
||||
<simpara>/path/to/configs/bazaar:</simpara>
|
||||
<blockquote>
|
||||
<literallayout>load_system_dawg F
|
||||
load_freq_dawg F
|
||||
user_words_suffix user-words
|
||||
user_patterns_suffix user-patterns</literallayout>
|
||||
</blockquote>
|
||||
<simpara>Now, if you pass the word <emphasis>bazaar</emphasis> as a trailing command line parameter
|
||||
to Tesseract, Tesseract will not bother loading the system dictionary nor
|
||||
the dictionary of frequent words and will load and use the eng.user-words
|
||||
and eng.user-patterns files you provided. The former is a simple word list,
|
||||
one per line. The format of the latter is documented in dict/trie.h
|
||||
on read_pattern_list().</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_history">
|
||||
<title>HISTORY</title>
|
||||
<simpara>The engine was developed at Hewlett Packard Laboratories Bristol and at
|
||||
Hewlett Packard Co, Greeley Colorado between 1985 and 1994, with some more
|
||||
changes made in 1996 to port to Windows, and some C++izing in 1998. A
|
||||
lot of the code was written in C, and then some more was written in C++.
|
||||
The C++ code makes heavy use of a list system using macros. This predates
|
||||
stl, was portable before stl, and is more efficient than stl lists, but has
|
||||
the big negative that if you do get a segmentation violation, it is hard to
|
||||
debug.</simpara>
|
||||
<simpara>Version 2.00 brought Unicode (UTF-8) support, six languages, and the ability
|
||||
to train Tesseract.</simpara>
|
||||
<simpara>Tesseract was included in UNLV’s Fourth Annual Test of OCR Accuracy.
|
||||
See <ulink url="https://github.com/tesseract-ocr/docs/blob/master/AT-1995.pdf">https://github.com/tesseract-ocr/docs/blob/master/AT-1995.pdf</ulink>. With Tesseract 2.00,
|
||||
scripts are now included to allow anyone to reproduce some of these tests.
|
||||
See <ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TestingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TestingTesseract</ulink> for more
|
||||
details.</simpara>
|
||||
<simpara>Tesseract 3.00 adds a number of new languages, including Chinese, Japanese,
|
||||
and Korean. It also introduces a new, single-file based system of managing
|
||||
language data.</simpara>
|
||||
<simpara>Tesseract 3.02 adds BiDirectional text support, the ability to recognize
|
||||
multiple languages in a single image, and improved layout analysis.</simpara>
|
||||
<simpara>For further details, see the file ReleaseNotes included with the distribution.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_resources">
|
||||
<title>RESOURCES</title>
|
||||
<simpara>Main web site: <ulink url="https://github.com/tesseract-ocr">https://github.com/tesseract-ocr</ulink><?asciidoc-br?>
|
||||
Information on training: <ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract</ulink></simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_see_also">
|
||||
<title>SEE ALSO</title>
|
||||
<simpara>ambiguous_words(1), cntraining(1), combine_tessdata(1), dawg2wordlist(1),
|
||||
shape_training(1), mftraining(1), unicharambigs(5), unicharset(5),
|
||||
unicharset_extractor(1), wordlist2dawg(1)</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_author">
|
||||
<title>AUTHOR</title>
|
||||
<simpara>Tesseract development was led at Hewlett-Packard and Google by Ray Smith.
|
||||
The development team has included:</simpara>
|
||||
<simpara>Ahmad Abdulkader, Chris Newton, Dan Johnson, Dar-Shyang Lee, David Eger,
|
||||
Eric Wiseblatt, Faisal Shafait, Hiroshi Takenaka, Joe Liu, Joern Wanke,
|
||||
Mark Seaman, Mickey Namiki, Nicholas Beato, Oded Fuhrmann, Phil Cheatle,
|
||||
Pingping Xiu, Pong Eksombatchai (Chantat), Ranjith Unnikrishnan, Raquel
|
||||
Romano, Ray Smith, Rika Antonova, Robert Moss, Samuel Charron, Sheelagh
|
||||
Lloyd, Shobhit Saxena, and Thomas Kielbus.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_copying">
|
||||
<title>COPYING</title>
|
||||
<simpara>Licensed under the Apache License, Version 2.0</simpara>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
|
@ -38,7 +38,7 @@ EXAMPLE
|
||||
3 i i i 1 m 0
|
||||
...............................
|
||||
|
||||
In this example, all instances of the '2' character sequence '''' will
|
||||
In this example, all instances of the '2' character sequence '''' will
|
||||
*always* be replaced by the '1' character sequence '"'; a '1' character
|
||||
sequence 'm' *may* be replaced by the '2' character sequence 'rn', and
|
||||
the '3' character sequence 'iii' *may* be replaced by the '1' character
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,126 +1,126 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
||||
<?asciidoc-toc?>
|
||||
<?asciidoc-numbered?>
|
||||
<refentry lang="en">
|
||||
<refentryinfo>
|
||||
<title>UNICHARAMBIGS(5)</title>
|
||||
</refentryinfo>
|
||||
<refmeta>
|
||||
<refentrytitle>unicharambigs</refentrytitle>
|
||||
<manvolnum>5</manvolnum>
|
||||
<refmiscinfo class="source"> </refmiscinfo>
|
||||
<refmiscinfo class="manual"> </refmiscinfo>
|
||||
</refmeta>
|
||||
<refnamediv>
|
||||
<refname>unicharambigs</refname>
|
||||
<refpurpose>Tesseract unicharset ambiguities</refpurpose>
|
||||
</refnamediv>
|
||||
<refsect1 id="_description">
|
||||
<title>DESCRIPTION</title>
|
||||
<simpara>The unicharambigs file (a component of traineddata, see combine_tessdata(1) )
|
||||
is used by Tesseract to represent possible ambiguities between characters,
|
||||
or groups of characters.</simpara>
|
||||
<simpara>The file contains a number of lines, laid out as follows:</simpara>
|
||||
<literallayout class="monospaced">[num] &lt;TAB&gt; [char(s)] &lt;TAB&gt; [num] &lt;TAB&gt; [char(s)] &lt;TAB&gt; [num]</literallayout>
|
||||
<informaltable tabstyle="horizontal" frame="none" colsep="0" rowsep="0"><tgroup cols="2"><colspec colwidth="15*"/><colspec colwidth="85*"/><tbody valign="top">
|
||||
<row>
|
||||
<entry>
|
||||
<simpara>
|
||||
Field one
|
||||
</simpara>
|
||||
</entry>
|
||||
<entry>
|
||||
<simpara>
|
||||
the number of characters contained in field two
|
||||
</simpara>
|
||||
</entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>
|
||||
<simpara>
|
||||
Field two
|
||||
</simpara>
|
||||
</entry>
|
||||
<entry>
|
||||
<simpara>
|
||||
the character sequence to be replaced
|
||||
</simpara>
|
||||
</entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>
|
||||
<simpara>
|
||||
Field three
|
||||
</simpara>
|
||||
</entry>
|
||||
<entry>
|
||||
<simpara>
|
||||
the number of characters contained in field four
|
||||
</simpara>
|
||||
</entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>
|
||||
<simpara>
|
||||
Field four
|
||||
</simpara>
|
||||
</entry>
|
||||
<entry>
|
||||
<simpara>
|
||||
the character sequence used to replace field two
|
||||
</simpara>
|
||||
</entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>
|
||||
<simpara>
|
||||
Field five
|
||||
</simpara>
|
||||
</entry>
|
||||
<entry>
|
||||
<simpara>
|
||||
contains either 1 or 0. 1 denotes a mandatory
|
||||
replacement, 0 denotes an optional replacement.
|
||||
</simpara>
|
||||
</entry>
|
||||
</row>
|
||||
</tbody></tgroup></informaltable>
|
||||
<simpara>Characters appearing in fields two and four should appear in
|
||||
unicharset. The numbers in fields one and three refer to the
|
||||
number of unichars (not bytes).</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_example">
|
||||
<title>EXAMPLE</title>
|
||||
<literallayout class="monospaced">2 ' ' 1 " 1
|
||||
1 m 2 r n 0
|
||||
3 i i i 1 m 0</literallayout>
|
||||
<simpara>In this example, all instances of the <emphasis>2</emphasis> character sequence <emphasis>''</emphasis> will
|
||||
<emphasis role="strong">always</emphasis> be replaced by the <emphasis>1</emphasis> character sequence <emphasis>"</emphasis>; a <emphasis>1</emphasis> character
|
||||
sequence <emphasis>m</emphasis> <emphasis role="strong">may</emphasis> be replaced by the <emphasis>2</emphasis> character sequence <emphasis>rn</emphasis>, and
|
||||
the <emphasis>3</emphasis> character sequence <emphasis>iii</emphasis> <emphasis role="strong">may</emphasis> be replaced by the <emphasis>1</emphasis> character
|
||||
sequence <emphasis>m</emphasis>.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_history">
|
||||
<title>HISTORY</title>
|
||||
<simpara>The unicharambigs file first appeared in Tesseract 3.00; prior to that, a
|
||||
similar format, called DangAmbigs (<emphasis>dangerous ambiguities</emphasis>) was used: the
|
||||
format was almost identical, except only mandatory replacements could be
|
||||
specified, and field 5 was absent.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_bugs">
|
||||
<title>BUGS</title>
|
||||
<simpara>This is a documentation "bug": it’s not currently clear what should be done
|
||||
in the case of ligatures (such as <emphasis>fi</emphasis>) which may also appear as regular
|
||||
letters in the unicharset.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_see_also">
|
||||
<title>SEE ALSO</title>
|
||||
<simpara>tesseract(1), unicharset(5)</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_author">
|
||||
<title>AUTHOR</title>
|
||||
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
||||
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
||||
<?asciidoc-toc?>
|
||||
<?asciidoc-numbered?>
|
||||
<refentry lang="en">
|
||||
<refentryinfo>
|
||||
<title>UNICHARAMBIGS(5)</title>
|
||||
</refentryinfo>
|
||||
<refmeta>
|
||||
<refentrytitle>unicharambigs</refentrytitle>
|
||||
<manvolnum>5</manvolnum>
|
||||
<refmiscinfo class="source"> </refmiscinfo>
|
||||
<refmiscinfo class="manual"> </refmiscinfo>
|
||||
</refmeta>
|
||||
<refnamediv>
|
||||
<refname>unicharambigs</refname>
|
||||
<refpurpose>Tesseract unicharset ambiguities</refpurpose>
|
||||
</refnamediv>
|
||||
<refsect1 id="_description">
|
||||
<title>DESCRIPTION</title>
|
||||
<simpara>The unicharambigs file (a component of traineddata, see combine_tessdata(1) )
|
||||
is used by Tesseract to represent possible ambiguities between characters,
|
||||
or groups of characters.</simpara>
|
||||
<simpara>The file contains a number of lines, laid out as follows:</simpara>
|
||||
<literallayout class="monospaced">[num] &lt;TAB&gt; [char(s)] &lt;TAB&gt; [num] &lt;TAB&gt; [char(s)] &lt;TAB&gt; [num]</literallayout>
|
||||
<informaltable tabstyle="horizontal" frame="none" colsep="0" rowsep="0"><tgroup cols="2"><colspec colwidth="15*"/><colspec colwidth="85*"/><tbody valign="top">
|
||||
<row>
|
||||
<entry>
|
||||
<simpara>
|
||||
Field one
|
||||
</simpara>
|
||||
</entry>
|
||||
<entry>
|
||||
<simpara>
|
||||
the number of characters contained in field two
|
||||
</simpara>
|
||||
</entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>
|
||||
<simpara>
|
||||
Field two
|
||||
</simpara>
|
||||
</entry>
|
||||
<entry>
|
||||
<simpara>
|
||||
the character sequence to be replaced
|
||||
</simpara>
|
||||
</entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>
|
||||
<simpara>
|
||||
Field three
|
||||
</simpara>
|
||||
</entry>
|
||||
<entry>
|
||||
<simpara>
|
||||
the number of characters contained in field four
|
||||
</simpara>
|
||||
</entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>
|
||||
<simpara>
|
||||
Field four
|
||||
</simpara>
|
||||
</entry>
|
||||
<entry>
|
||||
<simpara>
|
||||
the character sequence used to replace field two
|
||||
</simpara>
|
||||
</entry>
|
||||
</row>
|
||||
<row>
|
||||
<entry>
|
||||
<simpara>
|
||||
Field five
|
||||
</simpara>
|
||||
</entry>
|
||||
<entry>
|
||||
<simpara>
|
||||
contains either 1 or 0. 1 denotes a mandatory
|
||||
replacement, 0 denotes an optional replacement.
|
||||
</simpara>
|
||||
</entry>
|
||||
</row>
|
||||
</tbody></tgroup></informaltable>
|
||||
<simpara>Characters appearing in fields two and four should appear in
|
||||
unicharset. The numbers in fields one and three refer to the
|
||||
number of unichars (not bytes).</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_example">
|
||||
<title>EXAMPLE</title>
|
||||
<literallayout class="monospaced">2 ' ' 1 " 1
|
||||
1 m 2 r n 0
|
||||
3 i i i 1 m 0</literallayout>
|
||||
<simpara>In this example, all instances of the <emphasis>2</emphasis> character sequence <emphasis>''</emphasis> will
|
||||
<emphasis role="strong">always</emphasis> be replaced by the <emphasis>1</emphasis> character sequence <emphasis>"</emphasis>; a <emphasis>1</emphasis> character
|
||||
sequence <emphasis>m</emphasis> <emphasis role="strong">may</emphasis> be replaced by the <emphasis>2</emphasis> character sequence <emphasis>rn</emphasis>, and
|
||||
the <emphasis>3</emphasis> character sequence <emphasis>iii</emphasis> <emphasis role="strong">may</emphasis> be replaced by the <emphasis>1</emphasis> character
|
||||
sequence <emphasis>m</emphasis>.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_history">
|
||||
<title>HISTORY</title>
|
||||
<simpara>The unicharambigs file first appeared in Tesseract 3.00; prior to that, a
|
||||
similar format, called DangAmbigs (<emphasis>dangerous ambiguities</emphasis>) was used: the
|
||||
format was almost identical, except only mandatory replacements could be
|
||||
specified, and field 5 was absent.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_bugs">
|
||||
<title>BUGS</title>
|
||||
<simpara>This is a documentation "bug": it’s not currently clear what should be done
|
||||
in the case of ligatures (such as <emphasis>fi</emphasis>) which may also appear as regular
|
||||
letters in the unicharset.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_see_also">
|
||||
<title>SEE ALSO</title>
|
||||
<simpara>tesseract(1), unicharset(5)</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_author">
|
||||
<title>AUTHOR</title>
|
||||
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
||||
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,219 +1,219 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
||||
<?asciidoc-toc?>
|
||||
<?asciidoc-numbered?>
|
||||
<refentry lang="en">
|
||||
<refentryinfo>
|
||||
<title>UNICHARSET(5)</title>
|
||||
</refentryinfo>
|
||||
<refmeta>
|
||||
<refentrytitle>unicharset</refentrytitle>
|
||||
<manvolnum>5</manvolnum>
|
||||
<refmiscinfo class="source"> </refmiscinfo>
|
||||
<refmiscinfo class="manual"> </refmiscinfo>
|
||||
</refmeta>
|
||||
<refnamediv>
|
||||
<refname>unicharset</refname>
|
||||
<refpurpose>character properties file used by tesseract(1)</refpurpose>
|
||||
</refnamediv>
|
||||
<refsect1 id="_description">
|
||||
<title>DESCRIPTION</title>
|
||||
<simpara>Tesseract’s unicharset file contains information on each symbol
|
||||
(unichar) the Tesseract OCR engine is trained to recognize.</simpara>
|
||||
<simpara>A unicharset file (i.e. <emphasis>eng.unicharset</emphasis>) is distributed as part of a
|
||||
Tesseract language pack (i.e. <emphasis>eng.traineddata</emphasis>). For information on
|
||||
extracting the unicharset file, see combine_tessdata(1).</simpara>
|
||||
<simpara>The first line of a unicharset file contains the number of unichars in
|
||||
the file. After this line, each subsequent line provides information for
|
||||
a single unichar. The first such line contains a placeholder reserved for
|
||||
the space character. Each unichar is referred to within Tesseract by its
|
||||
Unichar ID, which is the line number (minus 1) within the unicharset file.
|
||||
Therefore, space gets unichar 0.</simpara>
|
||||
<simpara>Each unichar line in the unicharset file (v2+) may have four space-separated fields:</simpara>
|
||||
<literallayout class="monospaced">'character' 'properties' 'script' 'id'</literallayout>
|
||||
<simpara>Starting with Tesseract v3.02, more information may be given for each unichar:</simpara>
|
||||
<literallayout class="monospaced">'character' 'properties' 'glyph_metrics' 'script' 'other_case' 'direction' 'mirror' 'normed_form'</literallayout>
|
||||
<simpara>Entries:</simpara>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>character</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
The UTF-8 encoded string to be produced for this unichar.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>properties</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
An integer mask of character properties, one per bit.
|
||||
From least to most significant bit, these are: isalpha, islower, isupper,
|
||||
isdigit, ispunctuation.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>glyph_metrics</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Ten comma-separated integers representing various standards
|
||||
for where this glyph is to be found within a baseline-normalized coordinate
|
||||
system where 128 is normalized to x-height.
|
||||
</simpara>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<simpara>
|
||||
min_bottom, max_bottom: the ranges where the bottom of the character can
|
||||
be found.
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
min_top, max_top: the ranges where the top of the character may be found.
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
min_width, max_width: horizontal width of the character.
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
min_bearing, max_bearing: how far from the usual start position does the
|
||||
leftmost part of the character begin.
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
min_advance, max_advance: how far from the printer’s cell left do we
|
||||
advance to begin the next character.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>script</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Name of the script (Latin, Common, Greek, Cyrillic, Han, null).
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>other_case</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
The Unichar ID of the other case version of this character
|
||||
(upper or lower).
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>direction</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
The Unicode BiDi direction of this character, as defined by
|
||||
ICU’s enum UCharDirection. (0 = Left to Right, 1 = Right to Left,
|
||||
2 = European Number…)
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>mirror</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
The Unichar ID of the BiDirectional mirror of this character.
|
||||
For example the mirror of open paren is close paren, but Latin Capital C
|
||||
has no mirror, so it remains a Latin Capital C.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>normed_form</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
The UTF-8 representation of a "normalized form" of this unichar
|
||||
for the purpose of blaming a module for errors given ground truth text.
|
||||
For instance, a left or right single quote may normalize to an ASCII quote.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
<refsect1 id="_example_v2">
|
||||
<title>EXAMPLE (v2)</title>
|
||||
<literallayout class="monospaced">; 10 Common 46
|
||||
b 3 Latin 59
|
||||
W 5 Latin 40
|
||||
7 8 Common 66
|
||||
= 0 Common 93</literallayout>
|
||||
<simpara>";" is a punctuation character. Its properties are thus represented by the
|
||||
binary number 10000 (10 in hexadecimal).</simpara>
|
||||
<simpara>"b" is an alphabetic character and a lower case character. Its properties are
|
||||
thus represented by the binary number 00011 (3 in hexadecimal).</simpara>
|
||||
<simpara>"W" is an alphabetic character and an upper case character. Its properties are
|
||||
thus represented by the binary number 00101 (5 in hexadecimal).</simpara>
|
||||
<simpara>"7" is just a digit. Its properties are thus represented by the binary number
|
||||
01000 (8 in hexadecimal).</simpara>
|
||||
<simpara>"=" is not punctuation nor a digit nor an alphabetic character. Its properties
|
||||
are thus represented by the binary number 00000 (0 in hexadecimal).</simpara>
|
||||
<simpara>Japanese or Chinese alphabetic character properties are represented by the
|
||||
binary number 00001 (1 in hexadecimal): they are alphabetic, but neither
|
||||
upper nor lower case.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_example_v3_02">
|
||||
<title>EXAMPLE (v3.02)</title>
|
||||
<literallayout class="monospaced">110
|
||||
NULL 0 NULL 0
|
||||
N 5 59,68,216,255,87,236,0,27,104,227 Latin 11 0 1 N
|
||||
Y 5 59,68,216,255,91,205,0,47,91,223 Latin 33 0 2 Y
|
||||
1 8 59,69,203,255,45,128,0,66,74,173 Common 3 2 3 1
|
||||
9 8 18,66,203,255,89,156,0,39,104,173 Common 4 2 4 9
|
||||
a 3 58,65,186,198,85,164,0,26,97,185 Latin 56 0 5 a
|
||||
. . .</literallayout>
|
||||
</refsect1>
|
||||
<refsect1 id="_caveats">
|
||||
<title>CAVEATS</title>
|
||||
<simpara>Although the unicharset reader maintains the ability to read unicharsets
|
||||
of older formats and will assign default values to missing fields,
|
||||
the accuracy will be degraded.</simpara>
|
||||
<simpara>Further, most other data files are indexed by the unicharset file,
|
||||
so changing it without re-generating the others is likely to have dire
|
||||
consequences.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_history">
|
||||
<title>HISTORY</title>
|
||||
<simpara>The unicharset format first appeared with Tesseract 2.00, which was the
|
||||
first version to support languages other than English. The unicharset file
|
||||
contained only the first two fields, and the "ispunctuation" property was
|
||||
absent (punctuation was regarded as "0", as "=" is in the above example).</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_see_also">
|
||||
<title>SEE ALSO</title>
|
||||
<simpara>tesseract(1), combine_tessdata(1), unicharset_extractor(1)</simpara>
|
||||
<simpara><ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract</ulink></simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_author">
|
||||
<title>AUTHOR</title>
|
||||
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
||||
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
||||
<?asciidoc-toc?>
|
||||
<?asciidoc-numbered?>
|
||||
<refentry lang="en">
|
||||
<refentryinfo>
|
||||
<title>UNICHARSET(5)</title>
|
||||
</refentryinfo>
|
||||
<refmeta>
|
||||
<refentrytitle>unicharset</refentrytitle>
|
||||
<manvolnum>5</manvolnum>
|
||||
<refmiscinfo class="source"> </refmiscinfo>
|
||||
<refmiscinfo class="manual"> </refmiscinfo>
|
||||
</refmeta>
|
||||
<refnamediv>
|
||||
<refname>unicharset</refname>
|
||||
<refpurpose>character properties file used by tesseract(1)</refpurpose>
|
||||
</refnamediv>
|
||||
<refsect1 id="_description">
|
||||
<title>DESCRIPTION</title>
|
||||
<simpara>Tesseract’s unicharset file contains information on each symbol
|
||||
(unichar) the Tesseract OCR engine is trained to recognize.</simpara>
|
||||
<simpara>A unicharset file (i.e. <emphasis>eng.unicharset</emphasis>) is distributed as part of a
|
||||
Tesseract language pack (i.e. <emphasis>eng.traineddata</emphasis>). For information on
|
||||
extracting the unicharset file, see combine_tessdata(1).</simpara>
|
||||
<simpara>The first line of a unicharset file contains the number of unichars in
|
||||
the file. After this line, each subsequent line provides information for
|
||||
a single unichar. The first such line contains a placeholder reserved for
|
||||
the space character. Each unichar is referred to within Tesseract by its
|
||||
Unichar ID, which is the line number (minus 1) within the unicharset file.
|
||||
Therefore, space gets unichar 0.</simpara>
|
||||
<simpara>Each unichar line in the unicharset file (v2+) may have four space-separated fields:</simpara>
|
||||
<literallayout class="monospaced">'character' 'properties' 'script' 'id'</literallayout>
|
||||
<simpara>Starting with Tesseract v3.02, more information may be given for each unichar:</simpara>
|
||||
<literallayout class="monospaced">'character' 'properties' 'glyph_metrics' 'script' 'other_case' 'direction' 'mirror' 'normed_form'</literallayout>
|
||||
<simpara>Entries:</simpara>
|
||||
<variablelist>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>character</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
The UTF-8 encoded string to be produced for this unichar.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>properties</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
An integer mask of character properties, one per bit.
|
||||
From least to most significant bit, these are: isalpha, islower, isupper,
|
||||
isdigit, ispunctuation.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>glyph_metrics</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Ten comma-separated integers representing various standards
|
||||
for where this glyph is to be found within a baseline-normalized coordinate
|
||||
system where 128 is normalized to x-height.
|
||||
</simpara>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<simpara>
|
||||
min_bottom, max_bottom: the ranges where the bottom of the character can
|
||||
be found.
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
min_top, max_top: the ranges where the top of the character may be found.
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
min_width, max_width: horizontal width of the character.
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
min_bearing, max_bearing: how far from the usual start position does the
|
||||
leftmost part of the character begin.
|
||||
</simpara>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<simpara>
|
||||
min_advance, max_advance: how far from the printer’s cell left do we
|
||||
advance to begin the next character.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>script</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
Name of the script (Latin, Common, Greek, Cyrillic, Han, null).
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>other_case</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
The Unichar ID of the other case version of this character
|
||||
(upper or lower).
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>direction</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
The Unicode BiDi direction of this character, as defined by
|
||||
ICU’s enum UCharDirection. (0 = Left to Right, 1 = Right to Left,
|
||||
2 = European Number…)
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>mirror</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
The Unichar ID of the BiDirectional mirror of this character.
|
||||
For example the mirror of open paren is close paren, but Latin Capital C
|
||||
has no mirror, so it remains a Latin Capital C.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
<varlistentry>
|
||||
<term>
|
||||
<emphasis>normed_form</emphasis>
|
||||
</term>
|
||||
<listitem>
|
||||
<simpara>
|
||||
The UTF-8 representation of a "normalized form" of this unichar
|
||||
for the purpose of blaming a module for errors given ground truth text.
|
||||
For instance, a left or right single quote may normalize to an ASCII quote.
|
||||
</simpara>
|
||||
</listitem>
|
||||
</varlistentry>
|
||||
</variablelist>
|
||||
</refsect1>
|
||||
<refsect1 id="_example_v2">
|
||||
<title>EXAMPLE (v2)</title>
|
||||
<literallayout class="monospaced">; 10 Common 46
|
||||
b 3 Latin 59
|
||||
W 5 Latin 40
|
||||
7 8 Common 66
|
||||
= 0 Common 93</literallayout>
|
||||
<simpara>";" is a punctuation character. Its properties are thus represented by the
|
||||
binary number 10000 (10 in hexadecimal).</simpara>
|
||||
<simpara>"b" is an alphabetic character and a lower case character. Its properties are
|
||||
thus represented by the binary number 00011 (3 in hexadecimal).</simpara>
|
||||
<simpara>"W" is an alphabetic character and an upper case character. Its properties are
|
||||
thus represented by the binary number 00101 (5 in hexadecimal).</simpara>
|
||||
<simpara>"7" is just a digit. Its properties are thus represented by the binary number
|
||||
01000 (8 in hexadecimal).</simpara>
|
||||
<simpara>"=" is not punctuation nor a digit nor an alphabetic character. Its properties
|
||||
are thus represented by the binary number 00000 (0 in hexadecimal).</simpara>
|
||||
<simpara>Japanese or Chinese alphabetic character properties are represented by the
|
||||
binary number 00001 (1 in hexadecimal): they are alphabetic, but neither
|
||||
upper nor lower case.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_example_v3_02">
|
||||
<title>EXAMPLE (v3.02)</title>
|
||||
<literallayout class="monospaced">110
|
||||
NULL 0 NULL 0
|
||||
N 5 59,68,216,255,87,236,0,27,104,227 Latin 11 0 1 N
|
||||
Y 5 59,68,216,255,91,205,0,47,91,223 Latin 33 0 2 Y
|
||||
1 8 59,69,203,255,45,128,0,66,74,173 Common 3 2 3 1
|
||||
9 8 18,66,203,255,89,156,0,39,104,173 Common 4 2 4 9
|
||||
a 3 58,65,186,198,85,164,0,26,97,185 Latin 56 0 5 a
|
||||
. . .</literallayout>
|
||||
</refsect1>
|
||||
<refsect1 id="_caveats">
|
||||
<title>CAVEATS</title>
|
||||
<simpara>Although the unicharset reader maintains the ability to read unicharsets
|
||||
of older formats and will assign default values to missing fields,
|
||||
the accuracy will be degraded.</simpara>
|
||||
<simpara>Further, most other data files are indexed by the unicharset file,
|
||||
so changing it without re-generating the others is likely to have dire
|
||||
consequences.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_history">
|
||||
<title>HISTORY</title>
|
||||
<simpara>The unicharset format first appeared with Tesseract 2.00, which was the
|
||||
first version to support languages other than English. The unicharset file
|
||||
contained only the first two fields, and the "ispunctuation" property was
|
||||
absent (punctuation was regarded as "0", as "=" is in the above example).</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_see_also">
|
||||
<title>SEE ALSO</title>
|
||||
<simpara>tesseract(1), combine_tessdata(1), unicharset_extractor(1)</simpara>
|
||||
<simpara><ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract</ulink></simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_author">
|
||||
<title>AUTHOR</title>
|
||||
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
||||
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
|
@ -11,9 +11,9 @@ SYNOPSIS
|
||||
|
||||
DESCRIPTION
|
||||
-----------
|
||||
Tesseract needs to know the set of possible characters it can output.
|
||||
To generate the unicharset data file, use the unicharset_extractor
|
||||
program on the same training pages bounding box files as used for
|
||||
Tesseract needs to know the set of possible characters it can output.
|
||||
To generate the unicharset data file, use the unicharset_extractor
|
||||
program on the same training pages bounding box files as used for
|
||||
clustering:
|
||||
|
||||
unicharset_extractor fontfile_1.box fontfile_2.box ...
|
||||
@ -21,19 +21,19 @@ clustering:
|
||||
The unicharset will be put into the file 'dir/unicharset', or simply
|
||||
'./unicharset' if no output directory is provided.
|
||||
|
||||
Tesseract also needs to have access to character properties isalpha,
|
||||
isdigit, isupper, islower, ispunctuation. All of this auxiliary data
|
||||
Tesseract also needs to have access to character properties isalpha,
|
||||
isdigit, isupper, islower, ispunctuation. All of this auxiliary data
|
||||
and more is encoded in this file. (See unicharset(5))
|
||||
|
||||
If your system supports the wctype functions, these values will be set
|
||||
automatically by unicharset_extractor and there is no need to edit the
|
||||
unicharset file. On some older systems (e.g. Windows 95), the unicharset
|
||||
If your system supports the wctype functions, these values will be set
|
||||
automatically by unicharset_extractor and there is no need to edit the
|
||||
unicharset file. On some older systems (e.g. Windows 95), the unicharset
|
||||
file must be edited by hand to add these property description codes.
|
||||
|
||||
*NOTE* The unicharset file must be regenerated whenever inttemp, normproto
|
||||
and pffmtable are generated (i.e. they must all be recreated when the box
|
||||
file is changed) as they have to be in sync. This is made easier than in
|
||||
previous versions by running unicharset_extractor before mftraining and
|
||||
*NOTE* The unicharset file must be regenerated whenever inttemp, normproto
|
||||
and pffmtable are generated (i.e. they must all be recreated when the box
|
||||
file is changed) as they have to be in sync. This is made easier than in
|
||||
previous versions by running unicharset_extractor before mftraining and
|
||||
cntraining, and giving the unicharset to mftraining.
|
||||
|
||||
SEE ALSO
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,63 +1,63 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
||||
<?asciidoc-toc?>
|
||||
<?asciidoc-numbered?>
|
||||
<refentry lang="en">
|
||||
<refentryinfo>
|
||||
<title>UNICHARSET_EXTRACTOR(1)</title>
|
||||
</refentryinfo>
|
||||
<refmeta>
|
||||
<refentrytitle>unicharset_extractor</refentrytitle>
|
||||
<manvolnum>1</manvolnum>
|
||||
<refmiscinfo class="source"> </refmiscinfo>
|
||||
<refmiscinfo class="manual"> </refmiscinfo>
|
||||
</refmeta>
|
||||
<refnamediv>
|
||||
<refname>unicharset_extractor</refname>
|
||||
<refpurpose>extract unicharset from Tesseract boxfiles</refpurpose>
|
||||
</refnamediv>
|
||||
<refsynopsisdiv id="_synopsis">
|
||||
<simpara><emphasis role="strong">unicharset_extractor</emphasis> <emphasis>[-D dir]</emphasis> <emphasis>FILE</emphasis>…</simpara>
|
||||
</refsynopsisdiv>
|
||||
<refsect1 id="_description">
|
||||
<title>DESCRIPTION</title>
|
||||
<simpara>Tesseract needs to know the set of possible characters it can output.
|
||||
To generate the unicharset data file, use the unicharset_extractor
|
||||
program on the same training pages bounding box files as used for
|
||||
clustering:</simpara>
|
||||
<literallayout class="monospaced">unicharset_extractor fontfile_1.box fontfile_2.box ...</literallayout>
|
||||
<simpara>The unicharset will be put into the file <emphasis>dir/unicharset</emphasis>, or simply
|
||||
<emphasis>./unicharset</emphasis> if no output directory is provided.</simpara>
|
||||
<simpara>Tesseract also needs to have access to character properties isalpha,
|
||||
isdigit, isupper, islower, ispunctuation. All of this auxiliary data
|
||||
and more is encoded in this file. (See unicharset(5))</simpara>
|
||||
<simpara>If your system supports the wctype functions, these values will be set
|
||||
automatically by unicharset_extractor and there is no need to edit the
|
||||
unicharset file. On some older systems (e.g. Windows 95), the unicharset
|
||||
file must be edited by hand to add these property description codes.</simpara>
|
||||
<simpara><emphasis role="strong">NOTE</emphasis> The unicharset file must be regenerated whenever inttemp, normproto
|
||||
and pffmtable are generated (i.e. they must all be recreated when the box
|
||||
file is changed) as they have to be in sync. This is made easier than in
|
||||
previous versions by running unicharset_extractor before mftraining and
|
||||
cntraining, and giving the unicharset to mftraining.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_see_also">
|
||||
<title>SEE ALSO</title>
|
||||
<simpara>tesseract(1), unicharset(5)</simpara>
|
||||
<simpara><ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract</ulink></simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_history">
|
||||
<title>HISTORY</title>
|
||||
<simpara>unicharset_extractor first appeared in Tesseract 2.00.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_copying">
|
||||
<title>COPYING</title>
|
||||
<simpara>Copyright (C) 2006, Google Inc.
|
||||
Licensed under the Apache License, Version 2.0</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_author">
|
||||
<title>AUTHOR</title>
|
||||
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
||||
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
||||
<?asciidoc-toc?>
|
||||
<?asciidoc-numbered?>
|
||||
<refentry lang="en">
|
||||
<refentryinfo>
|
||||
<title>UNICHARSET_EXTRACTOR(1)</title>
|
||||
</refentryinfo>
|
||||
<refmeta>
|
||||
<refentrytitle>unicharset_extractor</refentrytitle>
|
||||
<manvolnum>1</manvolnum>
|
||||
<refmiscinfo class="source"> </refmiscinfo>
|
||||
<refmiscinfo class="manual"> </refmiscinfo>
|
||||
</refmeta>
|
||||
<refnamediv>
|
||||
<refname>unicharset_extractor</refname>
|
||||
<refpurpose>extract unicharset from Tesseract boxfiles</refpurpose>
|
||||
</refnamediv>
|
||||
<refsynopsisdiv id="_synopsis">
|
||||
<simpara><emphasis role="strong">unicharset_extractor</emphasis> <emphasis>[-D dir]</emphasis> <emphasis>FILE</emphasis>…</simpara>
|
||||
</refsynopsisdiv>
|
||||
<refsect1 id="_description">
|
||||
<title>DESCRIPTION</title>
|
||||
<simpara>Tesseract needs to know the set of possible characters it can output.
|
||||
To generate the unicharset data file, use the unicharset_extractor
|
||||
program on the same training pages bounding box files as used for
|
||||
clustering:</simpara>
|
||||
<literallayout class="monospaced">unicharset_extractor fontfile_1.box fontfile_2.box ...</literallayout>
|
||||
<simpara>The unicharset will be put into the file <emphasis>dir/unicharset</emphasis>, or simply
|
||||
<emphasis>./unicharset</emphasis> if no output directory is provided.</simpara>
|
||||
<simpara>Tesseract also needs to have access to character properties isalpha,
|
||||
isdigit, isupper, islower, ispunctuation. All of this auxiliary data
|
||||
and more is encoded in this file. (See unicharset(5))</simpara>
|
||||
<simpara>If your system supports the wctype functions, these values will be set
|
||||
automatically by unicharset_extractor and there is no need to edit the
|
||||
unicharset file. On some older systems (e.g. Windows 95), the unicharset
|
||||
file must be edited by hand to add these property description codes.</simpara>
|
||||
<simpara><emphasis role="strong">NOTE</emphasis> The unicharset file must be regenerated whenever inttemp, normproto
|
||||
and pffmtable are generated (i.e. they must all be recreated when the box
|
||||
file is changed) as they have to be in sync. This is made easier than in
|
||||
previous versions by running unicharset_extractor before mftraining and
|
||||
cntraining, and giving the unicharset to mftraining.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_see_also">
|
||||
<title>SEE ALSO</title>
|
||||
<simpara>tesseract(1), unicharset(5)</simpara>
|
||||
<simpara><ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract</ulink></simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_history">
|
||||
<title>HISTORY</title>
|
||||
<simpara>unicharset_extractor first appeared in Tesseract 2.00.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_copying">
|
||||
<title>COPYING</title>
|
||||
<simpara>Copyright (C) 2006, Google Inc.
|
||||
Licensed under the Apache License, Version 2.0</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_author">
|
||||
<title>AUTHOR</title>
|
||||
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
||||
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,69 +1,69 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
||||
<?asciidoc-toc?>
|
||||
<?asciidoc-numbered?>
|
||||
<refentry lang="en">
|
||||
<refentryinfo>
|
||||
<title>WORDLIST2DAWG(1)</title>
|
||||
</refentryinfo>
|
||||
<refmeta>
|
||||
<refentrytitle>wordlist2dawg</refentrytitle>
|
||||
<manvolnum>1</manvolnum>
|
||||
<refmiscinfo class="source"> </refmiscinfo>
|
||||
<refmiscinfo class="manual"> </refmiscinfo>
|
||||
</refmeta>
|
||||
<refnamediv>
|
||||
<refname>wordlist2dawg</refname>
|
||||
<refpurpose>convert a wordlist to a DAWG for Tesseract</refpurpose>
|
||||
</refnamediv>
|
||||
<refsynopsisdiv id="_synopsis">
|
||||
<simpara><emphasis role="strong">wordlist2dawg</emphasis> <emphasis>WORDLIST</emphasis> <emphasis>DAWG</emphasis> <emphasis>lang.unicharset</emphasis></simpara>
|
||||
<simpara><emphasis role="strong">wordlist2dawg</emphasis> -t <emphasis>WORDLIST</emphasis> <emphasis>DAWG</emphasis> <emphasis>lang.unicharset</emphasis></simpara>
|
||||
<simpara><emphasis role="strong">wordlist2dawg</emphasis> -r 1 <emphasis>WORDLIST</emphasis> <emphasis>DAWG</emphasis> <emphasis>lang.unicharset</emphasis></simpara>
|
||||
<simpara><emphasis role="strong">wordlist2dawg</emphasis> -r 2 <emphasis>WORDLIST</emphasis> <emphasis>DAWG</emphasis> <emphasis>lang.unicharset</emphasis></simpara>
|
||||
<simpara><emphasis role="strong">wordlist2dawg</emphasis> -l &lt;short&gt; &lt;long&gt; <emphasis>WORDLIST</emphasis> <emphasis>DAWG</emphasis> <emphasis>lang.unicharset</emphasis></simpara>
|
||||
</refsynopsisdiv>
|
||||
<refsect1 id="_description">
|
||||
<title>DESCRIPTION</title>
|
||||
<simpara>wordlist2dawg(1) converts a wordlist to a Directed Acyclic Word Graph
|
||||
(DAWG) for use with Tesseract. A DAWG is a compressed, space and time
|
||||
efficient representation of a word list.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_options">
|
||||
<title>OPTIONS</title>
|
||||
<simpara>-t
|
||||
Verify that a given dawg file is equivalent to a given wordlist.</simpara>
|
||||
<simpara>-r 1
|
||||
Reverse a word if it contains an RTL character.</simpara>
|
||||
<simpara>-r 2
|
||||
Reverse all words.</simpara>
|
||||
<simpara>-l &lt;short&gt; &lt;long&gt;
|
||||
Produce a file with several dawgs in it, one each for words
|
||||
of length &lt;short&gt;, &lt;short+1&gt;,… &lt;long&gt;</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_arguments">
|
||||
<title>ARGUMENTS</title>
|
||||
<simpara><emphasis>WORDLIST</emphasis>
|
||||
A plain text file in UTF-8, one word per line.</simpara>
|
||||
<simpara><emphasis>DAWG</emphasis>
|
||||
The output DAWG to write.</simpara>
|
||||
<simpara><emphasis>lang.unicharset</emphasis>
|
||||
The unicharset of the language. This is the unicharset
|
||||
generated by mftraining(1).</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_see_also">
|
||||
<title>SEE ALSO</title>
|
||||
<simpara>tesseract(1), combine_tessdata(1), dawg2wordlist(1)</simpara>
|
||||
<simpara><ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract</ulink></simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_copying">
|
||||
<title>COPYING</title>
|
||||
<simpara>Copyright (C) 2006 Google, Inc.
|
||||
Licensed under the Apache License, Version 2.0</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_author">
|
||||
<title>AUTHOR</title>
|
||||
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
||||
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
||||
<?asciidoc-toc?>
|
||||
<?asciidoc-numbered?>
|
||||
<refentry lang="en">
|
||||
<refentryinfo>
|
||||
<title>WORDLIST2DAWG(1)</title>
|
||||
</refentryinfo>
|
||||
<refmeta>
|
||||
<refentrytitle>wordlist2dawg</refentrytitle>
|
||||
<manvolnum>1</manvolnum>
|
||||
<refmiscinfo class="source"> </refmiscinfo>
|
||||
<refmiscinfo class="manual"> </refmiscinfo>
|
||||
</refmeta>
|
||||
<refnamediv>
|
||||
<refname>wordlist2dawg</refname>
|
||||
<refpurpose>convert a wordlist to a DAWG for Tesseract</refpurpose>
|
||||
</refnamediv>
|
||||
<refsynopsisdiv id="_synopsis">
|
||||
<simpara><emphasis role="strong">wordlist2dawg</emphasis> <emphasis>WORDLIST</emphasis> <emphasis>DAWG</emphasis> <emphasis>lang.unicharset</emphasis></simpara>
|
||||
<simpara><emphasis role="strong">wordlist2dawg</emphasis> -t <emphasis>WORDLIST</emphasis> <emphasis>DAWG</emphasis> <emphasis>lang.unicharset</emphasis></simpara>
|
||||
<simpara><emphasis role="strong">wordlist2dawg</emphasis> -r 1 <emphasis>WORDLIST</emphasis> <emphasis>DAWG</emphasis> <emphasis>lang.unicharset</emphasis></simpara>
|
||||
<simpara><emphasis role="strong">wordlist2dawg</emphasis> -r 2 <emphasis>WORDLIST</emphasis> <emphasis>DAWG</emphasis> <emphasis>lang.unicharset</emphasis></simpara>
|
||||
<simpara><emphasis role="strong">wordlist2dawg</emphasis> -l &lt;short&gt; &lt;long&gt; <emphasis>WORDLIST</emphasis> <emphasis>DAWG</emphasis> <emphasis>lang.unicharset</emphasis></simpara>
|
||||
</refsynopsisdiv>
|
||||
<refsect1 id="_description">
|
||||
<title>DESCRIPTION</title>
|
||||
<simpara>wordlist2dawg(1) converts a wordlist to a Directed Acyclic Word Graph
|
||||
(DAWG) for use with Tesseract. A DAWG is a compressed, space and time
|
||||
efficient representation of a word list.</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_options">
|
||||
<title>OPTIONS</title>
|
||||
<simpara>-t
|
||||
Verify that a given dawg file is equivalent to a given wordlist.</simpara>
|
||||
<simpara>-r 1
|
||||
Reverse a word if it contains an RTL character.</simpara>
|
||||
<simpara>-r 2
|
||||
Reverse all words.</simpara>
|
||||
<simpara>-l &lt;short&gt; &lt;long&gt;
|
||||
Produce a file with several dawgs in it, one each for words
|
||||
of length &lt;short&gt;, &lt;short+1&gt;,… &lt;long&gt;</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_arguments">
|
||||
<title>ARGUMENTS</title>
|
||||
<simpara><emphasis>WORDLIST</emphasis>
|
||||
A plain text file in UTF-8, one word per line.</simpara>
|
||||
<simpara><emphasis>DAWG</emphasis>
|
||||
The output DAWG to write.</simpara>
|
||||
<simpara><emphasis>lang.unicharset</emphasis>
|
||||
The unicharset of the language. This is the unicharset
|
||||
generated by mftraining(1).</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_see_also">
|
||||
<title>SEE ALSO</title>
|
||||
<simpara>tesseract(1), combine_tessdata(1), dawg2wordlist(1)</simpara>
|
||||
<simpara><ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract</ulink></simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_copying">
|
||||
<title>COPYING</title>
|
||||
<simpara>Copyright (C) 2006 Google, Inc.
|
||||
Licensed under the Apache License, Version 2.0</simpara>
|
||||
</refsect1>
|
||||
<refsect1 id="_author">
|
||||
<title>AUTHOR</title>
|
||||
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
||||
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
||||
</refsect1>
|
||||
</refentry>
|
||||
|
Loading…
Reference in New Issue
Block a user