mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-23 06:57:50 +08:00
425 lines
17 KiB
XML
425 lines
17 KiB
XML
<?xml version="1.0" encoding="UTF-8"?>
|
|
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
|
<?asciidoc-toc?>
|
|
<?asciidoc-numbered?>
|
|
<refentry lang="en">
|
|
<refentryinfo>
|
|
<title>TESSERACT(1)</title>
|
|
</refentryinfo>
|
|
<refmeta>
|
|
<refentrytitle>tesseract</refentrytitle>
|
|
<manvolnum>1</manvolnum>
|
|
<refmiscinfo class="source"> </refmiscinfo>
|
|
<refmiscinfo class="manual"> </refmiscinfo>
|
|
</refmeta>
|
|
<refnamediv>
|
|
<refname>tesseract</refname>
|
|
<refpurpose>command-line OCR engine</refpurpose>
|
|
</refnamediv>
|
|
<refsynopsisdiv id="_synopsis">
|
|
<simpara><emphasis role="strong">tesseract</emphasis> <emphasis>imagename</emphasis>|<emphasis>stdin</emphasis> <emphasis>outputbase</emphasis>|<emphasis>stdout</emphasis> [options…] [configfile…]</simpara>
|
|
</refsynopsisdiv>
|
|
<refsect1 id="_description">
|
|
<title>DESCRIPTION</title>
|
|
<simpara>tesseract(1) is a commercial quality OCR engine originally developed at HP
|
|
between 1985 and 1995. In 1995, this engine was among the top 3 evaluated by
|
|
UNLV. It was open-sourced by HP and UNLV in 2005, and has been developed
|
|
at Google since then.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_in_out_arguments">
|
|
<title>IN/OUT ARGUMENTS</title>
|
|
<variablelist>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>imagename</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
The name of the input image. Most image file formats (anything
|
|
readable by Leptonica) are supported.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>stdin</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
Instruction to read data from standard input
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>outputbase</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
The basename of the output file (to which the appropriate extension
|
|
will be appended). By default the output will be named <emphasis>outputbase.txt</emphasis>.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>stdout</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
Instruction to send output data to standard output
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
</variablelist>
|
|
</refsect1>
|
|
<refsect1 id="_options">
|
|
<title>OPTIONS</title>
|
|
<variablelist>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>--tessdata-dir /path</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
Specify the location of the tessdata directory
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>--user-words /path/to/file</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
Specify the location of user words file
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>--user-patterns /path/to/file</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
Specify the location of user patterns file
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>-c configvar=value</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
Set value for control parameter. Multiple -c arguments are allowed.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>-l lang</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
The language to use. If none is specified, English is assumed.
|
|
Multiple languages may be specified, separated by plus characters.
|
|
Tesseract uses 3-character ISO 639-2 language codes. (See LANGUAGES)
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>-psm N</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
Set Tesseract to only run a subset of layout analysis and assume
|
|
a certain form of image. The options for <emphasis role="strong">N</emphasis> are:
|
|
</simpara>
|
|
<literallayout class="monospaced">0 = Orientation and script detection (OSD) only.
|
|
1 = Automatic page segmentation with OSD.
|
|
2 = Automatic page segmentation, but no OSD, or OCR.
|
|
3 = Fully automatic page segmentation, but no OSD. (Default)
|
|
4 = Assume a single column of text of variable sizes.
|
|
5 = Assume a single uniform block of vertically aligned text.
|
|
6 = Assume a single uniform block of text.
|
|
7 = Treat the image as a single text line.
|
|
8 = Treat the image as a single word.
|
|
9 = Treat the image as a single word in a circle.
|
|
10 = Treat the image as a single character.</literallayout>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>configfile</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
The name of a config to use. A config is a plaintext file which
|
|
contains a list of variables and their values, one per line, with a
|
|
space separating variable from value. Interesting config files
|
|
include:<?asciidoc-br?>
|
|
</simpara>
|
|
<itemizedlist>
|
|
<listitem>
|
|
<simpara>
|
|
hocr - Output in hOCR format instead of as a text file.
|
|
</simpara>
|
|
</listitem>
|
|
<listitem>
|
|
<simpara>
|
|
pdf - Output in pdf instead of a text file.
|
|
</simpara>
|
|
</listitem>
|
|
</itemizedlist>
|
|
</listitem>
|
|
</varlistentry>
|
|
</variablelist>
|
|
<simpara><emphasis role="strong">Nota Bene:</emphasis> The options <emphasis>-l lang</emphasis> and <emphasis>-psm N</emphasis> must occur
|
|
before any <emphasis>configfile</emphasis>.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_single_options">
|
|
<title>SINGLE OPTIONS</title>
|
|
<variablelist>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>-v</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
Returns the current version of the tesseract(1) executable.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>--list-langs</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
List available languages for the tesseract engine. Can be used with --tessdata-dir.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>--print-parameters</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
Print tesseract parameters to stdout.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
</variablelist>
|
|
</refsect1>
|
|
<refsect1 id="_languages">
|
|
<title>LANGUAGES</title>
|
|
<simpara>There are currently language packs available for the following languages
|
|
(in <ulink url="https://github.com/tesseract-ocr/tessdata">https://github.com/tesseract-ocr/tessdata</ulink>):</simpara>
|
|
<simpara><emphasis role="strong">afr</emphasis> (Afrikaans)
|
|
<emphasis role="strong">amh</emphasis> (Amharic)
|
|
<emphasis role="strong">ara</emphasis> (Arabic)
|
|
<emphasis role="strong">asm</emphasis> (Assamese)
|
|
<emphasis role="strong">aze</emphasis> (Azerbaijani)
|
|
<emphasis role="strong">aze_cyrl</emphasis> (Azerbaijani - Cyrillic)
|
|
<emphasis role="strong">bel</emphasis> (Belarusian)
|
|
<emphasis role="strong">ben</emphasis> (Bengali)
|
|
<emphasis role="strong">bod</emphasis> (Tibetan)
|
|
<emphasis role="strong">bos</emphasis> (Bosnian)
|
|
<emphasis role="strong">bul</emphasis> (Bulgarian)
|
|
<emphasis role="strong">cat</emphasis> (Catalan; Valencian)
|
|
<emphasis role="strong">ceb</emphasis> (Cebuano)
|
|
<emphasis role="strong">ces</emphasis> (Czech)
|
|
<emphasis role="strong">chi_sim</emphasis> (Chinese - Simplified)
|
|
<emphasis role="strong">chi_tra</emphasis> (Chinese - Traditional)
|
|
<emphasis role="strong">chr</emphasis> (Cherokee)
|
|
<emphasis role="strong">cym</emphasis> (Welsh)
|
|
<emphasis role="strong">dan</emphasis> (Danish)
|
|
<emphasis role="strong">dan_frak</emphasis> (Danish - Fraktur)
|
|
<emphasis role="strong">deu</emphasis> (German)
|
|
<emphasis role="strong">deu_frak</emphasis> (German - Fraktur)
|
|
<emphasis role="strong">dzo</emphasis> (Dzongkha)
|
|
<emphasis role="strong">ell</emphasis> (Greek, Modern (1453-))
|
|
<emphasis role="strong">eng</emphasis> (English)
|
|
<emphasis role="strong">enm</emphasis> (English, Middle (1100-1500))
|
|
<emphasis role="strong">epo</emphasis> (Esperanto)
|
|
<emphasis role="strong">equ</emphasis> (Math / equation detection module)
|
|
<emphasis role="strong">est</emphasis> (Estonian)
|
|
<emphasis role="strong">eus</emphasis> (Basque)
|
|
<emphasis role="strong">fas</emphasis> (Persian)
|
|
<emphasis role="strong">fin</emphasis> (Finnish)
|
|
<emphasis role="strong">fra</emphasis> (French)
|
|
<emphasis role="strong">frk</emphasis> (Frankish)
|
|
<emphasis role="strong">frm</emphasis> (French, Middle (ca.1400-1600))
|
|
<emphasis role="strong">gle</emphasis> (Irish)
|
|
<emphasis role="strong">glg</emphasis> (Galician)
|
|
<emphasis role="strong">grc</emphasis> (Greek, Ancient (to 1453))
|
|
<emphasis role="strong">guj</emphasis> (Gujarati)
|
|
<emphasis role="strong">hat</emphasis> (Haitian; Haitian Creole)
|
|
<emphasis role="strong">heb</emphasis> (Hebrew)
|
|
<emphasis role="strong">hin</emphasis> (Hindi)
|
|
<emphasis role="strong">hrv</emphasis> (Croatian)
|
|
<emphasis role="strong">hun</emphasis> (Hungarian)
|
|
<emphasis role="strong">iku</emphasis> (Inuktitut)
|
|
<emphasis role="strong">ind</emphasis> (Indonesian)
|
|
<emphasis role="strong">isl</emphasis> (Icelandic)
|
|
<emphasis role="strong">ita</emphasis> (Italian)
|
|
<emphasis role="strong">ita_old</emphasis> (Italian - Old)
|
|
<emphasis role="strong">jav</emphasis> (Javanese)
|
|
<emphasis role="strong">jpn</emphasis> (Japanese)
|
|
<emphasis role="strong">kan</emphasis> (Kannada)
|
|
<emphasis role="strong">kat</emphasis> (Georgian)
|
|
<emphasis role="strong">kat_old</emphasis> (Georgian - Old)
|
|
<emphasis role="strong">kaz</emphasis> (Kazakh)
|
|
<emphasis role="strong">khm</emphasis> (Central Khmer)
|
|
<emphasis role="strong">kir</emphasis> (Kirghiz; Kyrgyz)
|
|
<emphasis role="strong">kor</emphasis> (Korean)
|
|
<emphasis role="strong">kur</emphasis> (Kurdish)
|
|
<emphasis role="strong">lao</emphasis> (Lao)
|
|
<emphasis role="strong">lat</emphasis> (Latin)
|
|
<emphasis role="strong">lav</emphasis> (Latvian)
|
|
<emphasis role="strong">lit</emphasis> (Lithuanian)
|
|
<emphasis role="strong">mal</emphasis> (Malayalam)
|
|
<emphasis role="strong">mar</emphasis> (Marathi)
|
|
<emphasis role="strong">mkd</emphasis> (Macedonian)
|
|
<emphasis role="strong">mlt</emphasis> (Maltese)
|
|
<emphasis role="strong">msa</emphasis> (Malay)
|
|
<emphasis role="strong">mya</emphasis> (Burmese)
|
|
<emphasis role="strong">nep</emphasis> (Nepali)
|
|
<emphasis role="strong">nld</emphasis> (Dutch; Flemish)
|
|
<emphasis role="strong">nor</emphasis> (Norwegian)
|
|
<emphasis role="strong">ori</emphasis> (Oriya)
|
|
<emphasis role="strong">osd</emphasis> (Orientation and script detection module)
|
|
<emphasis role="strong">pan</emphasis> (Panjabi; Punjabi)
|
|
<emphasis role="strong">pol</emphasis> (Polish)
|
|
<emphasis role="strong">por</emphasis> (Portuguese)
|
|
<emphasis role="strong">pus</emphasis> (Pushto; Pashto)
|
|
<emphasis role="strong">ron</emphasis> (Romanian; Moldavian; Moldovan)
|
|
<emphasis role="strong">rus</emphasis> (Russian)
|
|
<emphasis role="strong">san</emphasis> (Sanskrit)
|
|
<emphasis role="strong">sin</emphasis> (Sinhala; Sinhalese)
|
|
<emphasis role="strong">slk</emphasis> (Slovak)
|
|
<emphasis role="strong">slk_frak</emphasis> (Slovak - Fraktur)
|
|
<emphasis role="strong">slv</emphasis> (Slovenian)
|
|
<emphasis role="strong">spa</emphasis> (Spanish; Castilian)
|
|
<emphasis role="strong">spa_old</emphasis> (Spanish; Castilian - Old)
|
|
<emphasis role="strong">sqi</emphasis> (Albanian)
|
|
<emphasis role="strong">srp</emphasis> (Serbian)
|
|
<emphasis role="strong">srp_latn</emphasis> (Serbian - Latin)
|
|
<emphasis role="strong">swa</emphasis> (Swahili)
|
|
<emphasis role="strong">swe</emphasis> (Swedish)
|
|
<emphasis role="strong">syr</emphasis> (Syriac)
|
|
<emphasis role="strong">tam</emphasis> (Tamil)
|
|
<emphasis role="strong">tel</emphasis> (Telugu)
|
|
<emphasis role="strong">tgk</emphasis> (Tajik)
|
|
<emphasis role="strong">tgl</emphasis> (Tagalog)
|
|
<emphasis role="strong">tha</emphasis> (Thai)
|
|
<emphasis role="strong">tir</emphasis> (Tigrinya)
|
|
<emphasis role="strong">tur</emphasis> (Turkish)
|
|
<emphasis role="strong">uig</emphasis> (Uighur; Uyghur)
|
|
<emphasis role="strong">ukr</emphasis> (Ukrainian)
|
|
<emphasis role="strong">urd</emphasis> (Urdu)
|
|
<emphasis role="strong">uzb</emphasis> (Uzbek)
|
|
<emphasis role="strong">uzb_cyrl</emphasis> (Uzbek - Cyrillic)
|
|
<emphasis role="strong">vie</emphasis> (Vietnamese)
|
|
<emphasis role="strong">yid</emphasis> (Yiddish)</simpara>
|
|
<simpara>To use a non-standard language pack named <emphasis role="strong">foo.traineddata</emphasis>, set the
|
|
<emphasis role="strong">TESSDATA_PREFIX</emphasis> environment variable so the file can be found at
|
|
<emphasis role="strong">TESSDATA_PREFIX</emphasis>/tessdata/<emphasis role="strong">foo</emphasis>.traineddata and give Tesseract the
|
|
argument <emphasis>-l foo</emphasis>.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_config_files_and_augmenting_with_user_data">
|
|
<title>CONFIG FILES AND AUGMENTING WITH USER DATA</title>
|
|
<simpara>Tesseract config files consist of lines with variable-value pairs (space
|
|
separated). The variables are documented as flags in the source code like
|
|
the following one in tesseractclass.h:</simpara>
|
|
<simpara>STRING_VAR_H(tessedit_char_blacklist, "",
|
|
"Blacklist of chars not to recognize");</simpara>
|
|
<simpara>These variables may enable or disable various features of the engine, and
|
|
may cause it to load (or not load) various data. For instance, let’s suppose
|
|
you want to OCR in English, but suppress the normal dictionary and load an
|
|
alternative word list and an alternative list of patterns — these two files
|
|
are the most commonly used extra data files.</simpara>
|
|
<simpara>If your language pack is in /path/to/eng.traineddata and the hocr config
|
|
is in /path/to/configs/hocr then create three new files:</simpara>
|
|
<simpara>/path/to/eng.user-words:</simpara>
|
|
<blockquote>
|
|
<literallayout>the
|
|
quick
|
|
brown
|
|
fox
|
|
jumped</literallayout>
|
|
</blockquote>
|
|
<simpara>/path/to/eng.user-patterns:</simpara>
|
|
<blockquote>
|
|
<literallayout>1-\d\d\d-GOOG-411
|
|
www.\n\\\*.com</literallayout>
|
|
</blockquote>
|
|
<simpara>/path/to/configs/bazaar:</simpara>
|
|
<blockquote>
|
|
<literallayout>load_system_dawg F
|
|
load_freq_dawg F
|
|
user_words_suffix user-words
|
|
user_patterns_suffix user-patterns</literallayout>
|
|
</blockquote>
|
|
<simpara>Now, if you pass the word <emphasis>bazaar</emphasis> as a trailing command line parameter
|
|
to Tesseract, Tesseract will not bother loading the system dictionary nor
|
|
the dictionary of frequent words and will load and use the eng.user-words
|
|
and eng.user-patterns files you provided. The former is a simple word list,
|
|
one per line. The format of the latter is documented in dict/trie.h
|
|
on read_pattern_list().</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_history">
|
|
<title>HISTORY</title>
|
|
<simpara>The engine was developed at Hewlett Packard Laboratories Bristol and at
|
|
Hewlett Packard Co, Greeley Colorado between 1985 and 1994, with some more
|
|
changes made in 1996 to port to Windows, and some C++izing in 1998. A
|
|
lot of the code was written in C, and then some more was written in C++.
|
|
The C++ code makes heavy use of a list system using macros. This predates
|
|
stl, was portable before stl, and is more efficient than stl lists, but has
|
|
the big negative that if you do get a segmentation violation, it is hard to
|
|
debug.</simpara>
|
|
<simpara>Version 2.00 brought Unicode (UTF-8) support, six languages, and the ability
|
|
to train Tesseract.</simpara>
|
|
<simpara>Tesseract was included in UNLV’s Fourth Annual Test of OCR Accuracy.
|
|
See <ulink url="https://github.com/tesseract-ocr/docs/blob/master/AT-1995.pdf">https://github.com/tesseract-ocr/docs/blob/master/AT-1995.pdf</ulink>. With Tesseract 2.00,
|
|
scripts are now included to allow anyone to reproduce some of these tests.
|
|
See <ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TestingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TestingTesseract</ulink> for more
|
|
details.</simpara>
|
|
<simpara>Tesseract 3.00 adds a number of new languages, including Chinese, Japanese,
|
|
and Korean. It also introduces a new, single-file based system of managing
|
|
language data.</simpara>
|
|
<simpara>Tesseract 3.02 adds BiDirectional text support, the ability to recognize
|
|
multiple languages in a single image, and improved layout analysis.</simpara>
|
|
<simpara>For further details, see the file ReleaseNotes included with the distribution.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_resources">
|
|
<title>RESOURCES</title>
|
|
<simpara>Main web site: <ulink url="https://github.com/tesseract-ocr">https://github.com/tesseract-ocr</ulink><?asciidoc-br?>
|
|
Information on training: <ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract</ulink></simpara>
|
|
</refsect1>
|
|
<refsect1 id="_see_also">
|
|
<title>SEE ALSO</title>
|
|
<simpara>ambiguous_words(1), cntraining(1), combine_tessdata(1), dawg2wordlist(1),
|
|
shape_training(1), mftraining(1), unicharambigs(5), unicharset(5),
|
|
unicharset_extractor(1), wordlist2dawg(1)</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_author">
|
|
<title>AUTHOR</title>
|
|
<simpara>Tesseract development was led at Hewlett-Packard and Google by Ray Smith.
|
|
The development team has included:</simpara>
|
|
<simpara>Ahmad Abdulkader, Chris Newton, Dan Johnson, Dar-Shyang Lee, David Eger,
|
|
Eric Wiseblatt, Faisal Shafait, Hiroshi Takenaka, Joe Liu, Joern Wanke,
|
|
Mark Seaman, Mickey Namiki, Nicholas Beato, Oded Fuhrmann, Phil Cheatle,
|
|
Pingping Xiu, Pong Eksombatchai (Chantat), Ranjith Unnikrishnan, Raquel
|
|
Romano, Ray Smith, Rika Antonova, Robert Moss, Samuel Charron, Sheelagh
|
|
Lloyd, Shobhit Saxena, and Thomas Kielbus.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_copying">
|
|
<title>COPYING</title>
|
|
<simpara>Licensed under the Apache License, Version 2.0</simpara>
|
|
</refsect1>
|
|
</refentry>
|