mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-15 01:24:54 +08:00
366 lines
14 KiB
XML
366 lines
14 KiB
XML
<?xml version="1.0" encoding="UTF-8"?>
|
|
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
|
<?asciidoc-toc?>
|
|
<?asciidoc-numbered?>
|
|
<refentry lang="en">
|
|
<refentryinfo>
|
|
<title>TESSERACT(1)</title>
|
|
</refentryinfo>
|
|
<refmeta>
|
|
<refentrytitle>tesseract</refentrytitle>
|
|
<manvolnum>1</manvolnum>
|
|
<refmiscinfo class="source"> </refmiscinfo>
|
|
<refmiscinfo class="manual"> </refmiscinfo>
|
|
</refmeta>
|
|
<refnamediv>
|
|
<refname>tesseract</refname>
|
|
<refpurpose>command-line OCR engine</refpurpose>
|
|
</refnamediv>
|
|
<refsynopsisdiv id="_synopsis">
|
|
<simpara><emphasis role="strong">tesseract</emphasis> <emphasis>imagename</emphasis>|<emphasis>stdin</emphasis> <emphasis>outputbase</emphasis>|<emphasis>stdout</emphasis> [options…] [configfile…]</simpara>
|
|
</refsynopsisdiv>
|
|
<refsect1 id="_description">
|
|
<title>DESCRIPTION</title>
|
|
<simpara>tesseract(1) is a commercial quality OCR engine originally developed at HP
|
|
between 1985 and 1995. In 1995, this engine was among the top 3 evaluated by
|
|
UNLV. It was open-sourced by HP and UNLV in 2005, and has been developed
|
|
at Google since then.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_in_out_arguments">
|
|
<title>IN/OUT ARGUMENTS</title>
|
|
<variablelist>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>imagename</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
The name of the input image. Most image file formats (anything
|
|
readable by Leptonica) are supported.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>stdin</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
Instruction to read data from standard input
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>outputbase</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
The basename of the output file (to which the appropriate extension
|
|
will be appended). By default the output will be named <emphasis>outputbase.txt</emphasis>.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>stdout</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
Instruction to send output data to standard output
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
</variablelist>
|
|
</refsect1>
|
|
<refsect1 id="_options">
|
|
<title>OPTIONS</title>
|
|
<variablelist>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>--tessdata-dir /path</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
Specify the location of tessdata path
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>--user-words /path/to/file</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
Specify the location of user words file
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>--user-patterns /path/to/file</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
Specify the location of user patterns file
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>-c configvar=value</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
Set value for control parameter. Multiple -c arguments are allowed.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>-l lang</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
The language to use. If none is specified, English is assumed.
|
|
Multiple languages may be specified, separated by plus characters.
|
|
Tesseract uses 3-character ISO 639-2 language codes. (See LANGUAGES)
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>-psm N</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
Set Tesseract to only run a subset of layout analysis and assume
|
|
a certain form of image. The options for <emphasis role="strong">N</emphasis> are:
|
|
</simpara>
|
|
<literallayout class="monospaced">0 = Orientation and script detection (OSD) only.
|
|
1 = Automatic page segmentation with OSD.
|
|
2 = Automatic page segmentation, but no OSD, or OCR.
|
|
3 = Fully automatic page segmentation, but no OSD. (Default)
|
|
4 = Assume a single column of text of variable sizes.
|
|
5 = Assume a single uniform block of vertically aligned text.
|
|
6 = Assume a single uniform block of text.
|
|
7 = Treat the image as a single text line.
|
|
8 = Treat the image as a single word.
|
|
9 = Treat the image as a single word in a circle.
|
|
10 = Treat the image as a single character.</literallayout>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>configfile</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
The name of a config to use. A config is a plaintext file which
|
|
contains a list of variables and their values, one per line, with a
|
|
space separating variable from value. Interesting config files
|
|
include:<?asciidoc-br?>
|
|
</simpara>
|
|
<itemizedlist>
|
|
<listitem>
|
|
<simpara>
|
|
hocr - Output in hOCR format instead of as a text file.
|
|
</simpara>
|
|
</listitem>
|
|
<listitem>
|
|
<simpara>
|
|
pdf - Output in PDF format instead of as a text file.
|
|
</simpara>
|
|
</listitem>
|
|
</itemizedlist>
|
|
</listitem>
|
|
</varlistentry>
|
|
</variablelist>
|
|
<simpara><emphasis role="strong">Nota Bene:</emphasis> The options <emphasis>-l lang</emphasis> and <emphasis>-psm N</emphasis> must occur
|
|
before any <emphasis>configfile</emphasis>.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_single_options">
|
|
<title>SINGLE OPTIONS</title>
|
|
<variablelist>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>-v</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
Returns the current version of the tesseract(1) executable.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>--list-langs</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
List available languages for tesseract engine. Can be used with --tessdata-dir.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>--print-parameters</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
Print tesseract parameters to stdout.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
</variablelist>
|
|
</refsect1>
|
|
<refsect1 id="_languages">
|
|
<title>LANGUAGES</title>
|
|
<simpara>There are currently language packs available for the following languages:</simpara>
|
|
<simpara><emphasis role="strong">ara</emphasis> (Arabic),
|
|
<emphasis role="strong">aze</emphasis> (Azerbaijani),
|
|
<emphasis role="strong">bul</emphasis> (Bulgarian),
|
|
<emphasis role="strong">cat</emphasis> (Catalan),
|
|
<emphasis role="strong">ces</emphasis> (Czech),
|
|
<emphasis role="strong">chi_sim</emphasis> (Simplified Chinese),
|
|
<emphasis role="strong">chi_tra</emphasis> (Traditional Chinese),
|
|
<emphasis role="strong">chr</emphasis> (Cherokee),
|
|
<emphasis role="strong">dan</emphasis> (Danish),
|
|
<emphasis role="strong">dan-frak</emphasis> (Danish (Fraktur)),
|
|
<emphasis role="strong">deu</emphasis> (German),
|
|
<emphasis role="strong">ell</emphasis> (Greek),
|
|
<emphasis role="strong">eng</emphasis> (English),
|
|
<emphasis role="strong">enm</emphasis> (Middle English),
|
|
<emphasis role="strong">epo</emphasis> (Esperanto),
|
|
<emphasis role="strong">est</emphasis> (Estonian),
|
|
<emphasis role="strong">fin</emphasis> (Finnish),
|
|
<emphasis role="strong">fra</emphasis> (French),
|
|
<emphasis role="strong">frm</emphasis> (Middle French),
|
|
<emphasis role="strong">glg</emphasis> (Galician),
|
|
<emphasis role="strong">heb</emphasis> (Hebrew),
|
|
<emphasis role="strong">hin</emphasis> (Hindi),
|
|
<emphasis role="strong">hrv</emphasis> (Croatian),
|
|
<emphasis role="strong">hun</emphasis> (Hungarian),
|
|
<emphasis role="strong">ind</emphasis> (Indonesian),
|
|
<emphasis role="strong">ita</emphasis> (Italian),
|
|
<emphasis role="strong">jpn</emphasis> (Japanese),
|
|
<emphasis role="strong">kor</emphasis> (Korean),
|
|
<emphasis role="strong">lav</emphasis> (Latvian),
|
|
<emphasis role="strong">lit</emphasis> (Lithuanian),
|
|
<emphasis role="strong">nld</emphasis> (Dutch),
|
|
<emphasis role="strong">nor</emphasis> (Norwegian),
|
|
<emphasis role="strong">pol</emphasis> (Polish),
|
|
<emphasis role="strong">por</emphasis> (Portuguese),
|
|
<emphasis role="strong">ron</emphasis> (Romanian),
|
|
<emphasis role="strong">rus</emphasis> (Russian),
|
|
<emphasis role="strong">slk</emphasis> (Slovakian),
|
|
<emphasis role="strong">slv</emphasis> (Slovenian),
|
|
<emphasis role="strong">sqi</emphasis> (Albanian),
|
|
<emphasis role="strong">spa</emphasis> (Spanish),
|
|
<emphasis role="strong">srp</emphasis> (Serbian),
|
|
<emphasis role="strong">swe</emphasis> (Swedish),
|
|
<emphasis role="strong">tam</emphasis> (Tamil),
|
|
<emphasis role="strong">tel</emphasis> (Telugu),
|
|
<emphasis role="strong">tgl</emphasis> (Tagalog),
|
|
<emphasis role="strong">tha</emphasis> (Thai),
|
|
<emphasis role="strong">tur</emphasis> (Turkish),
|
|
<emphasis role="strong">ukr</emphasis> (Ukrainian),
|
|
<emphasis role="strong">vie</emphasis> (Vietnamese)</simpara>
|
|
<simpara>To use a non-standard language pack named <emphasis role="strong">foo.traineddata</emphasis>, set the
|
|
<emphasis role="strong">TESSDATA_PREFIX</emphasis> environment variable so the file can be found at
|
|
<emphasis role="strong">TESSDATA_PREFIX</emphasis>/tessdata/<emphasis role="strong">foo</emphasis>.traineddata and give Tesseract the
|
|
argument <emphasis>-l foo</emphasis>.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_config_files_and_augmenting_with_user_data">
|
|
<title>CONFIG FILES AND AUGMENTING WITH USER DATA</title>
|
|
<simpara>Tesseract config files consist of lines with variable-value pairs (space
|
|
separated). The variables are documented as flags in the source code like
|
|
the following one in tesseractclass.h:</simpara>
|
|
<simpara>STRING_VAR_H(tessedit_char_blacklist, "",
|
|
"Blacklist of chars not to recognize");</simpara>
|
|
<simpara>These variables may enable or disable various features of the engine, and
|
|
may cause it to load (or not load) various data. For instance, let’s suppose
|
|
you want to OCR in English, but suppress the normal dictionary and load an
|
|
alternative word list and an alternative list of patterns — these two files
|
|
are the most commonly used extra data files.</simpara>
|
|
<simpara>If your language pack is in /path/to/eng.traineddata and the hocr config
|
|
is in /path/to/configs/hocr then create three new files:</simpara>
|
|
<simpara>/path/to/eng.user-words:</simpara>
|
|
<blockquote>
|
|
<literallayout>the
|
|
quick
|
|
brown
|
|
fox
|
|
jumped</literallayout>
|
|
</blockquote>
|
|
<simpara>/path/to/eng.user-patterns:</simpara>
|
|
<blockquote>
|
|
<literallayout>1-\d\d\d-GOOG-411
|
|
www.\n\\\*.com</literallayout>
|
|
</blockquote>
|
|
<simpara>/path/to/configs/bazaar:</simpara>
|
|
<blockquote>
|
|
<literallayout>load_system_dawg F
|
|
load_freq_dawg F
|
|
user_words_suffix user-words
|
|
user_patterns_suffix user-patterns</literallayout>
|
|
</blockquote>
|
|
<simpara>Now, if you pass the word <emphasis>bazaar</emphasis> as a trailing command line parameter
|
|
to Tesseract, Tesseract will not bother loading the system dictionary nor
|
|
the dictionary of frequent words and will load and use the eng.user-words
|
|
and eng.user-patterns files you provided. The former is a simple word list,
|
|
one per line. The format of the latter is documented in dict/trie.h
|
|
on read_pattern_list().</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_history">
|
|
<title>HISTORY</title>
|
|
<simpara>The engine was developed at Hewlett Packard Laboratories Bristol and at
|
|
Hewlett Packard Co, Greeley Colorado between 1985 and 1994, with some more
|
|
changes made in 1996 to port to Windows, and some C++izing in 1998. A
|
|
lot of the code was written in C, and then some more was written in C++.
|
|
The C++ code makes heavy use of a list system using macros. This predates
|
|
stl, was portable before stl, and is more efficient than stl lists, but has
|
|
the big negative that if you do get a segmentation violation, it is hard to
|
|
debug.</simpara>
|
|
<simpara>Version 2.00 brought Unicode (UTF-8) support, six languages, and the ability
|
|
to train Tesseract.</simpara>
|
|
<simpara>Tesseract was included in UNLV’s Fourth Annual Test of OCR Accuracy.
|
|
See <ulink url="http://www.isri.unlv.edu/downloads/AT-1995.pdf">http://www.isri.unlv.edu/downloads/AT-1995.pdf</ulink>. With Tesseract 2.00,
|
|
scripts are now included to allow anyone to reproduce some of these tests.
|
|
See <ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TestingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TestingTesseract</ulink> for more
|
|
details.</simpara>
|
|
<simpara>Tesseract 3.00 adds a number of new languages, including Chinese, Japanese,
|
|
and Korean. It also introduces a new, single-file based system of managing
|
|
language data.</simpara>
|
|
<simpara>Tesseract 3.02 adds BiDirectional text support, the ability to recognize
|
|
multiple languages in a single image, and improved layout analysis.</simpara>
|
|
<simpara>For further details, see the file ReleaseNotes included with the distribution.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_resources">
|
|
<title>RESOURCES</title>
|
|
<simpara>Main web site: <ulink url="https://github.com/tesseract-ocr">https://github.com/tesseract-ocr</ulink><?asciidoc-br?>
|
|
Information on training: <ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract</ulink></simpara>
|
|
</refsect1>
|
|
<refsect1 id="_see_also">
|
|
<title>SEE ALSO</title>
|
|
<simpara>ambiguous_words(1), cntraining(1), combine_tessdata(1), dawg2wordlist(1),
|
|
shape_training(1), mftraining(1), unicharambigs(5), unicharset(5),
|
|
unicharset_extractor(1), wordlist2dawg(1)</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_author">
|
|
<title>AUTHOR</title>
|
|
<simpara>Tesseract development was led at Hewlett-Packard and Google by Ray Smith.
|
|
The development team has included:</simpara>
|
|
<simpara>Ahmad Abdulkader, Chris Newton, Dan Johnson, Dar-Shyang Lee, David Eger,
|
|
Eric Wiseblatt, Faisal Shafait, Hiroshi Takenaka, Joe Liu, Joern Wanke,
|
|
Mark Seaman, Mickey Namiki, Nicholas Beato, Oded Fuhrmann, Phil Cheatle,
|
|
Pingping Xiu, Pong Eksombatchai (Chantat), Ranjith Unnikrishnan, Raquel
|
|
Romano, Ray Smith, Rika Antonova, Robert Moss, Samuel Charron, Sheelagh
|
|
Lloyd, Shobhit Saxena, and Thomas Kielbus.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_copying">
|
|
<title>COPYING</title>
|
|
<simpara>Licensed under the Apache License, Version 2.0</simpara>
|
|
</refsect1>
|
|
</refentry>
|