mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-01 07:59:05 +08:00
58e06c8c45
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@670 d0cd1f9f-072b-0410-8dd7-cf729c803f20
226 lines
9.9 KiB
XML
226 lines
9.9 KiB
XML
<?xml version="1.0" encoding="UTF-8"?>
|
|
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
|
<?asciidoc-toc?>
|
|
<?asciidoc-numbered?>
|
|
<refentry lang="en">
|
|
<refmeta>
|
|
<refentrytitle>tesseract</refentrytitle>
|
|
<manvolnum>1</manvolnum>
|
|
<refmiscinfo class="source"> </refmiscinfo>
|
|
<refmiscinfo class="manual"> </refmiscinfo>
|
|
</refmeta>
|
|
<refnamediv>
|
|
<refname>tesseract</refname>
|
|
<refpurpose>command-line OCR engine</refpurpose>
|
|
</refnamediv>
|
|
<refsynopsisdiv id="_synopsis">
|
|
<simpara><emphasis role="strong">tesseract</emphasis> <emphasis>imagename</emphasis> <emphasis>outbase</emphasis> [<emphasis>-l lang</emphasis>] [<emphasis>-psm N</emphasis>] [<emphasis>configfile</emphasis> …]</simpara>
|
|
</refsynopsisdiv>
|
|
<refsect1 id="_description">
|
|
<title>DESCRIPTION</title>
|
|
<simpara>tesseract(1) is a commercial quality OCR engine originally developed at HP
|
|
between 1985 and 1995. In 1995, this engine was among the top 3 evaluated by
|
|
UNLV. It was open-sourced by HP and UNLV in 2005, and has been developed
|
|
at Google since then.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_options">
|
|
<title>OPTIONS</title>
|
|
<variablelist>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>imagename</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
The name of the input image. Most image file formats (anything
|
|
readable by Leptonica) are supported.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>outbase</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
The basename of the output file (to which the appropriate extension
|
|
will be appended). By default the output will be named <emphasis>outbase.txt</emphasis>.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>-l lang</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
The language to use. If none is specified, English is assumed.
|
|
Multiple languages may be specified, separated by plus characters.
|
|
Tesseract uses 3-character ISO 639-2 language codes. (See LANGUAGES.)
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>-psm N</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
Set Tesseract to only run a subset of layout analysis and assume a certain form of image. The options for <emphasis role="strong">N</emphasis> are:
|
|
</simpara>
|
|
<literallayout class="monospaced">0 = Orientation and script detection (OSD) only.
|
|
1 = Automatic page segmentation with OSD.
|
|
2 = Automatic page segmentation, but no OSD or OCR.
|
|
3 = Fully automatic page segmentation, but no OSD. (Default)
|
|
4 = Assume a single column of text of variable sizes.
|
|
5 = Assume a single uniform block of vertically aligned text.
|
|
6 = Assume a single uniform block of text.
|
|
7 = Treat the image as a single text line.
|
|
8 = Treat the image as a single word.
|
|
9 = Treat the image as a single word in a circle.
|
|
10 = Treat the image as a single character.</literallayout>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>-v</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
Returns the current version of the tesseract(1) executable.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
<emphasis>configfile</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
The name of a config to use. A config is a plaintext file which
|
|
contains a list of variables and their values, one per line, with a
|
|
space separating variable from value. Interesting config files
|
|
include:<?asciidoc-br?>
|
|
</simpara>
|
|
<itemizedlist>
|
|
<listitem>
|
|
<simpara>
|
|
hocr - Output in hOCR format instead of as a text file.
|
|
</simpara>
|
|
</listitem>
|
|
</itemizedlist>
|
|
</listitem>
|
|
</varlistentry>
|
|
</variablelist>
|
|
<simpara><emphasis role="strong">Nota Bene:</emphasis> The options <emphasis>-l lang</emphasis> and <emphasis>-psm N</emphasis> must occur
|
|
before any <emphasis>configfile</emphasis>.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_languages">
|
|
<title>LANGUAGES</title>
|
|
<simpara>There are currently language packs available for the following languages:</simpara>
|
|
<simpara><emphasis role="strong">ara</emphasis> (Arabic),
|
|
<emphasis role="strong">aze</emphasis> (Azerbaijani),
|
|
<emphasis role="strong">bul</emphasis> (Bulgarian),
|
|
<emphasis role="strong">cat</emphasis> (Catalan),
|
|
<emphasis role="strong">ces</emphasis> (Czech),
|
|
<emphasis role="strong">chi_sim</emphasis> (Simplified Chinese),
|
|
<emphasis role="strong">chi_tra</emphasis> (Traditional Chinese),
|
|
<emphasis role="strong">chr</emphasis> (Cherokee),
|
|
<emphasis role="strong">dan</emphasis> (Danish),
|
|
<emphasis role="strong">dan-frak</emphasis> (Danish (Fraktur)),
|
|
<emphasis role="strong">deu</emphasis> (German),
|
|
<emphasis role="strong">ell</emphasis> (Greek),
|
|
<emphasis role="strong">eng</emphasis> (English),
|
|
<emphasis role="strong">enm</emphasis> (Middle English (1100-1500)),
|
|
<emphasis role="strong">epo</emphasis> (Esperanto),
|
|
<emphasis role="strong">est</emphasis> (Estonian),
|
|
<emphasis role="strong">fin</emphasis> (Finnish),
|
|
<emphasis role="strong">fra</emphasis> (French),
|
|
<emphasis role="strong">frm</emphasis> (Middle French (ca. 1400-1600)),
|
|
<emphasis role="strong">glg</emphasis> (Galician),
|
|
<emphasis role="strong">heb</emphasis> (Hebrew),
|
|
<emphasis role="strong">hin</emphasis> (Hindi),
|
|
<emphasis role="strong">hrv</emphasis> (Croatian),
|
|
<emphasis role="strong">hun</emphasis> (Hungarian),
|
|
<emphasis role="strong">ind</emphasis> (Indonesian),
|
|
<emphasis role="strong">ita</emphasis> (Italian),
|
|
<emphasis role="strong">jpn</emphasis> (Japanese),
|
|
<emphasis role="strong">kor</emphasis> (Korean),
|
|
<emphasis role="strong">lav</emphasis> (Latvian),
|
|
<emphasis role="strong">lit</emphasis> (Lithuanian),
|
|
<emphasis role="strong">nld</emphasis> (Dutch),
|
|
<emphasis role="strong">nor</emphasis> (Norwegian),
|
|
<emphasis role="strong">pol</emphasis> (Polish),
|
|
<emphasis role="strong">por</emphasis> (Portuguese),
|
|
<emphasis role="strong">ron</emphasis> (Romanian),
|
|
<emphasis role="strong">rus</emphasis> (Russian),
|
|
<emphasis role="strong">slk</emphasis> (Slovakian),
|
|
<emphasis role="strong">slv</emphasis> (Slovenian),
|
|
<emphasis role="strong">sqi</emphasis> (Albanian),
|
|
<emphasis role="strong">spa</emphasis> (Spanish),
|
|
<emphasis role="strong">srp</emphasis> (Serbian),
|
|
<emphasis role="strong">swe</emphasis> (Swedish),
|
|
<emphasis role="strong">tam</emphasis> (Tamil),
|
|
<emphasis role="strong">tel</emphasis> (Telugu),
|
|
<emphasis role="strong">tgl</emphasis> (Tagalog),
|
|
<emphasis role="strong">tha</emphasis> (Thai),
|
|
<emphasis role="strong">tur</emphasis> (Turkish),
|
|
<emphasis role="strong">ukr</emphasis> (Ukrainian),
|
|
<emphasis role="strong">vie</emphasis> (Vietnamese)</simpara>
|
|
<simpara>To use a non-standard language pack named <emphasis role="strong">foo.traineddata</emphasis>, set the
|
|
<emphasis role="strong">TESSDATA_PREFIX</emphasis> environment variable so the file can be found at
|
|
<emphasis role="strong">TESSDATA_PREFIX</emphasis>/tessdata/<emphasis role="strong">foo</emphasis>.traineddata and give Tesseract the
|
|
argument <emphasis>-l foo</emphasis>.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_history">
|
|
<title>HISTORY</title>
|
|
<simpara>The engine was developed at Hewlett Packard Laboratories Bristol and at
|
|
Hewlett Packard Co, Greeley Colorado between 1985 and 1994, with some more
|
|
changes made in 1996 to port to Windows, and some C++izing in 1998. A
|
|
lot of the code was written in C, and then some more was written in C++.
|
|
The C++ code makes heavy use of a list system using macros. This predates
|
|
STL, was portable before STL, and is more efficient than STL lists, but has
|
|
the big negative that if you do get a segmentation violation, it is hard to
|
|
debug.</simpara>
|
|
<simpara>Version 2.00 brought Unicode (UTF-8) support, six languages, and the ability
|
|
to train Tesseract.</simpara>
|
|
<simpara>Tesseract was included in UNLV’s Fourth Annual Test of OCR Accuracy.
|
|
See <ulink url="http://www.isri.unlv.edu/downloads/AT-1995.pdf">http://www.isri.unlv.edu/downloads/AT-1995.pdf</ulink>. With Tesseract 2.00,
|
|
scripts are now included to allow anyone to reproduce some of these tests.
|
|
See <ulink url="http://code.google.com/p/tesseract-ocr/wiki/TestingTesseract">http://code.google.com/p/tesseract-ocr/wiki/TestingTesseract</ulink> for more
|
|
details.</simpara>
|
|
<simpara>Tesseract 3.00 adds a number of new languages, including Chinese, Japanese,
|
|
and Korean. It also introduces a new, single-file based system of managing
|
|
language data.</simpara>
|
|
<simpara>Tesseract 3.02 adds BiDirectional text support, the ability to recognize
|
|
multiple languages in a single image, and improved layout analysis.</simpara>
|
|
<simpara>For further details, see the file ReleaseNotes included with the distribution.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_resources">
|
|
<title>RESOURCES</title>
|
|
<simpara>Main web site: <ulink url="http://code.google.com/p/tesseract-ocr/">http://code.google.com/p/tesseract-ocr/</ulink><?asciidoc-br?>
|
|
Information on training: <ulink url="http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3">http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3</ulink></simpara>
|
|
</refsect1>
|
|
<refsect1 id="_see_also">
|
|
<title>SEE ALSO</title>
|
|
<simpara>ambiguous_words(1), cntraining(1), combine_tessdata(1), dawg2wordlist(1),
|
|
shape_training(1), mftraining(1), unicharambigs(5), unicharset(5),
|
|
unicharset_extractor(1), wordlist2dawg(1)</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_author">
|
|
<title>AUTHOR</title>
|
|
<simpara>Tesseract development was led at Hewlett-Packard and Google by Ray Smith.
|
|
The development team has included:</simpara>
|
|
<simpara>Ahmad Abdulkader, Chris Newton, Dan Johnson, Dar-Shyang Lee, David Eger,
|
|
Eric Wiseblatt, Faisal Shafait, Hiroshi Takenaka, Joe Liu, Joern Wanke,
|
|
Mark Seaman, Mickey Namiki, Nicholas Beato, Oded Fuhrmann, Phil Cheatle,
|
|
Pingping Xiu, Pong Eksombatchai (Chantat), Ranjith Unnikrishnan, Raquel
|
|
Romano, Ray Smith, Rika Antonova, Robert Moss, Samuel Charron, Sheelagh
|
|
Lloyd, Shobhit Saxena, and Thomas Kielbus.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_copying">
|
|
<title>COPYING</title>
|
|
<simpara>Licensed under the Apache License, Version 2.0</simpara>
|
|
</refsect1>
|
|
</refentry>
|