mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-03 23:27:50 +08:00
282 lines
8.8 KiB
XML
282 lines
8.8 KiB
XML
<?xml version="1.0" encoding="UTF-8"?>
|
|
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
|
<?asciidoc-toc?>
|
|
<?asciidoc-numbered?>
|
|
<refentry lang="en">
|
|
<refentryinfo>
|
|
<title>COMBINE_TESSDATA(1)</title>
|
|
</refentryinfo>
|
|
<refmeta>
|
|
<refentrytitle>combine_tessdata</refentrytitle>
|
|
<manvolnum>1</manvolnum>
|
|
<refmiscinfo class="source"> </refmiscinfo>
|
|
<refmiscinfo class="manual"> </refmiscinfo>
|
|
</refmeta>
|
|
<refnamediv>
|
|
<refname>combine_tessdata</refname>
|
|
<refpurpose>combine/extract/overwrite Tesseract data</refpurpose>
|
|
</refnamediv>
|
|
<refsynopsisdiv id="_synopsis">
|
|
<simpara><emphasis role="strong">combine_tessdata</emphasis> [<emphasis>OPTION</emphasis>] <emphasis>FILE</emphasis>…</simpara>
|
|
</refsynopsisdiv>
|
|
<refsect1 id="_description">
|
|
<title>DESCRIPTION</title>
|
|
<simpara>combine_tessdata(1) is the main program to combine/extract/overwrite
|
|
tessdata components in [lang].traineddata files.</simpara>
|
|
<simpara>To combine all the individual tessdata components (unicharset, DAWGs,
|
|
classifier templates, ambiguities, language configs) located at, say,
|
|
/home/$USER/temp/eng.* run:</simpara>
|
|
<literallayout class="monospaced">combine_tessdata /home/$USER/temp/eng.</literallayout>
|
|
<simpara>The result will be a combined tessdata file /home/$USER/temp/eng.traineddata</simpara>
|
|
<simpara>Specify option -e if you would like to extract individual components
|
|
from a combined traineddata file. For example, to extract the language config
|
|
file and the unicharset from tessdata/eng.traineddata run:</simpara>
|
|
<literallayout class="monospaced">combine_tessdata -e tessdata/eng.traineddata \
|
|
/home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset</literallayout>
|
|
<simpara>The desired config file and unicharset will be written to
|
|
/home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset</simpara>
|
|
<simpara>Specify option -o to overwrite individual components of the given
|
|
[lang].traineddata file. For example, to overwrite the language config
|
|
and unichar ambiguities files in tessdata/eng.traineddata use:</simpara>
|
|
<literallayout class="monospaced">combine_tessdata -o tessdata/eng.traineddata \
|
|
/home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs</literallayout>
|
|
<simpara>As a result, tessdata/eng.traineddata will contain the new language config
|
|
and unichar ambigs, plus all the original DAWGs, classifier templates, etc.</simpara>
|
|
<simpara>Note: the file names of the files to extract to and to overwrite from should
|
|
have the appropriate file suffixes (extensions) indicating their tessdata
|
|
component type (.unicharset for the unicharset, .unicharambigs for unichar
|
|
ambigs, etc.). See the k*FileSuffix variables in ccutil/tessdatamanager.h.</simpara>
|
|
<simpara>Specify option -u to unpack all the components to the specified path:</simpara>
|
|
<literallayout class="monospaced">combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.</literallayout>
|
|
<simpara>This will create /home/$USER/temp/eng.* files with individual tessdata
|
|
components from tessdata/eng.traineddata.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_options">
|
|
<title>OPTIONS</title>
|
|
<simpara><emphasis role="strong">-e</emphasis> <emphasis>.traineddata</emphasis> <emphasis>FILE</emphasis>…:
|
|
Extracts the specified components from the .traineddata file.</simpara>
|
|
<simpara><emphasis role="strong">-o</emphasis> <emphasis>.traineddata</emphasis> <emphasis>FILE</emphasis>…:
|
|
Overwrites the specified components of the .traineddata file
|
|
with those provided on the command line.</simpara>
|
|
<simpara><emphasis role="strong">-u</emphasis> <emphasis>.traineddata</emphasis> <emphasis>PATHPREFIX</emphasis>:
|
|
Unpacks the .traineddata using the provided prefix.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_caveats">
|
|
<title>CAVEATS</title>
|
|
<simpara><emphasis>Prefix</emphasis> refers to the full file prefix, including period (.)</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_components">
|
|
<title>COMPONENTS</title>
|
|
<simpara>The components in a Tesseract lang.traineddata file as of
|
|
Tesseract 3.02 are briefly described below. For more information on
|
|
many of these files, see
|
|
<ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract</ulink></simpara>
|
|
<variablelist>
|
|
<varlistentry>
|
|
<term>
|
|
lang.config
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
(Optional) Language-specific overrides to default config variables.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
lang.unicharset
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
(Required) The list of symbols that Tesseract recognizes, with properties.
|
|
See unicharset(5).
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
lang.unicharambigs
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
(Optional) This file contains information on pairs of recognized symbols
|
|
which are often confused. For example, <emphasis>rn</emphasis> and <emphasis>m</emphasis>.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
lang.inttemp
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
(Required) Character shape templates for each unichar. Produced by
|
|
mftraining(1).
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
lang.pffmtable
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
(Required) The number of features expected for each unichar.
|
|
Produced by mftraining(1) from <emphasis role="strong">.tr</emphasis> files.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
lang.normproto
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
(Required) Character normalization prototypes generated by cntraining(1)
|
|
from <emphasis role="strong">.tr</emphasis> files.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
lang.punc-dawg
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
(Optional) A dawg made from punctuation patterns found around words.
|
|
The "word" part is replaced by a single space.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
lang.word-dawg
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
(Optional) A dawg made from dictionary words from the language.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
lang.number-dawg
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
(Optional) A dawg made from tokens which originally contained digits.
|
|
Each digit is replaced by a space character.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
lang.freq-dawg
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
(Optional) A dawg made from the most frequent words which would have
|
|
gone into word-dawg.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
lang.fixed-length-dawgs
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
(Optional) Several dawgs of different fixed lengths — useful for
|
|
languages like Chinese.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
lang.cube-unicharset
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
(Optional) A unicharset for cube, if cube was trained on a different set
|
|
of symbols.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
lang.cube-word-dawg
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
(Optional) A word dawg for cube’s alternate unicharset. Not needed if Cube
|
|
was trained with Tesseract’s unicharset.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
lang.shapetable
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
(Optional) When present, a shapetable is an extra layer between the character
|
|
classifier and the word recognizer that allows the character classifier to
|
|
return a collection of unichar ids and fonts instead of a single unichar-id
|
|
and font.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
lang.bigram-dawg
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
(Optional) A dawg of word bigrams where the words are separated by a space
|
|
and each digit is replaced by a <emphasis>?</emphasis>.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
lang.unambig-dawg
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
(Optional) TODO: Describe.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
lang.params-training-model
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
(Optional) TODO: Describe.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
</variablelist>
|
|
</refsect1>
|
|
<refsect1 id="_history">
|
|
<title>HISTORY</title>
|
|
<simpara>combine_tessdata(1) first appeared in version 3.00 of Tesseract.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_see_also">
|
|
<title>SEE ALSO</title>
|
|
<simpara>tesseract(1), wordlist2dawg(1), cntraining(1), mftraining(1), unicharset(5),
|
|
unicharambigs(5)</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_copying">
|
|
<title>COPYING</title>
|
|
<simpara>Copyright (C) 2009, Google Inc.
|
|
Licensed under the Apache License, Version 2.0</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_author">
|
|
<title>AUTHOR</title>
|
|
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
|
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
|
</refsect1>
|
|
</refentry>
|