tesseract/doc/wordlist2dawg.1.xml

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
<?asciidoc-toc?>
<?asciidoc-numbered?>
<refentry lang="en">
<refmeta>
<refentrytitle>wordlist2dawg</refentrytitle>
<manvolnum>1</manvolnum>
<refmiscinfo class="source">&#160;</refmiscinfo>
<refmiscinfo class="manual">&#160;</refmiscinfo>
</refmeta>
<refnamediv>
    <refname>wordlist2dawg</refname>
    <refpurpose>convert a wordlist to a DAWG for Tesseract</refpurpose>
</refnamediv>
<refsynopsisdiv id="_synopsis">
<simpara><emphasis role="strong">wordlist2dawg</emphasis> <emphasis>WORDLIST</emphasis> <emphasis>DAWG</emphasis> <emphasis>lang.unicharset</emphasis></simpara>
<simpara><emphasis role="strong">wordlist2dawg</emphasis> -t <emphasis>WORDLIST</emphasis> <emphasis>DAWG</emphasis> <emphasis>lang.unicharset</emphasis></simpara>
<simpara><emphasis role="strong">wordlist2dawg</emphasis> -r 1 <emphasis>WORDLIST</emphasis> <emphasis>DAWG</emphasis> <emphasis>lang.unicharset</emphasis></simpara>
<simpara><emphasis role="strong">wordlist2dawg</emphasis> -r 2 <emphasis>WORDLIST</emphasis> <emphasis>DAWG</emphasis> <emphasis>lang.unicharset</emphasis></simpara>
<simpara><emphasis role="strong">wordlist2dawg</emphasis> -l &lt;short&gt; &lt;long&gt; <emphasis>WORDLIST</emphasis> <emphasis>DAWG</emphasis> <emphasis>lang.unicharset</emphasis></simpara>
</refsynopsisdiv>
<refsect1 id="_description">
<title>DESCRIPTION</title>
<simpara>wordlist2dawg(1) converts a wordlist to a Directed Acyclic Word Graph
(DAWG) for use with Tesseract.  A DAWG is a compressed, space- and
time-efficient representation of a wordlist.</simpara>
</refsect1>
<refsect1 id="_options">
<title>OPTIONS</title>
<simpara>-t
        Verify that a given DAWG file is equivalent to a given wordlist.</simpara>
<simpara>-r 1
        Reverse a word if it contains an RTL character.</simpara>
<simpara>-r 2
        Reverse all words.</simpara>
<simpara>-l &lt;short&gt; &lt;long&gt;
        Produce a file with several DAWGs in it, one each for words
        of length &lt;short&gt;, &lt;short+1&gt;, &#8230;, &lt;long&gt;.</simpara>
</refsect1>
<refsect1 id="_arguments">
<title>ARGUMENTS</title>
<simpara><emphasis>WORDLIST</emphasis>
        A plain text file in UTF-8, one word per line.</simpara>
<simpara><emphasis>DAWG</emphasis>
        The output DAWG to write.</simpara>
<simpara><emphasis>lang.unicharset</emphasis>
        The unicharset of the language. This is the unicharset
        generated by mftraining(1).</simpara>
</refsect1>
<refsect1 id="_see_also">
<title>SEE ALSO</title>
<simpara>tesseract(1), combine_tessdata(1), dawg2wordlist(1)</simpara>
<simpara><ulink url="http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3">http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3</ulink></simpara>
</refsect1>
<refsect1 id="_copying">
<title>COPYING</title>
<simpara>Copyright (C) 2006 Google, Inc.
Licensed under the Apache License, Version 2.0</simpara>
</refsect1>
<refsect1 id="_author">
<title>AUTHOR</title>
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
</refsect1>
</refentry>