git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@483 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
joregan 2010-09-30 02:18:45 +00:00
parent 0759ee7e17
commit 5575d8db02
2 changed files with 87 additions and 0 deletions

41
doc/wordlist2dawg.1.asc Normal file
View File

@ -0,0 +1,41 @@
WORDLIST2DAWG(1)
================
NAME
----
wordlist2dawg - convert a wordlist to a DAWG for Tesseract
SYNOPSIS
--------
*wordlist2dawg* 'WORDLIST' 'DAWG' 'lang.unicharset'
DESCRIPTION
-----------
wordlist2dawg(1) converts a wordlist to a Directed Acyclic
Word Graph (DAWG) for use with Tesseract.
The wordlists are split into two: one with high frequency
words, and one with the rest.
OPTIONS
-------
'WORDLIST'::
    A plain text file in UTF-8, one word per line

'DAWG'::
    The output DAWG to write

'lang.unicharset'::
    The unicharset of the language. This is the unicharset
    generated by mftraining(1)
SEE ALSO
--------
tesseract(1), mftraining(1)
<http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3>
COPYING
-------
Copyright (c) 2006 Google, Inc.
Licensed under the Apache License, Version 2.0

46
doc/wordlist2dawg.1.xml Normal file
View File

@ -0,0 +1,46 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
<?asciidoc-toc?>
<?asciidoc-numbered?>
<!--
  DocBook 4.5 reference page (manual section 1) for wordlist2dawg.
  NOTE(review): the asciidoc-* processing instructions above suggest this file
  is generated by AsciiDoc, presumably from doc/wordlist2dawg.1.asc. Confirm,
  and if so mirror any change made here in that source file.
-->
<refentry lang="en">
<refmeta>
<refentrytitle>wordlist2dawg</refentrytitle>
<manvolnum>1</manvolnum>
<!--
  Source and manual names are left blank (a single no-break space).
  The numeric character reference is used instead of the named nbsp entity so
  the file still parses when the external DTD that declares it is not loaded.
-->
<refmiscinfo class="source">&#160;</refmiscinfo>
<refmiscinfo class="manual">&#160;</refmiscinfo>
</refmeta>
<refnamediv>
<refname>wordlist2dawg</refname>
<refpurpose>convert a wordlist to a DAWG for Tesseract</refpurpose>
</refnamediv>
<refsynopsisdiv id="_synopsis">
<simpara><emphasis role="strong">wordlist2dawg</emphasis> <emphasis>WORDLIST</emphasis> <emphasis>DAWG</emphasis> <emphasis>lang.unicharset</emphasis></simpara>
</refsynopsisdiv>
<refsect1 id="_description">
<title>DESCRIPTION</title>
<simpara>wordlist2dawg(1) converts a wordlist to a Directed Acyclic
Word Graph (DAWG) for use with Tesseract.</simpara>
<simpara>The wordlists are split into two: one with high frequency
words, and one with the rest.</simpara>
</refsect1>
<refsect1 id="_options">
<title>OPTIONS</title>
<!--
  Each argument is a term/definition pair. A variablelist keeps the term
  separate from its description instead of running both together in one
  paragraph.
-->
<variablelist>
<varlistentry>
<term><emphasis>WORDLIST</emphasis></term>
<listitem>
<simpara>A plain text file in UTF-8, one word per line</simpara>
</listitem>
</varlistentry>
<varlistentry>
<term><emphasis>DAWG</emphasis></term>
<listitem>
<simpara>The output DAWG to write</simpara>
</listitem>
</varlistentry>
<varlistentry>
<term><emphasis>lang.unicharset</emphasis></term>
<listitem>
<simpara>The unicharset of the language. This is the unicharset
generated by mftraining(1)</simpara>
</listitem>
</varlistentry>
</variablelist>
</refsect1>
<refsect1 id="_see_also">
<title>SEE ALSO</title>
<simpara>tesseract(1), mftraining(1)</simpara>
<simpara><ulink url="http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3">http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3</ulink></simpara>
</refsect1>
<refsect1 id="_copying">
<title>COPYING</title>
<simpara>Copyright (c) 2006 Google, Inc.
Licensed under the Apache License, Version 2.0</simpara>
</refsect1>
</refentry>