mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-11 23:19:04 +08:00
106 lines
3.5 KiB
XML
106 lines
3.5 KiB
XML
<?xml version="1.0" encoding="UTF-8"?>
|
|
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
|
|
<?asciidoc-toc?>
|
|
<?asciidoc-numbered?>
|
|
<refentry lang="en">
|
|
<refentryinfo>
|
|
<title>SHAPECLUSTERING(1)</title>
|
|
</refentryinfo>
|
|
<refmeta>
|
|
<refentrytitle>shapeclustering</refentrytitle>
|
|
<manvolnum>1</manvolnum>
|
|
<refmiscinfo class="source"> </refmiscinfo>
|
|
<refmiscinfo class="manual"> </refmiscinfo>
|
|
</refmeta>
|
|
<refnamediv>
|
|
<refname>shapeclustering</refname>
|
|
<refpurpose>shape clustering training for Tesseract</refpurpose>
|
|
</refnamediv>
|
|
<refsynopsisdiv id="_synopsis">
|
|
<simpara>shapeclustering -D <emphasis>output_dir</emphasis>
|
|
-U <emphasis>unicharset</emphasis> -O <emphasis>mfunicharset</emphasis>
|
|
-F <emphasis>font_props</emphasis> -X <emphasis>xheights</emphasis>
|
|
<emphasis>FILE</emphasis>…</simpara>
|
|
</refsynopsisdiv>
|
|
<refsect1 id="_description">
|
|
<title>DESCRIPTION</title>
|
|
<simpara>shapeclustering(1) takes extracted feature .tr files (generated by
|
|
tesseract(1) run in a special mode from box files) and produces a
|
|
file <emphasis role="strong">shapetable</emphasis> and an enhanced unicharset. This program is still
|
|
experimental, and is not required (yet) for training Tesseract.</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_options">
|
|
<title>OPTIONS</title>
|
|
<variablelist>
|
|
<varlistentry>
|
|
<term>
|
|
-U <emphasis>FILE</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
The unicharset generated by unicharset_extractor(1).
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
-D <emphasis>dir</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
Directory to write output files to.
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
-F <emphasis>font_properties_file</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
(Input) font properties file. Each line has the following form, where every field other than the font name is 0 or 1:
|
|
</simpara>
|
|
<literallayout class="monospaced">'font_name' 'italic' 'bold' 'fixed_pitch' 'serif' 'fraktur'</literallayout>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
-X <emphasis>xheights_file</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
(Input) x heights file. Each line has the following form, where xheight is the x height, in pixels, of a character drawn at 32pt at 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ]
|
|
</simpara>
|
|
<literallayout class="monospaced">'font_name' 'xheight'</literallayout>
|
|
</listitem>
|
|
</varlistentry>
|
|
<varlistentry>
|
|
<term>
|
|
-O <emphasis>FILE</emphasis>
|
|
</term>
|
|
<listitem>
|
|
<simpara>
|
|
The output unicharset that will be given to combine_tessdata(1).
|
|
</simpara>
|
|
</listitem>
|
|
</varlistentry>
|
|
</variablelist>
|
|
</refsect1>
|
|
<refsect1 id="_see_also">
|
|
<title>SEE ALSO</title>
|
|
<simpara>tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1),
|
|
unicharset(5)</simpara>
|
|
<simpara><ulink url="https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract">https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract</ulink></simpara>
|
|
</refsect1>
|
|
<refsect1 id="_copying">
|
|
<title>COPYING</title>
|
|
<simpara>Copyright (C) Google, 2011
|
|
Licensed under the Apache License, Version 2.0</simpara>
|
|
</refsect1>
|
|
<refsect1 id="_author">
|
|
<title>AUTHOR</title>
|
|
<simpara>The Tesseract OCR engine was written by Ray Smith and his research groups
|
|
at Hewlett Packard (1985-1995) and Google (2006-present).</simpara>
|
|
</refsect1>
|
|
</refentry>
|