mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-07 18:27:48 +08:00
192 lines
7.3 KiB
C
192 lines
7.3 KiB
C
|
/////////////////////////////////////////////////////////////////////
|
||
|
// File: ocrpara.h
|
||
|
// Description: OCR Paragraph Output Type
|
||
|
// Author: David Eger
|
||
|
// Created: 2010-11-15
|
||
|
//
|
||
|
// (C) Copyright 2010, Google Inc.
|
||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
// you may not use this file except in compliance with the License.
|
||
|
// You may obtain a copy of the License at
|
||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||
|
// Unless required by applicable law or agreed to in writing, software
|
||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
// See the License for the specific language governing permissions and
|
||
|
// limitations under the License.
|
||
|
//
|
||
|
///////////////////////////////////////////////////////////////////////
|
||
|
|
||
|
#ifndef TESSERACT_CCSTRUCT_OCRPARA_H_
|
||
|
#define TESSERACT_CCSTRUCT_OCRPARA_H_
|
||
|
|
||
|
#include <cstdlib>  // std::abs, used by ParagraphModel::is_flush()

#include "publictypes.h"
#include "elst.h"
#include "strngs.h"
|
||
|
|
||
|
class ParagraphModel;
|
||
|
|
||
|
struct PARA : public ELIST_LINK {
|
||
|
public:
|
||
|
PARA() : model(NULL), is_list_item(false),
|
||
|
is_very_first_or_continuation(false), has_drop_cap(false) {}
|
||
|
|
||
|
// We do not own the model, we just reference it.
|
||
|
// model may be NULL if there is not a good model for this paragraph.
|
||
|
const ParagraphModel *model;
|
||
|
|
||
|
bool is_list_item;
|
||
|
|
||
|
// The first paragraph on a page often lacks a first line indent, but should
|
||
|
// still be modeled by the same model as other body text paragraphs on the
|
||
|
// page.
|
||
|
bool is_very_first_or_continuation;
|
||
|
|
||
|
// Does this paragraph begin with a drop cap?
|
||
|
bool has_drop_cap;
|
||
|
};
|
||
|
|
||
|
// Invokes the ELISTIZEH macro for PARA (PARA derives from ELIST_LINK).
// NOTE(review): presumably this expands to the declarations of the list and
// iterator types for PARA provided by elst.h -- confirm against that header.
ELISTIZEH(PARA)
|
||
|
|
||
|
// A geometric model of paragraph indentation and alignment.
|
||
|
//
|
||
|
// Measurements are in pixels. The meaning of the integer arguments changes
|
||
|
// depending upon the value of justification. Distances less than or equal
|
||
|
// to tolerance apart we take as "equivalent" for the purpose of model
|
||
|
// matching, and in the examples below, we assume tolerance is zero.
|
||
|
//
|
||
|
// justification = LEFT:
|
||
|
// margin the "ignored" margin to the left block edge.
|
||
|
// first_indent indent from the left margin to a typical first text line.
|
||
|
// body_indent indent from the left margin of a typical body text line.
|
||
|
//
|
||
|
// justification = RIGHT:
|
||
|
// margin the "ignored" margin to the right block edge.
|
||
|
// first_indent indent from the right margin to a typical first text line.
|
||
|
// body_indent indent from the right margin of a typical body text line.
|
||
|
//
|
||
|
// justification = CENTER:
|
||
|
// margin ignored
|
||
|
// first_indent ignored
|
||
|
// body_indent ignored
|
||
|
//
|
||
|
// ====== Extended example, assuming each letter is ten pixels wide: =======
|
||
|
//
|
||
|
// +--------------------------------+
|
||
|
// | Awesome | ParagraphModel(CENTER, 0, 0, 0)
|
||
|
// | Centered Title |
|
||
|
// | Paragraph Detection |
|
||
|
// | OCR TEAM |
|
||
|
// | 10 November 2010 |
|
||
|
// | |
|
||
|
// | Look here, I have a paragraph.| ParagraphModel(LEFT, 0, 20, 0)
|
||
|
// |This paragraph starts at the top|
|
||
|
// |of the page and takes 3 lines. |
|
||
|
// | Here I have a second paragraph| ParagraphModel(LEFT, 0, 20, 0)
|
||
|
// |which indicates that the first |
|
||
|
// |paragraph is not a continuation |
|
||
|
// |from a previous page, as it is |
|
||
|
// |indented just like this second |
|
||
|
// |paragraph. |
|
||
|
// | Here is a block quote. It | ParagraphModel(LEFT, 30, 0, 0)
|
||
|
// | looks like the prior text |
|
||
|
// | but it is indented more |
|
||
|
// | and is fully justified. |
|
||
|
// | So how does one deal with | ParagraphModel(LEFT, 0, 20, 0)
|
||
|
// |centered text, block quotes, |
|
||
|
// |normal paragraphs, and lists |
|
||
|
// |like what follows? |
|
||
|
// |1. Make a plan. | ParagraphModel(LEFT, 0, 0, 30)
|
||
|
// |2. Use a heuristic, for example,| ParagraphModel(LEFT, 0, 0, 30)
|
||
|
// | looking for lines where the |
|
||
|
// | first word of the next line |
|
||
|
// | would fit on the previous |
|
||
|
// | line. |
|
||
|
// |8. Try to implement the plan in | ParagraphModel(LEFT, 0, 0, 30)
|
||
|
// | Python and try it out. |
|
||
|
// |4. Determine how to fix the | ParagraphModel(LEFT, 0, 0, 30)
|
||
|
// | mistakes. |
|
||
|
// |5. Repeat. | ParagraphModel(LEFT, 0, 0, 30)
|
||
|
// | For extra painful penalty work| ParagraphModel(LEFT, 0, 20, 0)
|
||
|
// |you can try to identify source |
|
||
|
// |code. Ouch! |
|
||
|
// +--------------------------------+
|
||
|
class ParagraphModel {
|
||
|
public:
|
||
|
ParagraphModel(tesseract::ParagraphJustification justification,
|
||
|
int margin,
|
||
|
int first_indent,
|
||
|
int body_indent,
|
||
|
int tolerance)
|
||
|
: justification_(justification),
|
||
|
margin_(margin),
|
||
|
first_indent_(first_indent),
|
||
|
body_indent_(body_indent),
|
||
|
tolerance_(tolerance) {
|
||
|
// Make one of {first_indent, body_indent} is 0.
|
||
|
int added_margin = first_indent;
|
||
|
if (body_indent < added_margin)
|
||
|
added_margin = body_indent;
|
||
|
margin_ += added_margin;
|
||
|
first_indent_ -= added_margin;
|
||
|
body_indent_ -= added_margin;
|
||
|
}
|
||
|
|
||
|
ParagraphModel()
|
||
|
: justification_(tesseract::JUSTIFICATION_UNKNOWN),
|
||
|
margin_(0),
|
||
|
first_indent_(0),
|
||
|
body_indent_(0),
|
||
|
tolerance_(0) { }
|
||
|
|
||
|
// ValidFirstLine() and ValidBodyLine() take arguments describing a text line
|
||
|
// in a block of text which we are trying to model:
|
||
|
// lmargin, lindent: these add up to the distance from the leftmost ink
|
||
|
// in the text line to the surrounding text block's left
|
||
|
// edge.
|
||
|
// rmargin, rindent: these add up to the distance from the rightmost ink
|
||
|
// in the text line to the surrounding text block's right
|
||
|
// edge.
|
||
|
// The caller determines the division between "margin" and "indent", which
|
||
|
// only actually affect whether we think the line may be centered.
|
||
|
//
|
||
|
// If the amount of whitespace matches the amount of whitespace expected on
|
||
|
// the relevant side of the line (within tolerance_) we say it matches.
|
||
|
|
||
|
// Return whether a given text line could be a first paragraph line according
|
||
|
// to this paragraph model.
|
||
|
bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const;
|
||
|
|
||
|
// Return whether a given text line could be a first paragraph line according
|
||
|
// to this paragraph model.
|
||
|
bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const;
|
||
|
|
||
|
tesseract::ParagraphJustification justification() const {
|
||
|
return justification_;
|
||
|
}
|
||
|
int margin() const { return margin_; }
|
||
|
int first_indent() const { return first_indent_; }
|
||
|
int body_indent() const { return body_indent_; }
|
||
|
int tolerance() const { return tolerance_; }
|
||
|
bool is_flush() const {
|
||
|
return (justification_ == tesseract::JUSTIFICATION_LEFT ||
|
||
|
justification_ == tesseract::JUSTIFICATION_RIGHT) &&
|
||
|
abs(first_indent_ - body_indent_) <= tolerance_;
|
||
|
}
|
||
|
|
||
|
// Return whether this model is likely to agree with the other model on most
|
||
|
// paragraphs they are marked.
|
||
|
bool Comparable(const ParagraphModel &other) const;
|
||
|
|
||
|
STRING ToString() const;
|
||
|
|
||
|
private:
|
||
|
tesseract::ParagraphJustification justification_;
|
||
|
int margin_;
|
||
|
int first_indent_;
|
||
|
int body_indent_;
|
||
|
int tolerance_;
|
||
|
};
|
||
|
|
||
|
#endif // TESSERACT_CCSTRUCT_OCRPARA_H_
|