mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-11 05:27:49 +08:00
200 lines
7.5 KiB
C++
200 lines
7.5 KiB
C++
/**********************************************************************
|
|
* File: word.h
|
|
* Description: Code for the WERD class.
|
|
* Author: Ray Smith
|
|
* Created: Tue Oct 08 14:32:12 BST 1991
|
|
*
|
|
* (C) Copyright 1991, Hewlett-Packard Ltd.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
*
|
|
**********************************************************************/
|
|
|
|
#ifndef WERD_H
|
|
#define WERD_H
|
|
|
|
#include "params.h"
|
|
#include "bits16.h"
|
|
#include "elst2.h"
|
|
#include "strngs.h"
|
|
#include "blckerr.h"
|
|
#include "stepblob.h"
|
|
|
|
enum WERD_FLAGS
|
|
{
|
|
W_SEGMENTED, //< correctly segmented
|
|
W_ITALIC, //< italic text
|
|
W_BOLD, //< bold text
|
|
W_BOL, //< start of line
|
|
W_EOL, //< end of line
|
|
W_NORMALIZED, //< flags
|
|
W_SCRIPT_HAS_XHEIGHT, //< x-height concept makes sense.
|
|
W_SCRIPT_IS_LATIN, //< Special case latin for y. splitting.
|
|
W_DONT_CHOP, //< fixed pitch chopped
|
|
W_REP_CHAR, //< repeated character
|
|
W_FUZZY_SP, //< fuzzy space
|
|
W_FUZZY_NON, //< fuzzy nonspace
|
|
W_INVERSE //< white on black
|
|
};
|
|
|
|
enum DISPLAY_FLAGS
|
|
{
|
|
/* Display flags bit number allocations */
|
|
DF_BOX, //< Bounding box
|
|
DF_TEXT, //< Correct ascii
|
|
DF_POLYGONAL, //< Polyg approx
|
|
DF_EDGE_STEP, //< Edge steps
|
|
DF_BN_POLYGONAL, //< BL normalisd polyapx
|
|
DF_BLAMER //< Blamer information
|
|
};
|
|
|
|
class ROW; //forward decl
|
|
|
|
class WERD : public ELIST2_LINK {
|
|
public:
|
|
WERD() {}
|
|
// WERD constructed with:
|
|
// blob_list - blobs of the word (we take this list's contents)
|
|
// blanks - number of blanks before the word
|
|
// text - correct text (outlives WERD)
|
|
WERD(C_BLOB_LIST *blob_list, uinT8 blanks, const char *text);
|
|
|
|
// WERD constructed from:
|
|
// blob_list - blobs in the word
|
|
// clone - werd to clone flags, etc from.
|
|
WERD(C_BLOB_LIST *blob_list, WERD *clone);
|
|
|
|
// Construct a WERD from a single_blob and clone the flags from this.
|
|
// W_BOL and W_EOL flags are set according to the given values.
|
|
WERD* ConstructFromSingleBlob(bool bol, bool eol, C_BLOB* blob);
|
|
|
|
~WERD() {
|
|
}
|
|
|
|
// assignment
|
|
WERD & operator= (const WERD &source);
|
|
|
|
// This method returns a new werd constructed using the blobs in the input
|
|
// all_blobs list, which correspond to the blobs in this werd object. The
|
|
// blobs used to construct the new word are consumed and removed from the
|
|
// input all_blobs list.
|
|
// Returns NULL if the word couldn't be constructed.
|
|
// Returns original blobs for which no matches were found in the output list
|
|
// orphan_blobs (appends).
|
|
WERD *ConstructWerdWithNewBlobs(C_BLOB_LIST *all_blobs,
|
|
C_BLOB_LIST *orphan_blobs);
|
|
|
|
// Accessors for reject / DUFF blobs in various formats
|
|
C_BLOB_LIST *rej_cblob_list() { // compact format
|
|
return &rej_cblobs;
|
|
}
|
|
|
|
// Accessors for good blobs in various formats.
|
|
C_BLOB_LIST *cblob_list() { // get compact blobs
|
|
return &cblobs;
|
|
}
|
|
|
|
uinT8 space() { // access function
|
|
return blanks;
|
|
}
|
|
void set_blanks(uinT8 new_blanks) {
|
|
blanks = new_blanks;
|
|
}
|
|
int script_id() const {
|
|
return script_id_;
|
|
}
|
|
void set_script_id(int id) {
|
|
script_id_ = id;
|
|
}
|
|
|
|
// Returns the (default) bounding box including all the dots.
|
|
TBOX bounding_box() const; // compute bounding box
|
|
// Returns the bounding box including the desired combination of upper and
|
|
// lower noise/diacritic elements.
|
|
TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const;
|
|
// Returns the bounding box of only the good blobs.
|
|
TBOX true_bounding_box() const;
|
|
|
|
const char *text() const { return correct.string(); }
|
|
void set_text(const char *new_text) { correct = new_text; }
|
|
|
|
BOOL8 flag(WERD_FLAGS mask) const { return flags.bit(mask); }
|
|
void set_flag(WERD_FLAGS mask, BOOL8 value) { flags.set_bit(mask, value); }
|
|
|
|
BOOL8 display_flag(uinT8 flag) const { return disp_flags.bit(flag); }
|
|
void set_display_flag(uinT8 flag, BOOL8 value) {
|
|
disp_flags.set_bit(flag, value);
|
|
}
|
|
|
|
WERD *shallow_copy(); // shallow copy word
|
|
|
|
// reposition word by vector
|
|
void move(const ICOORD vec);
|
|
|
|
// join other's blobs onto this werd, emptying out other.
|
|
void join_on(WERD* other);
|
|
|
|
// copy other's blobs onto this word, leaving other intact.
|
|
void copy_on(WERD* other);
|
|
|
|
// tprintf word metadata (but not blob innards)
|
|
void print();
|
|
|
|
#ifndef GRAPHICS_DISABLED
|
|
// plot word on window in a uniform colour
|
|
void plot(ScrollView *window, ScrollView::Color colour);
|
|
|
|
// Get the next color in the (looping) rainbow.
|
|
static ScrollView::Color NextColor(ScrollView::Color colour);
|
|
|
|
// plot word on window in a rainbow of colours
|
|
void plot(ScrollView *window);
|
|
|
|
// plot rejected blobs in a rainbow of colours
|
|
void plot_rej_blobs(ScrollView *window);
|
|
#endif // GRAPHICS_DISABLED
|
|
|
|
// Removes noise from the word by moving small outlines to the rej_cblobs
|
|
// list, based on the size_threshold.
|
|
void CleanNoise(float size_threshold);
|
|
|
|
// Extracts all the noise outlines and stuffs the pointers into the given
|
|
// vector of outlines. Afterwards, the outlines vector owns the pointers.
|
|
void GetNoiseOutlines(GenericVector<C_OUTLINE *> *outlines);
|
|
// Adds the selected outlines to the indcated real blobs, and puts the rest
|
|
// back in rej_cblobs where they came from. Where the target_blobs entry is
|
|
// NULL, a run of wanted outlines is put into a single new blob.
|
|
// Ownership of the outlines is transferred back to the word. (Hence
|
|
// GenericVector and not PointerVector.)
|
|
// Returns true if any new blob was added to the start of the word, which
|
|
// suggests that it might need joining to the word before it, and likewise
|
|
// sets make_next_word_fuzzy true if any new blob was added to the end.
|
|
bool AddSelectedOutlines(const GenericVector<bool> &wanted,
|
|
const GenericVector<C_BLOB *> &target_blobs,
|
|
const GenericVector<C_OUTLINE *> &outlines,
|
|
bool *make_next_word_fuzzy);
|
|
|
|
private:
|
|
uinT8 blanks; // no of blanks
|
|
uinT8 dummy; // padding
|
|
BITS16 flags; // flags about word
|
|
BITS16 disp_flags; // display flags
|
|
inT16 script_id_; // From unicharset.
|
|
STRING correct; // correct text
|
|
C_BLOB_LIST cblobs; // compacted blobs
|
|
C_BLOB_LIST rej_cblobs; // DUFF blobs
|
|
};
|
|
|
|
ELIST2IZEH (WERD)
|
|
#include "ocrrow.h" // placed here due to
|
|
// compare words by increasing order of left edge, suitable for qsort(3)
|
|
int word_comparator(const void *word1p, const void *word2p);
|
|
#endif
|