mirror of https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-30 23:49:05 +08:00
023e1b340e
* api: Replace Tesseract data types by POSIX data types. Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccmain: Replace Tesseract data types by POSIX data types. Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccstruct: Replace Tesseract data types by POSIX data types. Signed-off-by: Stefan Weil <sw@weilnetz.de>
* classify: Replace Tesseract data types by POSIX data types. Signed-off-by: Stefan Weil <sw@weilnetz.de>
* cutil: Replace Tesseract data types by POSIX data types. Signed-off-by: Stefan Weil <sw@weilnetz.de>
* dict: Replace Tesseract data types by POSIX data types. Signed-off-by: Stefan Weil <sw@weilnetz.de>
* textord: Replace Tesseract data types by POSIX data types. Signed-off-by: Stefan Weil <sw@weilnetz.de>
* training: Replace Tesseract data types by POSIX data types. Signed-off-by: Stefan Weil <sw@weilnetz.de>
* wordrec: Replace Tesseract data types by POSIX data types. Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccutil: Replace Tesseract data types by POSIX data types. Now all Tesseract data types which are no longer needed can be removed from ccutil/host.h. Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccmain: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX. Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccstruct: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX. Signed-off-by: Stefan Weil <sw@weilnetz.de>
* classify: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX. Signed-off-by: Stefan Weil <sw@weilnetz.de>
* dict: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX. Signed-off-by: Stefan Weil <sw@weilnetz.de>
* lstm: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX. Signed-off-by: Stefan Weil <sw@weilnetz.de>
* textord: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX. Signed-off-by: Stefan Weil <sw@weilnetz.de>
* wordrec: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX. Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccutil: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX. Remove the macros which are now unused from ccutil/host.h. Also remove the obsolete history comments. Signed-off-by: Stefan Weil <sw@weilnetz.de>
* Fix build error caused by ambiguous ClipToRange. Error message from the Appveyor CI:
  C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2672: 'ClipToRange': no matching overloaded function found [C:\projects\tesseract\build\libtesseract.vcxproj]
  C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2782: 'T ClipToRange(const T &,const T &,const T &)': template parameter 'T' is ambiguous [C:\projects\tesseract\build\libtesseract.vcxproj]
  c:\projects\tesseract\ccutil\helpers.h(122): note: see declaration of 'ClipToRange'
  C:\projects\tesseract\ccstruct\coutln.cpp(818): note: could be 'char'
  C:\projects\tesseract\ccstruct\coutln.cpp(818): note: or 'int'
  Signed-off-by: Stefan Weil <sw@weilnetz.de>
* unittest: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX. Signed-off-by: Stefan Weil <sw@weilnetz.de>
* arch: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX. Signed-off-by: Stefan Weil <sw@weilnetz.de>
491 lines
22 KiB
C++
///////////////////////////////////////////////////////////////////////
// File:        wordrec.h
// Description: wordrec class.
// Author:      Samuel Charron
//
// (C) Copyright 2006, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_WORDREC_WORDREC_H_
#define TESSERACT_WORDREC_WORDREC_H_

#include "associate.h"
#include "classify.h"
#include "dict.h"
#include "language_model.h"
#include "ratngs.h"
#include "matrix.h"
#include "gradechop.h"
#include "seam.h"
#include "findseam.h"
#include "callcpp.h"

class WERD_RES;

namespace tesseract {

// A class for storing which nodes are to be processed by the segmentation
// search. There is a single SegSearchPending for each column in the ratings
// matrix, and it indicates whether the segsearch should combine all
// BLOB_CHOICES in the column, or just the given row with the parents
// corresponding to *this SegSearchPending, and whether only updated parent
// ViterbiStateEntries should be combined, or all, with the BLOB_CHOICEs.
class SegSearchPending {
 public:
  SegSearchPending()
    : classified_row_(-1),
      revisit_whole_column_(false),
      column_classified_(false) {}

  // Marks the whole column as just classified. Used to start a search on
  // a newly initialized ratings matrix.
  void SetColumnClassified() {
    column_classified_ = true;
  }
  // Marks the matrix entry at the given row as just classified.
  // Used after classifying a new matrix cell.
  // Additional to, not overriding a previous RevisitWholeColumn.
  void SetBlobClassified(int row) {
    classified_row_ = row;
  }
  // Marks the whole column as needing work, but not just classified.
  // Used when the parent vse list is updated.
  // Additional to, not overriding a previous SetBlobClassified.
  void RevisitWholeColumn() {
    revisit_whole_column_ = true;
  }

  // Clears *this to indicate no work to do.
  void Clear() {
    classified_row_ = -1;
    revisit_whole_column_ = false;
    column_classified_ = false;
  }

  // Returns true if there are updates to do in the column that *this
  // represents.
  bool WorkToDo() const {
    return revisit_whole_column_ || column_classified_ || classified_row_ >= 0;
  }
  // Returns true if the given row was just classified.
  bool IsRowJustClassified(int row) const {
    return row == classified_row_ || column_classified_;
  }
  // Returns the single row to process if there is only one, otherwise -1.
  int SingleRow() const {
    return revisit_whole_column_ || column_classified_ ? -1 : classified_row_;
  }

 private:
  // If non-negative, indicates the single row in the ratings matrix that has
  // just been classified, and so should be combined with all the parents in the
  // column that this SegSearchPending represents.
  // Operates independently of revisit_whole_column.
  int classified_row_;
  // If revisit_whole_column is true, then all BLOB_CHOICEs in this column will
  // be processed, but classified_row can indicate a row that is newly
  // classified. Overridden if column_classified is true.
  bool revisit_whole_column_;
  // If column_classified is true, parent vses are processed with all rows
  // regardless of whether they are just updated, overriding
  // revisit_whole_column and classified_row.
  bool column_classified_;
};
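
// Example (editor's illustrative sketch, not part of the original header):
// a typical update cycle over the pending columns could look roughly like the
// loop below; UpdateRow() and UpdateColumn() are hypothetical helpers standing
// in for the real language-model update calls.
//
//   GenericVector<SegSearchPending> pending;
//   pending.init_to_size(ratings->dimension(), SegSearchPending());
//   pending[0].SetColumnClassified();           // freshly filled first column
//   for (int col = 0; col < pending.size(); ++col) {
//     if (!pending[col].WorkToDo()) continue;   // nothing changed in this column
//     int row = pending[col].SingleRow();
//     if (row >= 0)
//       UpdateRow(col, row);                    // only one cell was classified
//     else
//       UpdateColumn(col);                      // revisit every row in the column
//     pending[col].Clear();                     // all work for this column done
//   }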


/* ccmain/tstruct.cpp *********************************************************/
class FRAGMENT : public ELIST_LINK
{
 public:
  FRAGMENT() {  // constructor
  }
  FRAGMENT(EDGEPT *head_pt,   // start
           EDGEPT *tail_pt);  // end

  ICOORD head;     // coords of start
  ICOORD tail;     // coords of end
  EDGEPT *headpt;  // start point
  EDGEPT *tailpt;  // end point
};
ELISTIZEH(FRAGMENT)


class Wordrec : public Classify {
 public:
  // config parameters *******************************************************
  BOOL_VAR_H(merge_fragments_in_matrix, TRUE,
             "Merge the fragments in the ratings matrix and delete them "
             "after merging");
  BOOL_VAR_H(wordrec_no_block, FALSE, "Don't output block information");
  BOOL_VAR_H(wordrec_enable_assoc, TRUE, "Associator Enable");
  BOOL_VAR_H(force_word_assoc, FALSE,
             "force associator to run regardless of what enable_assoc is. "
             "This is used for CJK where component grouping is necessary.");
  double_VAR_H(wordrec_worst_state, 1, "Worst segmentation state");
  BOOL_VAR_H(fragments_guide_chopper, FALSE,
             "Use information from fragments to guide chopping process");
  INT_VAR_H(repair_unchopped_blobs, 1, "Fix blobs that aren't chopped");
  double_VAR_H(tessedit_certainty_threshold, -2.25, "Good blob limit");
  INT_VAR_H(chop_debug, 0, "Chop debug");
  BOOL_VAR_H(chop_enable, 1, "Chop enable");
  BOOL_VAR_H(chop_vertical_creep, 0, "Vertical creep");
  INT_VAR_H(chop_split_length, 10000, "Split Length");
  INT_VAR_H(chop_same_distance, 2, "Same distance");
  INT_VAR_H(chop_min_outline_points, 6, "Min Number of Points on Outline");
  INT_VAR_H(chop_seam_pile_size, 150, "Max number of seams in seam_pile");
  BOOL_VAR_H(chop_new_seam_pile, 1, "Use new seam_pile");
  INT_VAR_H(chop_inside_angle, -50, "Min Inside Angle Bend");
  INT_VAR_H(chop_min_outline_area, 2000, "Min Outline Area");
  double_VAR_H(chop_split_dist_knob, 0.5, "Split length adjustment");
  double_VAR_H(chop_overlap_knob, 0.9, "Split overlap adjustment");
  double_VAR_H(chop_center_knob, 0.15, "Split center adjustment");
  INT_VAR_H(chop_centered_maxwidth, 90, "Width of (smaller) chopped blobs "
            "above which we don't care that a chop is not near the center.");
  double_VAR_H(chop_sharpness_knob, 0.06, "Split sharpness adjustment");
  double_VAR_H(chop_width_change_knob, 5.0, "Width change adjustment");
  double_VAR_H(chop_ok_split, 100.0, "OK split limit");
  double_VAR_H(chop_good_split, 50.0, "Good split limit");
  INT_VAR_H(chop_x_y_weight, 3, "X / Y length weight");
  INT_VAR_H(segment_adjust_debug, 0, "Segmentation adjustment debug");
  BOOL_VAR_H(assume_fixed_pitch_char_segment, FALSE,
             "include fixed-pitch heuristics in char segmentation");
  INT_VAR_H(wordrec_debug_level, 0, "Debug level for wordrec");
  INT_VAR_H(wordrec_max_join_chunks, 4,
            "Max number of broken pieces to associate");
  BOOL_VAR_H(wordrec_skip_no_truth_words, false,
             "Only run OCR for words that had truth recorded in BlamerBundle");
  BOOL_VAR_H(wordrec_debug_blamer, false, "Print blamer debug messages");
  BOOL_VAR_H(wordrec_run_blamer, false, "Try to set the blame for errors");
  INT_VAR_H(segsearch_debug_level, 0, "SegSearch debug level");
  INT_VAR_H(segsearch_max_pain_points, 2000,
            "Maximum number of pain points stored in the queue");
  INT_VAR_H(segsearch_max_futile_classifications, 10,
            "Maximum number of pain point classifications per word.");
  double_VAR_H(segsearch_max_char_wh_ratio, 2.0,
               "Maximum character width-to-height ratio");
  BOOL_VAR_H(save_alt_choices, true,
             "Save alternative paths found during chopping "
             "and segmentation search");

  // methods from wordrec/*.cpp ***********************************************
  Wordrec();
  virtual ~Wordrec();

  // Fills word->alt_choices with alternative paths found during
  // chopping/segmentation search that are kept in best_choices.
  void SaveAltChoices(const LIST &best_choices, WERD_RES *word);

  // Fills character choice lattice in the given BlamerBundle
  // using the given ratings matrix and best choice list.
  void FillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices,
                   const UNICHARSET &unicharset, BlamerBundle *blamer_bundle);

  // Calls fill_lattice_ member function
  // (assumes that fill_lattice_ is not NULL).
  void CallFillLattice(const MATRIX &ratings,
                       const WERD_CHOICE_LIST &best_choices,
                       const UNICHARSET &unicharset,
                       BlamerBundle *blamer_bundle) {
    (this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle);
  }

  // tface.cpp
  void program_editup(const char *textbase, TessdataManager *init_classifier,
                      TessdataManager *init_dict);
  void cc_recog(WERD_RES *word);
  void program_editdown(int32_t elapsed_time);
  void set_pass1();
  void set_pass2();
  int end_recog();
  BLOB_CHOICE_LIST *call_matcher(TBLOB* blob);
  int dict_word(const WERD_CHOICE &word);
  // wordclass.cpp
  BLOB_CHOICE_LIST *classify_blob(TBLOB *blob,
                                  const char *string,
                                  C_COL color,
                                  BlamerBundle *blamer_bundle);

  // segsearch.cpp
  // SegSearch works on the lower diagonal matrix of BLOB_CHOICE_LISTs.
  // Each entry in the matrix represents the classification choice
  // for a chunk, i.e. an entry in row 2, column 1 represents the list
  // of ratings for the chunks 1 and 2 classified as a single blob.
  // The entries on the diagonal of the matrix are classifier choice lists
  // for a single chunk from the maximal segmentation.
  //
  // The ratings matrix given to SegSearch represents the segmentation
  // graph / trellis for the current word. The nodes in the graph are the
  // individual BLOB_CHOICEs in each of the BLOB_CHOICE_LISTs in the ratings
  // matrix. The children of each node (nodes connected by outgoing links)
  // are the entries in the column that is equal to node's row+1. The parents
  // (nodes connected by the incoming links) are the entries in the row that
  // is equal to the node's column-1. Here is an example ratings matrix:
  //
  //      0    1    2    3    4
  //    ----------------------------
  //   0| c,(                      |
  //   1| d    l,1                 |
  //   2|           o              |
  //   3|                c,(       |
  //   4|                g,y  l,1  |
  //    ----------------------------
  //
  // In the example above node "o" has children (outgoing connection to nodes)
  // "c","(","g","y" and parents (incoming connections from nodes) "l","1","d".
  //
  // The objective of the search is to find the least cost path, where the cost
  // is determined by the language model components and the properties of the
  // cut between the blobs on the path. SegSearch starts by populating the
  // matrix with all the entries that were classified by the chopper and
  // finding the initial best path. Based on the classifier ratings, language
  // model scores and the properties of each cut, a list of "pain points" is
  // constructed - those are the points on the path where the choices do not
  // look consistent with the neighboring choices, the cuts look particularly
  // problematic, or the certainties of the blobs are low. The most troublesome
  // "pain point" is picked from the list and the new entry in the ratings
  // matrix corresponding to this "pain point" is filled in. Then the language
  // model state is updated to reflect the new classification and the new
  // "pain points" are added to the list and the next most troublesome
  // "pain point" is determined. This continues until either the word choice
  // composed from the best paths in the segmentation graph is "good enough"
  // (e.g. above a certain certainty threshold, is an unambiguous dictionary
  // word, etc) or there are no more "pain points" to explore.
  //
  // If associate_blobs is set to false no new classifications will be done
  // to combine blobs. Segmentation search will run only one "iteration"
  // on the classifications already recorded in chunks_record.ratings.
  //
  // Note: this function assumes that word_res, best_choice_bundle arguments
  // are not NULL.
  void SegSearch(WERD_RES* word_res,
                 BestChoiceBundle* best_choice_bundle,
                 BlamerBundle* blamer_bundle);
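
  // Example (editor's illustrative sketch, not part of the original header):
  // for a node in the ratings entry that covers chunks col..row (col <= row),
  // its children live in column row + 1 and its parents in row col - 1, so
  // they could be enumerated roughly as follows (band/bounds checks omitted):
  //
  //   // Children: entries whose first chunk is row + 1.
  //   for (int r = row + 1; r < ratings->dimension(); ++r) {
  //     BLOB_CHOICE_LIST* child_list = ratings->get(row + 1, r);
  //     if (child_list != NULL) { /* follow outgoing links */ }
  //   }
  //   // Parents: entries whose last chunk is col - 1.
  //   for (int c = 0; c < col; ++c) {
  //     BLOB_CHOICE_LIST* parent_list = ratings->get(c, col - 1);
  //     if (parent_list != NULL) { /* follow incoming links */ }
  //   }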

  // Setup and run just the initial segsearch on an established matrix,
  // without doing any additional chopping or joining.
  // (Internal factored version that can be used as part of the main SegSearch.)
  void InitialSegSearch(WERD_RES* word_res, LMPainPoints* pain_points,
                        GenericVector<SegSearchPending>* pending,
                        BestChoiceBundle* best_choice_bundle,
                        BlamerBundle* blamer_bundle);

  // Runs SegSearch() function (above) without needing a best_choice_bundle
  // or blamer_bundle. Used for testing.
  void DoSegSearch(WERD_RES* word_res);

  // chop.cpp
  PRIORITY point_priority(EDGEPT *point);
  void add_point_to_list(PointHeap* point_heap, EDGEPT *point);
  // Returns true if the edgept supplied as input is an inside angle. This
  // is determined by the angular change of the vectors from point to point.
  bool is_inside_angle(EDGEPT *pt);
  int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3);
  EDGEPT *pick_close_point(EDGEPT *critical_point,
                           EDGEPT *vertical_point,
                           int *best_dist);
  void prioritize_points(TESSLINE *outline, PointHeap* points);
  void new_min_point(EDGEPT *local_min, PointHeap* points);
  void new_max_point(EDGEPT *local_max, PointHeap* points);
  void vertical_projection_point(EDGEPT *split_point, EDGEPT *target_point,
                                 EDGEPT** best_point,
                                 EDGEPT_CLIST *new_points);

  // chopper.cpp
  SEAM *attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number,
                          bool italic_blob, const GenericVector<SEAM*>& seams);
  SEAM *chop_numbered_blob(TWERD *word, int32_t blob_number,
                           bool italic_blob, const GenericVector<SEAM*>& seams);
  SEAM *chop_overlapping_blob(const GenericVector<TBOX>& boxes,
                              bool italic_blob,
                              WERD_RES *word_res, int *blob_number);
  SEAM *improve_one_blob(const GenericVector<BLOB_CHOICE*> &blob_choices,
                         DANGERR *fixpt,
                         bool split_next_to_fragment,
                         bool italic_blob,
                         WERD_RES *word,
                         int *blob_number);
  SEAM *chop_one_blob(const GenericVector<TBOX> &boxes,
                      const GenericVector<BLOB_CHOICE*> &blob_choices,
                      WERD_RES *word_res,
                      int *blob_number);
  void chop_word_main(WERD_RES *word);
  void improve_by_chopping(float rating_cert_scale,
                           WERD_RES *word,
                           BestChoiceBundle *best_choice_bundle,
                           BlamerBundle *blamer_bundle,
                           LMPainPoints *pain_points,
                           GenericVector<SegSearchPending>* pending);
  int select_blob_to_split(const GenericVector<BLOB_CHOICE*> &blob_choices,
                           float rating_ceiling,
                           bool split_next_to_fragment);
  int select_blob_to_split_from_fixpt(DANGERR *fixpt);

  // findseam.cpp
  void add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue* seams);
  void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split,
                        PRIORITY priority, SEAM **seam_result, TBLOB *blob,
                        SeamPile *seam_pile);
  void combine_seam(const SeamPile& seam_pile,
                    const SEAM* seam, SeamQueue* seam_queue);
  SEAM *pick_good_seam(TBLOB *blob);
  void try_point_pairs(EDGEPT *points[MAX_NUM_POINTS],
                       int16_t num_points,
                       SeamQueue* seam_queue,
                       SeamPile* seam_pile,
                       SEAM **seam, TBLOB *blob);
  void try_vertical_splits(EDGEPT *points[MAX_NUM_POINTS],
                           int16_t num_points,
                           EDGEPT_CLIST *new_points,
                           SeamQueue* seam_queue,
                           SeamPile* seam_pile,
                           SEAM **seam, TBLOB *blob);

  // gradechop.cpp
  PRIORITY grade_split_length(register SPLIT *split);
  PRIORITY grade_sharpness(register SPLIT *split);

  // outlines.cpp
  bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1,
                  EDGEPT **near_pt);

  // pieces.cpp
  virtual BLOB_CHOICE_LIST *classify_piece(const GenericVector<SEAM*>& seams,
                                           int16_t start,
                                           int16_t end,
                                           const char* description,
                                           TWERD *word,
                                           BlamerBundle *blamer_bundle);
  // Try to merge fragments in the ratings matrix and put the result in
  // the corresponding row and column
  void merge_fragments(MATRIX *ratings,
                       int16_t num_blobs);
  // Recursively go through the ratings matrix to find lists of fragments
  // to be merged in the function merge_and_put_fragment_lists.
  // current_frag is the position of the piece we are looking for.
  // current_row is the row in the rating matrix we are currently at.
  // start is the row we started initially, so that we can know where
  // to append the results to the matrix. num_frag_parts is the total
  // number of pieces we are looking for and num_blobs is the size of the
  // ratings matrix.
  void get_fragment_lists(int16_t current_frag,
                          int16_t current_row,
                          int16_t start,
                          int16_t num_frag_parts,
                          int16_t num_blobs,
                          MATRIX *ratings,
                          BLOB_CHOICE_LIST *choice_lists);
  // Merge the fragment lists in choice_lists and append it to the
  // ratings matrix
  void merge_and_put_fragment_lists(int16_t row,
                                    int16_t column,
                                    int16_t num_frag_parts,
                                    BLOB_CHOICE_LIST *choice_lists,
                                    MATRIX *ratings);
  // Filter the fragment list so that the filtered_choices only contain
  // fragments that are in the correct position. choices is the list
  // that we are going to filter. fragment_pos is the position in the
  // fragment that we are looking for and num_frag_parts is the
  // total number of pieces. The result will be appended to
  // filtered_choices.
  void fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices,
                                   int fragment_pos,
                                   int num_frag_parts,
                                   BLOB_CHOICE_LIST *filtered_choices);
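
  // Example (editor's illustrative sketch, not part of the original header):
  // a character broken into 3 fragments occupying rows r, r+1 and r+2 of the
  // ratings matrix would conceptually be collected by a call like
  //
  //   get_fragment_lists(0, r, r, 3, num_blobs, ratings, choice_lists);
  //
  // which recurses over the fragment positions; once all 3 positions have been
  // matched, merge_and_put_fragment_lists() merges the collected choice_lists
  // and stores the combined (unfragmented) choices in the ratings entry that
  // spans chunks r..r+2.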

  // Member variables.

  LanguageModel *language_model_;
  PRIORITY pass2_ok_split;
  // Stores the best choice for the previous word in the paragraph.
  // This variable is modified by PAGE_RES_IT when iterating over
  // words to OCR on the page.
  WERD_CHOICE *prev_word_best_choice_;
  // Sums of blame reasons computed by the blamer.
  GenericVector<int> blame_reasons_;
  // Function used to fill char choice lattices.
  void (Wordrec::*fill_lattice_)(const MATRIX &ratings,
                                 const WERD_CHOICE_LIST &best_choices,
                                 const UNICHARSET &unicharset,
                                 BlamerBundle *blamer_bundle);

 protected:
  inline bool SegSearchDone(int num_futile_classifications) {
    return (language_model_->AcceptableChoiceFound() ||
            num_futile_classifications >=
                segsearch_max_futile_classifications);
  }

  // Updates the language model state recorded for the child entries specified
  // in pending[starting_col]. Enqueues the children of the updated entries
  // into pending and proceeds to update (and remove from pending) all the
  // remaining entries in pending[col] (col >= starting_col). Upon termination
  // of this function all the pending[col] lists will be empty.
  //
  // The arguments:
  //
  // starting_col: index of the column in chunks_record->ratings from
  // which the update should be started
  //
  // pending: list of entries listing chunks_record->ratings entries
  // that should be updated
  //
  // pain_points: priority heap listing the pain points generated by
  // the language model
  //
  // temp_pain_points: temporary storage for tentative pain points generated
  // by the language model after a single call to LanguageModel::UpdateState()
  // (the argument is passed in rather than created before each
  // LanguageModel::UpdateState() call to avoid dynamic memory re-allocation)
  //
  // best_choice_bundle: a collection of variables that should be updated
  // if a new best choice is found
  //
  void UpdateSegSearchNodes(
      float rating_cert_scale,
      int starting_col,
      GenericVector<SegSearchPending>* pending,
      WERD_RES *word_res,
      LMPainPoints *pain_points,
      BestChoiceBundle *best_choice_bundle,
      BlamerBundle *blamer_bundle);

  // Process the given pain point: classify the corresponding blob, enqueue
  // new pain points to join the newly classified blob with its neighbors.
  void ProcessSegSearchPainPoint(float pain_point_priority,
                                 const MATRIX_COORD &pain_point,
                                 const char* pain_point_type,
                                 GenericVector<SegSearchPending>* pending,
                                 WERD_RES *word_res,
                                 LMPainPoints *pain_points,
                                 BlamerBundle *blamer_bundle);
  // Resets enough of the results so that the Viterbi search is re-run.
  // Needed when the n-gram model is enabled, as the multi-length comparison
  // implementation will re-value existing paths to worse values.
  void ResetNGramSearch(WERD_RES* word_res,
                        BestChoiceBundle* best_choice_bundle,
                        GenericVector<SegSearchPending>* pending);

  // Add pain points for classifying blobs on the correct segmentation path
  // (so that we can evaluate correct segmentation path and discover the reason
  // for incorrect result).
  void InitBlamerForSegSearch(WERD_RES *word_res,
                              LMPainPoints *pain_points,
                              BlamerBundle *blamer_bundle,
                              STRING *blamer_debug);
};


}  // namespace tesseract

#endif  // TESSERACT_WORDREC_WORDREC_H_