mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-11 15:09:03 +08:00
01026af5a2
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@652 d0cd1f9f-072b-0410-8dd7-cf729c803f20
437 lines
16 KiB
C++
437 lines
16 KiB
C++
/* -*-C-*-
|
|
********************************************************************************
|
|
*
|
|
* File: pieces.c (Formerly pieces.c)
|
|
* Description:
|
|
* Author: Mark Seaman, OCR Technology
|
|
* Created: Fri Oct 16 14:37:00 1987
|
|
* Modified: Mon May 20 12:12:35 1991 (Mark Seaman) marks@hpgrlt
|
|
* Language: C
|
|
* Package: N/A
|
|
* Status: Reusable Software Component
|
|
*
|
|
* (c) Copyright 1987, Hewlett-Packard Company.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
*
|
|
*********************************************************************************/
|
|
/*----------------------------------------------------------------------
|
|
I n c l u d e s
|
|
----------------------------------------------------------------------*/
|
|
|
|
#include "blobs.h"
|
|
#include "freelist.h"
|
|
#include "helpers.h"
|
|
#include "matchtab.h"
|
|
#include "matrix.h"
|
|
#include "ndminx.h"
|
|
#include "plotseg.h"
|
|
#include "ratngs.h"
|
|
#include "seam.h"
|
|
#include "states.h"
|
|
#include "wordclass.h"
|
|
#include "wordrec.h"
|
|
|
|
// Include automatically generated configuration file if running autoconf.
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "config_auto.h"
|
|
#endif
|
|
|
|
/*----------------------------------------------------------------------
|
|
F u n c t i o n s
|
|
----------------------------------------------------------------------*/
|
|
|
|
|
|
/**********************************************************************
|
|
* bounds_of_piece
|
|
*
|
|
* Find the bounds of the piece that will be created by joining the
|
|
* requested collection of pieces together.
|
|
**********************************************************************/
|
|
TBOX bounds_of_piece(TBOX *bounds, inT16 start, inT16 end) {
|
|
TBOX all_together = bounds[start];
|
|
for (int x = start + 1; x <= end; x++) {
|
|
all_together += bounds[x];
|
|
}
|
|
return all_together;
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* classify_piece
|
|
*
|
|
* Create a larger piece from a collection of smaller ones. Classify
|
|
* it and return the results. Take the large piece apart to leave
|
|
* the collection of small pieces un modified.
|
|
**********************************************************************/
|
|
namespace tesseract {
|
|
BLOB_CHOICE_LIST *Wordrec::classify_piece(TBLOB *pieces,
|
|
const DENORM& denorm,
|
|
SEAMS seams,
|
|
inT16 start,
|
|
inT16 end,
|
|
BlamerBundle *blamer_bundle) {
|
|
BLOB_CHOICE_LIST *choices;
|
|
TBLOB *blob;
|
|
inT16 x;
|
|
|
|
join_pieces(pieces, seams, start, end);
|
|
for (blob = pieces, x = 0; x < start; x++) {
|
|
blob = blob->next;
|
|
}
|
|
choices = classify_blob(blob, denorm, "pieces:", White, blamer_bundle);
|
|
|
|
break_pieces(blob, seams, start, end);
|
|
#ifndef GRAPHICS_DISABLED
|
|
if (wordrec_display_segmentations > 2) {
|
|
STATE current_state;
|
|
SEARCH_STATE chunk_groups;
|
|
set_n_ones (¤t_state, array_count(seams));
|
|
chunk_groups = bin_to_chunks(¤t_state, array_count(seams));
|
|
display_segmentation(pieces, chunk_groups);
|
|
window_wait(segm_window);
|
|
memfree(chunk_groups);
|
|
}
|
|
#endif
|
|
|
|
return (choices);
|
|
}
|
|
|
|
template<class BLOB_CHOICE>
|
|
int SortByUnicharID(const void *void1, const void *void2) {
|
|
const BLOB_CHOICE *p1 = *reinterpret_cast<const BLOB_CHOICE * const *>(void1);
|
|
const BLOB_CHOICE *p2 = *reinterpret_cast<const BLOB_CHOICE * const *>(void2);
|
|
|
|
return p1->unichar_id() - p2->unichar_id();
|
|
}
|
|
|
|
template<class BLOB_CHOICE>
|
|
int SortByRating(const void *void1, const void *void2) {
|
|
const BLOB_CHOICE *p1 = *reinterpret_cast<const BLOB_CHOICE * const *>(void1);
|
|
const BLOB_CHOICE *p2 = *reinterpret_cast<const BLOB_CHOICE * const *>(void2);
|
|
|
|
if (p1->rating() < p2->rating())
|
|
return 1;
|
|
return -1;
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* fill_filtered_fragment_list
|
|
*
|
|
* Filter the fragment list so that the filtered_choices only contain
|
|
* fragments that are in the correct position. choices is the list
|
|
* that we are going to filter. fragment_pos is the position in the
|
|
* fragment that we are looking for and num_frag_parts is the the
|
|
* total number of pieces. The result will be appended to
|
|
* filtered_choices.
|
|
**********************************************************************/
|
|
void Wordrec::fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices,
|
|
int fragment_pos,
|
|
int num_frag_parts,
|
|
BLOB_CHOICE_LIST *filtered_choices) {
|
|
BLOB_CHOICE_IT filtered_choices_it(filtered_choices);
|
|
BLOB_CHOICE_IT choices_it(choices);
|
|
|
|
for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
|
|
choices_it.forward()) {
|
|
UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
|
|
const CHAR_FRAGMENT *frag = unicharset.get_fragment(choice_unichar_id);
|
|
|
|
if (frag != NULL && frag->get_pos() == fragment_pos &&
|
|
frag->get_total() == num_frag_parts) {
|
|
// Recover the unichar_id of the unichar that this fragment is
|
|
// a part of
|
|
BLOB_CHOICE *b = new BLOB_CHOICE(*choices_it.data());
|
|
int original_unichar = unicharset.unichar_to_id(frag->get_unichar());
|
|
b->set_unichar_id(original_unichar);
|
|
filtered_choices_it.add_to_end(b);
|
|
}
|
|
}
|
|
|
|
filtered_choices->sort(SortByUnicharID<BLOB_CHOICE>);
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* merge_and_put_fragment_lists
|
|
*
|
|
* Merge the fragment lists in choice_lists and append it to the
|
|
* ratings matrix.
|
|
**********************************************************************/
|
|
void Wordrec::merge_and_put_fragment_lists(inT16 row, inT16 column,
|
|
inT16 num_frag_parts,
|
|
BLOB_CHOICE_LIST *choice_lists,
|
|
MATRIX *ratings) {
|
|
BLOB_CHOICE_IT *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts];
|
|
|
|
for (int i = 0; i < num_frag_parts; i++) {
|
|
choice_lists_it[i].set_to_list(&choice_lists[i]);
|
|
choice_lists_it[i].mark_cycle_pt();
|
|
}
|
|
|
|
BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column);
|
|
if (merged_choice == NULL)
|
|
merged_choice = new BLOB_CHOICE_LIST;
|
|
|
|
bool end_of_list = false;
|
|
BLOB_CHOICE_IT merged_choice_it(merged_choice);
|
|
while (!end_of_list) {
|
|
// Find the maximum unichar_id of the current entry the iterators
|
|
// are pointing at
|
|
UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id();
|
|
int max_list = 0;
|
|
for (int i = 0; i < num_frag_parts; i++) {
|
|
UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
|
|
if (max_unichar_id < unichar_id) {
|
|
max_unichar_id = unichar_id;
|
|
max_list = i;
|
|
}
|
|
}
|
|
|
|
// Move the each iterators until it gets to an entry that has a
|
|
// value greater than or equal to max_unichar_id
|
|
for (int i = 0; i < num_frag_parts; i++) {
|
|
UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
|
|
while (!choice_lists_it[i].cycled_list() &&
|
|
unichar_id < max_unichar_id) {
|
|
choice_lists_it[i].forward();
|
|
unichar_id = choice_lists_it[i].data()->unichar_id();
|
|
}
|
|
if (choice_lists_it[i].cycled_list()) {
|
|
end_of_list = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (end_of_list)
|
|
break;
|
|
|
|
// Checks if the fragments are parts of the same character
|
|
UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id();
|
|
bool same_unichar = true;
|
|
for (int i = 1; i < num_frag_parts; i++) {
|
|
UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
|
|
if (unichar_id != first_unichar_id) {
|
|
same_unichar = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (same_unichar) {
|
|
// Add the merged character to the result
|
|
UNICHAR_ID merged_unichar_id = first_unichar_id;
|
|
inT16 merged_fontinfo_id = choice_lists_it[0].data()->fontinfo_id();
|
|
inT16 merged_fontinfo_id2 = choice_lists_it[0].data()->fontinfo_id2();
|
|
inT16 merged_min_xheight = choice_lists_it[0].data()->min_xheight();
|
|
inT16 merged_max_xheight = choice_lists_it[0].data()->max_xheight();
|
|
int merged_script_id = choice_lists_it[0].data()->script_id();
|
|
bool merged_adapted = choice_lists_it[0].data()->adapted();
|
|
|
|
float merged_rating = 0, merged_certainty = 0;
|
|
for (int i = 0; i < num_frag_parts; i++) {
|
|
float rating = choice_lists_it[i].data()->rating();
|
|
float certainty = choice_lists_it[i].data()->certainty();
|
|
|
|
if (i == 0 || certainty < merged_certainty)
|
|
merged_certainty = certainty;
|
|
merged_rating += rating;
|
|
|
|
choice_lists_it[i].forward();
|
|
if (choice_lists_it[i].cycled_list())
|
|
end_of_list = true;
|
|
IntersectRange(choice_lists_it[i].data()->min_xheight(),
|
|
choice_lists_it[i].data()->max_xheight(),
|
|
&merged_min_xheight, &merged_max_xheight);
|
|
}
|
|
|
|
merged_choice_it.add_to_end(new BLOB_CHOICE(merged_unichar_id,
|
|
merged_rating,
|
|
merged_certainty,
|
|
merged_fontinfo_id,
|
|
merged_fontinfo_id2,
|
|
merged_script_id,
|
|
merged_min_xheight,
|
|
merged_max_xheight,
|
|
merged_adapted));
|
|
}
|
|
}
|
|
|
|
if (classify_debug_level)
|
|
print_ratings_list("Merged Fragments", merged_choice,
|
|
unicharset);
|
|
|
|
if (merged_choice->empty())
|
|
delete merged_choice;
|
|
else
|
|
ratings->put(row, column, merged_choice);
|
|
|
|
delete [] choice_lists_it;
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* get_fragment_lists
|
|
*
|
|
* Recursively go through the ratings matrix to find lists of fragments
|
|
* to be merged in the function merge_and_put_fragment_lists.
|
|
* current_frag is the postion of the piece we are looking for.
|
|
* current_row is the row in the rating matrix we are currently at.
|
|
* start is the row we started initially, so that we can know where
|
|
* to append the results to the matrix. num_frag_parts is the total
|
|
* number of pieces we are looking for and num_blobs is the size of the
|
|
* ratings matrix.
|
|
**********************************************************************/
|
|
void Wordrec::get_fragment_lists(inT16 current_frag, inT16 current_row,
|
|
inT16 start, inT16 num_frag_parts,
|
|
inT16 num_blobs, MATRIX *ratings,
|
|
BLOB_CHOICE_LIST *choice_lists) {
|
|
if (current_frag == num_frag_parts) {
|
|
merge_and_put_fragment_lists(start, current_row - 1, num_frag_parts,
|
|
choice_lists, ratings);
|
|
return;
|
|
}
|
|
|
|
for (inT16 x = current_row; x < num_blobs; x++) {
|
|
BLOB_CHOICE_LIST *choices = ratings->get(current_row, x);
|
|
if (choices == NULL)
|
|
continue;
|
|
|
|
fill_filtered_fragment_list(choices, current_frag, num_frag_parts,
|
|
&choice_lists[current_frag]);
|
|
if (!choice_lists[current_frag].empty()) {
|
|
get_fragment_lists(current_frag + 1, x + 1, start, num_frag_parts,
|
|
num_blobs, ratings, choice_lists);
|
|
choice_lists[current_frag].clear();
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* merge_fragments
|
|
*
|
|
* Try to merge fragments in the ratings matrix and put the result in
|
|
* the corresponding row and column
|
|
**********************************************************************/
|
|
void Wordrec::merge_fragments(MATRIX *ratings, inT16 num_blobs) {
|
|
BLOB_CHOICE_LIST choice_lists[CHAR_FRAGMENT::kMaxChunks];
|
|
for (inT16 start = 0; start < num_blobs; start++) {
|
|
for (int frag_parts = 2; frag_parts <= CHAR_FRAGMENT::kMaxChunks;
|
|
frag_parts++) {
|
|
get_fragment_lists(0, start, start, frag_parts, num_blobs,
|
|
ratings, choice_lists);
|
|
}
|
|
}
|
|
|
|
// Delete fragments from the rating matrix
|
|
for (inT16 x = 0; x < num_blobs; x++) {
|
|
for (inT16 y = x; y < num_blobs; y++) {
|
|
BLOB_CHOICE_LIST *choices = ratings->get(x, y);
|
|
if (choices != NULL) {
|
|
BLOB_CHOICE_IT choices_it(choices);
|
|
for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
|
|
choices_it.forward()) {
|
|
UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
|
|
const CHAR_FRAGMENT *frag =
|
|
unicharset.get_fragment(choice_unichar_id);
|
|
if (frag != NULL)
|
|
delete choices_it.extract();
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* get_piece_rating
|
|
*
|
|
* Check to see if this piece has already been classified. If it has
|
|
* return that rating. Otherwise build the piece from the smaller
|
|
* pieces, classify it, store the rating for later, and take the piece
|
|
* apart again.
|
|
**********************************************************************/
|
|
BLOB_CHOICE_LIST *Wordrec::get_piece_rating(MATRIX *ratings,
|
|
TBLOB *blobs,
|
|
const DENORM& denorm,
|
|
SEAMS seams,
|
|
inT16 start,
|
|
inT16 end,
|
|
BlamerBundle *blamer_bundle) {
|
|
BLOB_CHOICE_LIST *choices = ratings->get(start, end);
|
|
if (choices == NOT_CLASSIFIED) {
|
|
choices = classify_piece(blobs,
|
|
denorm,
|
|
seams,
|
|
start,
|
|
end,
|
|
blamer_bundle);
|
|
ratings->put(start, end, choices);
|
|
if (wordrec_debug_level > 1) {
|
|
tprintf("get_piece_rating(): updated ratings matrix\n");
|
|
ratings->print(getDict().getUnicharset());
|
|
}
|
|
}
|
|
return (choices);
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* record_blob_bounds
|
|
*
|
|
* Set up and initialize an array that holds the bounds of a set of
|
|
* blobs. Caller should delete[] the array.
|
|
**********************************************************************/
|
|
TBOX *Wordrec::record_blob_bounds(TBLOB *blobs) {
|
|
int nblobs = count_blobs(blobs);
|
|
TBOX *bboxes = new TBOX[nblobs];
|
|
|
|
inT16 x = 0;
|
|
for (TBLOB* blob = blobs; blob != NULL; blob = blob->next) {
|
|
bboxes[x] = blob->bounding_box();
|
|
x++;
|
|
}
|
|
return bboxes;
|
|
}
|
|
|
|
|
|
/**********************************************************************
|
|
* record_piece_ratings
|
|
*
|
|
* Save the choices for all the pieces that have been classified into
|
|
* a matrix that can be used to look them up later. A two dimensional
|
|
* matrix is created. The indices correspond to the starting and
|
|
* ending initial piece number.
|
|
**********************************************************************/
|
|
MATRIX *Wordrec::record_piece_ratings(TBLOB *blobs) {
|
|
inT16 num_blobs = count_blobs(blobs);
|
|
TBOX *bounds = record_blob_bounds(blobs);
|
|
MATRIX *ratings = new MATRIX(num_blobs);
|
|
|
|
for (int x = 0; x < num_blobs; x++) {
|
|
for (int y = x; y < num_blobs; y++) {
|
|
TBOX piecebox = bounds_of_piece(bounds, x, y);
|
|
BLOB_CHOICE_LIST *choices = blob_match_table.get_match_by_box(piecebox);
|
|
if (choices != NULL) {
|
|
ratings->put(x, y, choices);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (merge_fragments_in_matrix)
|
|
merge_fragments(ratings, num_blobs);
|
|
|
|
delete []bounds;
|
|
return ratings;
|
|
}
|
|
|
|
} // namespace tesseract
|