tesseract/dict/reduce.cpp
theraysmith 520077bd41 Fixed name collision with jpeg library
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@164 d0cd1f9f-072b-0410-8dd7-cf729c803f20
2008-04-22 00:42:51 +00:00

425 lines
13 KiB
C++

/* -*-C-*-
********************************************************************************
*
* File: reduce.cpp
* Description: Functions to reduce a TRIE into a DAWG
* Author: Mark Seaman, OCR Technology
* Created: Fri Oct 16 14:37:00 1987
* Modified: Wed Jun 19 16:51:29 1991 (Mark Seaman) marks@hpgrlt
* Language: C
* Package: N/A
* Status: Reusable Software Component
*
* (c) Copyright 1987, Hewlett-Packard Company, all rights reserved.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
********************************************************************************
*/
/*
----------------------------------------------------------------------
I n c l u d e s
----------------------------------------------------------------------
*/
#include "reduce.h"
#include "makedawg.h"
#include "cutil.h"
#ifdef __UNIX__
#include <assert.h>
#endif
/*
----------------------------------------------------------------------
T y p e s
----------------------------------------------------------------------
*/
/*
----------------------------------------------------------------------
V a r i a b l e s
----------------------------------------------------------------------
*/
static inT32 debug_1 = 0;
/*
----------------------------------------------------------------------
M a c r o s
----------------------------------------------------------------------
*/
/*
----------------------------------------------------------------------
F u n c t i o n s
----------------------------------------------------------------------
*/
/**********************************************************************
* collapse_source_nodes
*
* A pair of edges has been found that can be reduced. This function
* accomplishes that reduction by collapsing the two nodes into a
* single node.
**********************************************************************/
void collapse_source_nodes (EDGE_ARRAY dawg,
NODE_REF source_node_1,
NODE_REF source_node_2,
NODE_REF dest_node,
inT32 max_num_edges,
inT32 reserved_edges) {
inT32 num_links;
EDGE_REF edge;
/* NODE_REF new_source_1; */
num_links = num_forward_edges (dawg, source_node_2);
/* if (debug_1)
printf ("Node = %d, Input 1 = %d, Input 2 = %6d, num_links = %d\n",
dest_node, source_node_1, source_node_2, num_links);
if (debug) {
printf ("Node = %d, Input 1 = %d, Input 2 = %6d, num_links = %d\n",
dest_node, source_node_1, source_node_2, num_links);
print_dawg_node (dawg, source_node_1);
print_dawg_node (dawg, source_node_2);
new_line ();
}
*/
/* Remove forward links in */
edge = source_node_2; /* source_node_2 - dest_node */
if (forward_edge (dawg, edge)) {
do {
remove_edge_linkage (dawg, dest_node, source_node_2,
BACKWARD_EDGE,
edge_letter (dawg, edge),
end_of_word (dawg, edge));
} edge_loop (dawg, edge);
}
/* Fix backward links */
edge = source_node_2; /* in source_node_2 */
edge += num_forward_edges (dawg, source_node_2);
if (backward_edge (dawg, edge)) {
do {
move_node_if_needed (dawg, &source_node_1,
max_num_edges, reserved_edges);
add_edge_linkage (dawg, source_node_1, next_node (dawg, edge),
BACKWARD_EDGE,
edge_letter (dawg, edge),
end_of_word (dawg, edge));
/* Node moved */
relocate_edge (dawg, next_node (dawg, edge),
source_node_2, source_node_1);
} edge_loop (dawg, edge);
}
delete_node (dawg, source_node_2);
/* if (debug) {
printf ("Number of edges source = %d, dest = %d\n",
edges_in_node (dawg, new_source_1),
edges_in_node (dawg, dest_node));
print_int ("Number of edges = ", edges_in_node (dawg, dest_node));
print_dawg_node (dawg, new_source_1);
print_string ("_________________________________________");
new_line ();
}
*/
}
/**********************************************************************
* eliminate_redundant_edges
*
* Compare these two edges in this node to see if they point to two
* nodes that could be collapsed. If they do, then perform the
* reduction and return TRUE. If not, return FALSE.
**********************************************************************/
inT32 eliminate_redundant_edges (EDGE_ARRAY dawg,
NODE_REF node,
EDGE_REF edge_1,
EDGE_REF edge_2,
inT32 max_num_edges,
inT32 reserved_edges) {
static inT32 elim_count = 0;
static inT32 keep_count = 0;
if (same_output (dawg,
next_node (dawg, edge_1),
next_node (dawg, edge_2))) {
elim_count++;
collapse_source_nodes (dawg,
next_node (dawg, edge_1),
next_node (dawg, edge_2),
node,
max_num_edges, reserved_edges);
/* if (debug_1) {
printf ("Collapsing node %d\n", node);
print_dawg_node (dawg, node);
printf ("Candidate edges = %d, %d\n", edge_1, edge_2);
printf ("Candidate nodes = %d, %d\n\n",
next_node (dawg, edge_1), next_node (dawg, edge_2));
new_line ();
}
*/
return (TRUE);
}
else {
keep_count++;
}
return (FALSE);
}
/**********************************************************************
* letter_order
*
* Compare two edges to see which one of the letters is larger.
**********************************************************************/
inT32 letter_order (const void* edge1_ptr,
const void* edge2_ptr) {
if (letter_of_edge(*((EDGE_RECORD*) edge1_ptr)) <
letter_of_edge(*((EDGE_RECORD*) edge2_ptr)))
return (-1);
if (letter_of_edge(*((EDGE_RECORD*) edge1_ptr)) >
letter_of_edge(*((EDGE_RECORD*) edge2_ptr)))
return (1);
return (0);
}
/*
printf ("%c%c %c%c ",
edge1.letter, (edge1.flags & WORD_END_FLAG ? '*' : ' '),
edge2.letter, (edge2.flags & WORD_END_FLAG ? '*' : ' '));
printf ("\n");
printf ("+\n");
printf ("-\n");
*/
void print_n_edges (EDGE_RECORD *edge1,
inT32 n) {
EDGE_RECORD *edge;
edge = edge1;
while (n-- > 0) {
printf ("%c ", letter_of_edge(edge[0]));
edge++;
}
new_line ();
}
/**********************************************************************
* reduce_lettered_edges
*
* The edge parameter is pointing to the first edge in a group of edges
* in this node with a particular letter value. Look through these edges
* to see if any of them can be collapsed. If so do it. When all edges
* with this letter have been reduced then return to the caller.
* If further reduction is possible with this same letter then the
* edge parameter is not incremented. When no further reduction is
* possible then FALSE is returned.
**********************************************************************/
inT32 reduce_lettered_edges (EDGE_ARRAY dawg,
EDGE_REF *edge,
NODE_REF node,
NODE_MARKER reduced_nodes,
inT32 max_num_edges,
inT32 reserved_edges) {
EDGE_REF edge_1;
EDGE_REF edge_2;
inT32 fixed_one;
inT32 did_something = FALSE;
if (debug_1)
printf ("reduce_lettered_edges (edge=" REFFORMAT ")\n", *edge);
/* Loop for each back edge */
edge_1 = *edge;
while ((! last_edge (dawg, edge_1)) &&
edge_letter (dawg, edge_1) == edge_letter (dawg, *edge)) {
edge_2 = edge_1 + 1; /* Compare all back edges */
do {
if (edge_letter (dawg, edge_1) < edge_letter (dawg, edge_2))
break;
if (debug_1) {
printf (REFFORMAT " (%c), " REFFORMAT " (%c) ",
edge_1, edge_letter (dawg, edge_1),
edge_2, edge_letter (dawg, edge_2));
}
if (edge_2 != edge_1 &&
edge_letter (dawg, edge_2) == edge_letter (dawg, edge_1) &&
end_of_word (dawg, edge_2) == end_of_word (dawg, edge_1) &&
eliminate_redundant_edges (dawg, node, edge_1, edge_2,
max_num_edges, reserved_edges)) {
reduced_nodes [next_node (dawg, edge_1)] = 0;
fixed_one = TRUE;
did_something = TRUE;
}
else {
if (debug_1) printf (" .");
fixed_one = FALSE;
}
if (debug_1) printf ("\n");
} while (fixed_one ?
edge_occupied (dawg, edge_2) :
(! last_edge (dawg, edge_2++)));
edge_1++;
}
if (! did_something) {
if (last_edge (dawg, edge_1))
return (FALSE);
else
*edge = edge_1;
}
return (TRUE);
}
/**********************************************************************
* reduce_node_input
*
* Eliminate any redundant edges from this node in the DAWG.
**********************************************************************/
void reduce_node_input (EDGE_ARRAY dawg,
NODE_REF node,
NODE_MARKER reduced_nodes,
inT32 max_num_edges,
inT32 reserved_edges) {
EDGE_REF edge_1;
inT32 forward_edges = num_forward_edges (dawg, node);
inT32 backward_edges = edges_in_node (dawg, node) - forward_edges;
static inT32 num_nodes_reduced = 0;
if (debug_1) {
printf ("reduce_node_input (node=" REFFORMAT ")\n", node);
print_dawg_node (dawg, node);
}
if (++num_nodes_reduced % 100 == 0) {
printf ("%d nodes reduced\n", num_nodes_reduced);
if (debug_1 && num_nodes_reduced % 1000 == 0) {
write_full_dawg ("temp-save", dawg, max_num_edges);
}
}
qsort ((void *) &edge_of (dawg, node + forward_edges),
backward_edges,
sizeof (EDGE_RECORD),
letter_order);
/* if (debug_1) {
printf ("__________________________\n");
print_dawg_node (dawg, node);
}
*/
edge_1 = node + forward_edges;
while (reduce_lettered_edges (dawg, &edge_1, node, reduced_nodes,
max_num_edges, reserved_edges));
reduced_nodes [node] = 1; /* Mark as reduced */
if (debug_1) {
printf ("After reduction:\n");
print_dawg_node (dawg, node);
}
edge_1 = node + num_forward_edges (dawg, node); /* Reduce next level */
if (backward_edge (dawg, edge_1)) {
do {
if (next_node (dawg, edge_1) &&
reduced_nodes [next_node (dawg, edge_1)] == 0)
reduce_node_input (dawg, next_node (dawg, edge_1), reduced_nodes,
max_num_edges, reserved_edges);
} edge_loop (dawg, edge_1);
}
}
/**********************************************************************
* same_output
*
* Check to see if these two nodes have identical output. If so then
* they can be collapsed into a single node.
**********************************************************************/
inT32 same_output (EDGE_ARRAY dawg,
NODE_REF node1,
NODE_REF node2) {
if (debug_1) printf ("Edge nodes = " REFFORMAT " , " \
REFFORMAT " \n", node1, node2);
if (num_forward_edges (dawg, node1) == 1 &&
num_forward_edges (dawg, node2) == 1) {
if (debug_1) printf (" * ");
return (TRUE);
}
else {
if (debug_1) {
printf (" %d,%d \n",
num_forward_edges (dawg, node1),
num_forward_edges (dawg, node2));
print_dawg_node (dawg, node1);
print_dawg_node (dawg, node2);
}
return (FALSE);
}
}
/**********************************************************************
* trie_to_dawg
*
* Change a Trie data structure into a DAWG by eliminating the redund
**********************************************************************/
void trie_to_dawg (EDGE_ARRAY dawg,
inT32 max_num_edges,
inT32 reserved_edges) {
NODE_MARKER reduced_nodes;
inT32 x;
max_new_attempts = 100000;
compact_dawg (dawg, max_num_edges, reserved_edges);
reduced_nodes = (NODE_MARKER) malloc (max_num_edges);
for (x=0; x<max_num_edges; x++) reduced_nodes [x] = 0;
reduce_node_input (dawg, 0, reduced_nodes, max_num_edges, reserved_edges);
free (reduced_nodes);
}