2007-03-08 04:03:40 +08:00
|
|
|
/* -*-C-*-
|
|
|
|
********************************************************************************
|
|
|
|
*
|
|
|
|
* File: bestfirst.c (Formerly bestfirst.c)
|
|
|
|
* Description: Best first search functions
|
|
|
|
* Author: Mark Seaman, OCR Technology
|
|
|
|
* Created: Mon May 14 11:23:29 1990
|
|
|
|
* Modified: Tue Jul 30 16:08:47 1991 (Mark Seaman) marks@hpgrlt
|
|
|
|
* Language: C
|
|
|
|
* Package: N/A
|
|
|
|
* Status: Experimental (Do Not Distribute)
|
|
|
|
*
|
|
|
|
* (c) Copyright 1990, Hewlett-Packard Company.
|
|
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
** you may not use this file except in compliance with the License.
|
|
|
|
** You may obtain a copy of the License at
|
|
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
** See the License for the specific language governing permissions and
|
|
|
|
** limitations under the License.
|
|
|
|
*
|
|
|
|
***************************************************************************/
|
|
|
|
|
|
|
|
/*----------------------------------------------------------------------
|
|
|
|
I n c l u d e s
|
|
|
|
---------------------------------------------------------------------*/
|
2009-06-03 05:57:23 +08:00
|
|
|
#include <assert.h>
|
|
|
|
|
2007-03-08 04:03:40 +08:00
|
|
|
#include "bestfirst.h"
|
2009-07-11 11:05:57 +08:00
|
|
|
#include "baseline.h"
|
2007-03-08 04:03:40 +08:00
|
|
|
#include "bitvec.h"
|
2009-07-11 11:05:57 +08:00
|
|
|
#include "callback.h"
|
|
|
|
#include "dict.h"
|
2007-03-08 04:03:40 +08:00
|
|
|
#include "freelist.h"
|
2009-07-11 11:05:57 +08:00
|
|
|
#include "globals.h"
|
|
|
|
#include "heuristic.h"
|
|
|
|
#include "metrics.h"
|
2007-03-08 04:03:40 +08:00
|
|
|
#include "permute.h"
|
2009-07-11 11:05:57 +08:00
|
|
|
#include "pieces.h"
|
|
|
|
#include "plotseg.h"
|
|
|
|
#include "ratngs.h"
|
|
|
|
#include "states.h"
|
|
|
|
#include "stopper.h"
|
2007-03-08 04:03:40 +08:00
|
|
|
#include "structures.h"
|
2009-07-11 11:05:57 +08:00
|
|
|
#include "tordvars.h"
|
|
|
|
#include "unicharset.h"
|
2007-03-08 04:03:40 +08:00
|
|
|
#include "wordclass.h"
|
2009-07-11 11:05:57 +08:00
|
|
|
#include "wordrec.h"
|
2007-03-08 04:03:40 +08:00
|
|
|
|
2007-08-31 02:18:35 +08:00
|
|
|
void call_caller();
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
/*----------------------------------------------------------------------
|
|
|
|
V a r i a b l e s
|
|
|
|
---------------------------------------------------------------------*/
|
|
|
|
/* Joint count of the word currently being searched; best_first_search sets
   it to ratings->dimension() - 1 and the debug print_state calls read it. */
int num_joints; /* Number of chunks - 1 */
|
|
|
|
/* Running count of states stored on the open-state heap by push_queue. */
int num_pushed = 0;
|
|
|
|
/* Running count of queue pops performed by best_first_search. */
int num_popped = 0;
|
|
|
|
|
2009-07-11 11:05:57 +08:00
|
|
|
/* best_first_search stops once more than this many states have been
   evaluated; new_search sizes the open-state heap at 20x this value. */
INT_VAR(wordrec_num_seg_states, 30, "Segmentation states");
|
|
|
|
|
|
|
|
/* Lower bound applied to the initial worst_priority cutoff in
   best_first_search. */
double_VAR(wordrec_worst_state, 1, "Worst segmentation state");
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
/**/
|
|
|
|
/*----------------------------------------------------------------------
|
|
|
|
F u n c t i o n s
|
|
|
|
----------------------------------------------------------------------*/
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* best_first_search
|
|
|
|
*
|
|
|
|
* Find the best segmentation by doing a best first search of the
|
|
|
|
* solution space.
|
|
|
|
**********************************************************************/
|
2009-07-11 11:05:57 +08:00
|
|
|
namespace tesseract {
|
|
|
|
void Wordrec::best_first_search(CHUNKS_RECORD *chunks_record,
|
|
|
|
WERD_CHOICE *best_choice,
|
|
|
|
WERD_CHOICE *raw_choice,
|
|
|
|
STATE *state,
|
|
|
|
DANGERR *fixpt,
|
|
|
|
STATE *best_state) {
|
2007-03-08 04:03:40 +08:00
|
|
|
SEARCH_RECORD *the_search;
|
2008-04-22 08:35:16 +08:00
|
|
|
inT16 keep_going;
|
2009-07-11 11:05:57 +08:00
|
|
|
STATE guided_state; // not used
|
|
|
|
|
|
|
|
num_joints = chunks_record->ratings->dimension() - 1;
|
|
|
|
the_search = new_search(chunks_record, num_joints,
|
|
|
|
best_choice, raw_choice, state);
|
|
|
|
|
|
|
|
// The default state is initialized as the best choice. In order to apply
|
|
|
|
// segmentation adjustment, or any other contextual processing in permute,
|
|
|
|
// we give the best choice a poor rating to force the processed raw choice
|
|
|
|
// to be promoted to best choice.
|
|
|
|
the_search->best_choice->set_rating(100000.0);
|
|
|
|
evaluate_state(chunks_record, the_search, fixpt);
|
|
|
|
if (permute_debug) {
|
|
|
|
tprintf("\n\n\n =========== BestFirstSearch ==============\n");
|
|
|
|
best_choice->print("**Initial BestChoice**");
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
#ifndef GRAPHICS_DISABLED
|
2007-08-31 02:18:35 +08:00
|
|
|
save_best_state(chunks_record);
|
2007-03-08 04:03:40 +08:00
|
|
|
#endif
|
2007-08-31 02:18:35 +08:00
|
|
|
start_recording();
|
2009-07-11 11:05:57 +08:00
|
|
|
FLOAT32 worst_priority = 2.0f * prioritize_state(chunks_record, the_search);
|
|
|
|
if (worst_priority < wordrec_worst_state)
|
|
|
|
worst_priority = wordrec_worst_state;
|
|
|
|
if (segment_debug) {
|
|
|
|
print_state("BestFirstSearch", best_state, num_joints);
|
|
|
|
}
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
guided_state = *state;
|
|
|
|
do {
|
|
|
|
/* Look for answer */
|
|
|
|
if (!hash_lookup (the_search->closed_states, the_search->this_state)) {
|
|
|
|
|
2009-07-11 11:05:57 +08:00
|
|
|
if (tord_blob_skip) {
|
2007-03-08 04:03:40 +08:00
|
|
|
free_state (the_search->this_state);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
guided_state = *(the_search->this_state);
|
2009-07-11 11:05:57 +08:00
|
|
|
keep_going = evaluate_state(chunks_record, the_search, fixpt);
|
2007-03-08 04:03:40 +08:00
|
|
|
hash_add (the_search->closed_states, the_search->this_state);
|
|
|
|
|
|
|
|
if (!keep_going ||
|
2009-07-11 11:05:57 +08:00
|
|
|
(the_search->num_states > wordrec_num_seg_states) ||
|
|
|
|
(tord_blob_skip)) {
|
|
|
|
if (segment_debug)
|
|
|
|
tprintf("Breaking best_first_search on keep_going %s numstates %d\n",
|
|
|
|
((keep_going) ? "T" :"F"), the_search->num_states);
|
2007-03-08 04:03:40 +08:00
|
|
|
free_state (the_search->this_state);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2009-07-11 11:05:57 +08:00
|
|
|
FLOAT32 new_worst_priority = 2.0f * prioritize_state(chunks_record,
|
|
|
|
the_search);
|
|
|
|
if (new_worst_priority < worst_priority) {
|
|
|
|
if (segment_debug)
|
|
|
|
tprintf("Lowering WorstPriority %f --> %f\n",
|
|
|
|
worst_priority, new_worst_priority);
|
|
|
|
// Tighten the threshold for admitting new paths as better search
|
|
|
|
// candidates are found. After lowering this threshold, we can safely
|
|
|
|
// popout everything that is worse than this score also.
|
|
|
|
worst_priority = new_worst_priority;
|
|
|
|
}
|
2007-08-31 02:18:35 +08:00
|
|
|
expand_node(worst_priority, chunks_record, the_search);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
free_state (the_search->this_state);
|
|
|
|
num_popped++;
|
|
|
|
the_search->this_state = pop_queue (the_search->open_states);
|
2009-07-11 11:05:57 +08:00
|
|
|
if (segment_debug && !the_search->this_state)
|
|
|
|
tprintf("No more states to evalaute after %d evals", num_popped);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
|
|
|
while (the_search->this_state);
|
|
|
|
|
|
|
|
state->part1 = the_search->best_state->part1;
|
|
|
|
state->part2 = the_search->best_state->part2;
|
2007-08-31 02:18:35 +08:00
|
|
|
stop_recording();
|
2009-07-11 11:05:57 +08:00
|
|
|
if (permute_debug) {
|
|
|
|
tprintf("\n\n\n =========== BestFirstSearch ==============\n");
|
|
|
|
// best_choice->debug_string(getDict().getUnicharset()).string());
|
|
|
|
best_choice->print("**Final BestChoice**");
|
|
|
|
}
|
|
|
|
// save the best_state stats
|
2007-08-31 02:18:35 +08:00
|
|
|
delete_search(the_search);
|
2007-03-08 04:03:40 +08:00
|
|
|
}
|
2009-07-11 11:05:57 +08:00
|
|
|
} // namespace tesseract
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* chunks_width
|
|
|
|
*
|
2009-07-11 11:05:57 +08:00
|
|
|
* Return the width of a chunk which is composed of several blobs
|
|
|
|
* blobs[start_blob..last_blob] inclusively,
|
|
|
|
* whose individual widths and gaps are recorded in width_record in the form
|
|
|
|
* width_record->num_char = n
|
|
|
|
* width_record->widths[2*n-1] = w0,g0,w1,g1..g(n-2),w(n-1)
|
2007-03-08 04:03:40 +08:00
|
|
|
**********************************************************************/
|
2009-07-11 11:05:57 +08:00
|
|
|
// Adds up every entry of width_record->widths from index 2*start_blob to
// index 2*last_blob inclusive, i.e. the widths of the blobs in the range
// together with the gaps recorded between them.
int chunks_width(WIDTH_RECORD *width_record, int start_blob, int last_blob) {
  const int final_index = last_blob * 2;
  int total = 0;
  int index = start_blob * 2;
  while (index <= final_index) {
    total += width_record->widths[index];
    ++index;
  }
  return total;
}
|
|
|
|
|
2009-07-11 11:05:57 +08:00
|
|
|
/**********************************************************************
|
|
|
|
* chunks_gap
|
|
|
|
*
|
|
|
|
* Return the width of the gap between the specified chunk and the next one.
|
|
|
|
**********************************************************************/
|
|
|
|
// Looks up the gap entry stored right after the width of last_chunk.
// Returns 0 for the final chunk (or anything past it).
int chunks_gap(WIDTH_RECORD *width_record, int last_chunk) {
  if (last_chunk >= width_record->num_chars - 1)
    return 0;
  return width_record->widths[last_chunk * 2 + 1];
}
|
|
|
|
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* delete_search
|
|
|
|
*
|
|
|
|
* Terminate the current search and free all the memory involved.
|
|
|
|
**********************************************************************/
|
2007-08-31 02:18:35 +08:00
|
|
|
// Records the statistics of a finished search and releases the search
// record together with its first/best states, closed-state hash table and
// open-state heap.  The current state (this_state) is not freed here.
void delete_search(SEARCH_RECORD *the_search) {
  // Fraction of joints that differ between the first and the best state;
  // left at zero when there are no joints at all.
  float closeness = 0.0f;
  if (the_search->num_joints) {
    closeness =
        hamming_distance(reinterpret_cast<uinT32*>(the_search->first_state),
                         reinterpret_cast<uinT32*>(the_search->best_state), 2) /
        (float) the_search->num_joints;
  }

  record_search_status(the_search->num_states,
                       the_search->before_best, closeness);

  free_state(the_search->first_state);
  free_state(the_search->best_state);

  free_hash_table(the_search->closed_states);
  FreeHeapData(the_search->open_states, (void_dest) free_state);

  memfree(the_search);
}
|
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* evaluate_chunks
|
|
|
|
*
|
|
|
|
* A particular word level segmentation has been chosen. Evaluate
|
|
|
|
* this to find the word list that corresponds to it.
|
|
|
|
**********************************************************************/
|
2009-07-11 11:05:57 +08:00
|
|
|
namespace tesseract {
|
|
|
|
// Walks the pieces described by search_state (search_state[0] holds the
// number of listed pieces, one extra trailing piece runs up to the last
// chunk), obtains the choice list of each piece and records its certainty,
// rating, width and gap in last_segmentation.
//
// Returns a newly allocated vector of the per-piece choice lists, or NULL
// if tord_blob_skip is set or a piece has no rating.
BLOB_CHOICE_LIST_VECTOR *Wordrec::evaluate_chunks(CHUNKS_RECORD *chunks_record,
                                                  SEARCH_STATE search_state) {
  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
  BLOB_CHOICE_IT choice_it;
  int first_chunk = 0;

  /* Iterate sub-paths */
  for (int piece = 1; piece <= search_state[0] + 1; piece++) {
    int last_chunk;
    if (piece <= search_state[0])
      last_chunk = first_chunk + search_state[piece];
    else
      last_chunk = count_blobs(chunks_record->chunks) - 1;  // trailing piece

    if (tord_blob_skip) {
      delete char_choices;
      return NULL;
    }

    /* Process one square: classify if needed */
    BLOB_CHOICE_LIST *piece_choices =
        get_piece_rating(chunks_record->ratings,
                         chunks_record->chunks,
                         chunks_record->splits,
                         first_chunk, last_chunk);
    if (piece_choices == NULL) {
      delete char_choices;
      return NULL;
    }

    /* Add permuted ratings */
    const int slot = piece - 1;
    choice_it.set_to_list(piece_choices);
    last_segmentation[slot].certainty = choice_it.data()->certainty();
    last_segmentation[slot].match = choice_it.data()->rating();
    last_segmentation[slot].width =
        chunks_width(chunks_record->chunk_widths, first_chunk, last_chunk);
    last_segmentation[slot].gap =
        chunks_gap(chunks_record->chunk_widths, last_chunk);

    *char_choices += piece_choices;
    first_chunk = last_chunk + 1;
  }
  return char_choices;
}
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* evaluate_state
|
|
|
|
*
|
|
|
|
* Evaluate the segmentation that is represented by this state in the
|
|
|
|
* best first search. Add this state to the "states_seen" list.
|
|
|
|
**********************************************************************/
|
2009-07-11 11:05:57 +08:00
|
|
|
// Evaluates the segmentation held in the_search->this_state: classifies its
// pieces, runs the permuter on the resulting choices and, if the best
// choice rating changed, records this state as the new best state and
// refreshes the character widths.
//
// Returns FALSE when the dictionary reports an acceptable choice (the
// search may stop), TRUE otherwise.
inT16 Wordrec::evaluate_state(CHUNKS_RECORD *chunks_record,
                              SEARCH_RECORD *the_search,
                              DANGERR *fixpt) {
  // Rating before this evaluation; a change afterwards means the permuter
  // replaced the best choice.
  float rating_limit = the_search->best_choice->rating();
  inT16 keep_going = TRUE;
  PIECES_STATE widths;

  the_search->num_states++;
  SEARCH_STATE chunk_groups = bin_to_chunks(the_search->this_state,
                                            the_search->num_joints);
  bin_to_pieces(the_search->this_state, the_search->num_joints, widths);
  getDict().LogNewSegmentation(widths);

  BLOB_CHOICE_LIST_VECTOR *char_choices =
      evaluate_chunks(chunks_record, chunk_groups);
  const bool have_choices = (char_choices != NULL);

  wordseg_rating_adjust_factor = -1.0f;
  if (have_choices && char_choices->length() > 0) {
    // Compute the segmentation cost and include the cost in word rating.
    // TODO(dsl): We should change the SEARCH_RECORD to store this cost
    // from state evaluation and avoid recomputing it here.
    prioritize_state(chunks_record, the_search);
    wordseg_rating_adjust_factor = the_search->segcost_bias;
    getDict().permute_characters(*char_choices, rating_limit,
                                 the_search->best_choice,
                                 the_search->raw_choice);
    bool replaced = false;
    if (getDict().AcceptableChoice(char_choices, the_search->best_choice,
                                   *(the_search->raw_choice), fixpt,
                                   ASSOCIATOR_CALLER, &replaced)) {
      keep_going = FALSE;
    }
  }
  wordseg_rating_adjust_factor = -1.0f;

#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations) {
    display_segmentation(chunks_record->chunks, chunk_groups);
    if (wordrec_display_segmentations > 1)
      window_wait(segm_window);
  }
#endif

  const bool best_changed = (rating_limit != the_search->best_choice->rating());
  if (best_changed) {
    the_search->before_best = the_search->num_states;
    the_search->best_state->part1 = the_search->this_state->part1;
    the_search->best_state->part2 = the_search->this_state->part2;
    replace_char_widths(chunks_record, chunk_groups);
  } else if (have_choices) {
    fixpt->index = -1;
  }

  delete char_choices;  // deleting a NULL pointer is a no-op
  memfree(chunk_groups);

  return keep_going;
}
|
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* rebuild_current_state
|
|
|
|
*
|
|
|
|
* Rebuild the list of character choices for the segmentation described by
|
|
|
|
* the given state, merging character fragments into whole characters.
|
|
|
|
**********************************************************************/
|
2009-07-11 11:05:57 +08:00
|
|
|
// Rebuilds the per-character choice lists for the segmentation in `state`.
//
//   blobs          blob list of the word; pieces are joined in place.
//   seam_list      seams of the word; its count gives the number of joints.
//   state          segmentation state to rebuild.
//   old_choices    previous choice lists.  Returned unchanged when nothing
//                  has to be rebuilt; otherwise consumed (deleted) here.
//   fx             forwarded to join_blobs_and_classify.
//   force_rebuild  rebuild even if the best choice has no fragments.
//   best_choice    word whose fragment/unichar lengths drive the merging.
//   ratings        optional ratings matrix used to look up classifications.
//
// Returns old_choices if !force_rebuild and no character is built from
// fragments; otherwise a newly allocated vector with one choice list per
// character.
//
// NOTE(review): the result of join_blobs_and_classify is used without a
// NULL check below; it looks like it can be NULL for an unclassified
// single piece -- confirm against callers.
BLOB_CHOICE_LIST_VECTOR *Wordrec::rebuild_current_state(
    TBLOB *blobs,
    SEAMS seam_list,
    STATE *state,
    BLOB_CHOICE_LIST_VECTOR *old_choices,
    int fx,
    bool force_rebuild,
    const WERD_CHOICE &best_choice,
    const MATRIX *ratings) {
  // Initialize search_state, num_joints, x, y.
  int num_joints = array_count(seam_list);
#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations) {
    print_state("Rebuiling state", state, num_joints);
  }
#endif
  SEARCH_STATE search_state = bin_to_chunks(state, num_joints);
  int x = 0;
  int y;
  int i;
  // Advance x to the first chunk of the last piece; pieces are then
  // processed from the last one back to the first.
  for (i = 1; i <= search_state[0]; i++) {
    y = x + search_state[i];
    x = y + 1;
  }
  y = count_blobs(blobs) - 1;

  // Initialize char_choices, expanded_fragment_lengths:
  // e.g. if fragment_lengths = {1 1 2 3 1},
  // expanded_fragment_lengths_str = {1 1 2 2 3 3 3 1}.
  BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
  STRING expanded_fragment_lengths_str = "";
  bool state_has_fragments = false;
  const char *fragment_lengths = NULL;

  if (best_choice.length() > 0) {
    fragment_lengths = best_choice.fragment_lengths();
  }
  if (fragment_lengths) {
    for (int c = 0; c < best_choice.length(); ++c) {
      *char_choices += NULL;
      if (fragment_lengths[c] > 1) {
        state_has_fragments = true;
      }
      for (int j = 0; j < fragment_lengths[c]; ++j) {
        expanded_fragment_lengths_str += fragment_lengths[c];
      }
    }
  } else {
    for (i = 0; i <= search_state[0]; ++i) {
      expanded_fragment_lengths_str += (char)1;
      *char_choices += NULL;
    }
  }

  // Finish early if force_rebuild is false and there are no fragments to
  // merge.
  if (!force_rebuild && !state_has_fragments) {
    delete char_choices;
    memfree(search_state);
    return old_choices;
  }

  // Set up variables for concatenating fragments.
  const char *word_lengths_begin = NULL;  // start of the unichar lengths
  const char *word_lengths_ptr = NULL;
  const char *word_ptr = NULL;
  if (state_has_fragments) {
    // Make word_lengths_ptr point to the last element in
    // best_choice->unichar_lengths().
    word_lengths_begin = best_choice.unichar_lengths().string();
    word_lengths_ptr = word_lengths_begin + (strlen(word_lengths_begin) - 1);
    // Make word_ptr point to the beginning of the last
    // unichar in best_choice->unichar_string().
    word_ptr = best_choice.unichar_string().string();
    word_ptr += (strlen(word_ptr) - *word_lengths_ptr);
  }
  const char *expanded_fragment_lengths =
      expanded_fragment_lengths_str.string();
  bool merging_fragment = false;
  int true_y = -1;
  char unichar[UNICHAR_LEN + 1];
  int fragment_pieces = -1;
  float rating = 0.0;
  float certainty = -MAX_FLOAT32;

  // Populate char_choices list such that it corresponds to search_state.
  //
  // If we are rebuilding a state that contains character fragments:
  // -- combine blobs that belong to character fragments
  // -- re-classify the blobs to obtain choices list for the merged blob
  // -- ensure that correct classification appears in the new choices list
  // NOTE: a choice composed from original fragment choices will always be
  // added to the new choices list for each character composed from
  // fragments (even if the choice for the corresponding character appears
  // in the re-classified choices list for the newly merged blob).
  BLOB_CHOICE_IT temp_it;
  int char_choices_index = char_choices->length() - 1;
  for (i = search_state[0]; i >= 0; i--) {
    BLOB_CHOICE_LIST *current_choices = join_blobs_and_classify(
        blobs, seam_list, x, y, fx, ratings, old_choices);
    // Combine character fragments.
    if (expanded_fragment_lengths[i] > 1) {
      // Start merging character fragments.
      if (!merging_fragment) {
        merging_fragment = true;
        true_y = y;
        fragment_pieces = expanded_fragment_lengths[i];
        rating = 0.0;
        certainty = -MAX_FLOAT32;
        strncpy(unichar, word_ptr, *word_lengths_ptr);
        unichar[*word_lengths_ptr] = '\0';
      }
      // Take into account the fact that we could have joined pieces
      // since we first recorded the ending point of a fragment (true_y).
      true_y -= y - x;
      // Populate fragment with updated values and look for the
      // fragment with the same values in current_choices.
      // Update rating and certainty of the character being composed.
      fragment_pieces--;
      CHAR_FRAGMENT fragment;
      fragment.set_all(unichar, fragment_pieces,
                       expanded_fragment_lengths[i]);
      temp_it.set_to_list(current_choices);
      for (temp_it.mark_cycle_pt(); !temp_it.cycled_list();
           temp_it.forward()) {
        const CHAR_FRAGMENT *current_fragment =
            getDict().getUnicharset().get_fragment(temp_it.data()->unichar_id());
        if (current_fragment && fragment.equals(current_fragment)) {
          rating += temp_it.data()->rating();
          if (temp_it.data()->certainty() > certainty) {
            certainty = temp_it.data()->certainty();
          }
          break;
        }
      }
      assert(!temp_it.cycled_list());  // make sure we found the fragment
      // Free current_choices for the fragmented character.
      delete current_choices;

      // Finish composing character from fragments.
      if (fragment_pieces == 0) {
        // Populate current_choices with the classification of
        // the blob merged from blobs of each character fragment.
        current_choices = join_blobs_and_classify(blobs, seam_list, x,
                                                  true_y, fx, ratings, NULL);
        BLOB_CHOICE *merged_choice =
            new BLOB_CHOICE(getDict().getUnicharset().unichar_to_id(unichar),
                            rating, certainty, 0, NO_PERM);

        // Insert merged_choice into current_choices, such that
        // current_choices are still sorted in non-descending order by rating.
        ASSERT_HOST(!current_choices->empty());
        temp_it.set_to_list(current_choices);
        for (temp_it.mark_cycle_pt();
             !temp_it.cycled_list() &&
             merged_choice->rating() > temp_it.data()->rating();
             temp_it.forward());
        temp_it.add_before_stay_put(merged_choice);

        // Done merging this fragmented character.
        merging_fragment = false;
      }
    }
    if (!merging_fragment) {
      // Get rid of fragments in current_choices.
      temp_it.set_to_list(current_choices);
      for (temp_it.mark_cycle_pt(); !temp_it.cycled_list();
           temp_it.forward()) {
        if (getDict().getUnicharset().get_fragment(
            temp_it.data()->unichar_id())) {
          delete temp_it.extract();
        }
      }
      char_choices->set(current_choices, char_choices_index);
      char_choices_index--;

      // Step word_ptr and word_lengths_ptr back to the previous unichar.
      // Stop at the first unichar: stepping further would read the byte in
      // front of the lengths buffer.
      if (word_lengths_ptr != NULL && word_ptr != NULL &&
          word_lengths_ptr > word_lengths_begin) {
        word_lengths_ptr--;
        word_ptr -= (*word_lengths_ptr);
      }
    }
    y = x - 1;
    x = y - search_state[i];
  }
  // The old lists have been consumed or superseded.  old_choices may be
  // NULL (join_blobs_and_classify already allows for that).
  if (old_choices != NULL) {
    old_choices->delete_data_pointers();
    delete old_choices;
  }
  memfree(search_state);

  return (char_choices);
}
|
2009-07-11 11:05:57 +08:00
|
|
|
} // namespace tesseract
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* expand_node
|
|
|
|
*
|
|
|
|
* Create the states that are attached to this one. Check to see that
|
|
|
|
* each one has not already been visited. If not add it to the priority
|
|
|
|
* queue.
|
|
|
|
**********************************************************************/
|
2009-07-11 11:05:57 +08:00
|
|
|
namespace tesseract {
|
|
|
|
// Generates every neighbour of the_search->this_state obtained by flipping
// exactly one joint bit, and pushes onto the open-state queue those that
// are not already closed and whose priority is below worst_priority.
//
// Bits for joints above the 32nd are flipped in part1, the remaining ones
// in part2.  On return the_search->this_state holds the last neighbour that
// was tried, not the original state.
//
// NOTE(review): a STATE only carries two 32-bit words, so num_joints > 64
// is presumably never passed in -- confirm against callers.
void Wordrec::expand_node(FLOAT32 worst_priority,
                          CHUNKS_RECORD *chunks_record,
                          SEARCH_RECORD *the_search) {
  STATE old_state;
  int x;
  uinT32 mask = 0;

  old_state.part1 = the_search->this_state->part1;
  old_state.part2 = the_search->this_state->part2;

  // We need to expand the search more intelligently, or we get stuck
  // with a bad starting segmentation in a long word sequence as in CJK.
  // Expand a child node only if it is within the global bound, and no
  // worse than 2x of its parent.
  // TODO(dsl): There is some redundancy here in recomputing the priority,
  // and in filtering of old_merit and worst_priority.

  // Only build the part1 mask when part1 is actually in use; for 32 joints
  // or fewer the shift count would be negative, which is undefined.
  if (the_search->num_joints > 32) {
    mask = static_cast<uinT32>(1) << (the_search->num_joints - 1 - 32);
  }
  the_search->this_state->part2 = old_state.part2;
  for (x = the_search->num_joints; x > 32; x--) {
    the_search->this_state->part1 = mask ^ old_state.part1;
    if (!hash_lookup(the_search->closed_states, the_search->this_state)) {
      FLOAT32 new_merit = prioritize_state(chunks_record, the_search);
      if (segment_debug && permute_debug) {
        cprintf ("....checking state: %8.3f ", new_merit);
        print_state("", the_search->this_state, the_search->num_joints);
      }
      if (new_merit < worst_priority) {
        push_queue(the_search->open_states, the_search->this_state,
                   worst_priority, new_merit);
      }
    }
    mask >>= 1;
  }

  // Highest joint bit stored in part2.  Shifts are done on an unsigned
  // value, and skipped entirely when there are no joints.
  if (the_search->num_joints > 32) {
    mask = static_cast<uinT32>(1) << 31;
  } else if (the_search->num_joints > 0) {
    mask = static_cast<uinT32>(1) << (the_search->num_joints - 1);
  } else {
    mask = 0;
  }

  the_search->this_state->part1 = old_state.part1;
  // x is now min(num_joints, 32): one iteration per joint bit in part2.
  while (x--) {
    the_search->this_state->part2 = mask ^ old_state.part2;
    if (!hash_lookup(the_search->closed_states, the_search->this_state)) {
      FLOAT32 new_merit = prioritize_state(chunks_record, the_search);
      if (segment_debug && permute_debug) {
        cprintf ("....checking state: %8.3f ", new_merit);
        print_state("", the_search->this_state, the_search->num_joints);
      }
      if (new_merit < worst_priority) {
        push_queue(the_search->open_states, the_search->this_state,
                   worst_priority, new_merit);
      }
    }
    mask >>= 1;
  }
}
|
2009-07-11 11:05:57 +08:00
|
|
|
} // namespace tesseract
|
2007-03-08 04:03:40 +08:00
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* new_search
|
|
|
|
*
|
|
|
|
* Create and initialize a new search record.
|
|
|
|
**********************************************************************/
|
|
|
|
// Allocates and initializes a search record.
//
//   chunks_record  not used by this function.
//   num_joints     number of joints of the word being searched.
//   best_choice    stored in the record (not copied).
//   raw_choice     stored in the record (not copied).
//   state          starting state; it is copied into this_state,
//                  first_state and best_state.
//
// The open-state heap is sized at 20 * wordrec_num_seg_states.  If state is
// NULL an error is printed and an all-zero state is used as the start, so
// that the record never carries an uninitialized state pointer.
// NOTE(review): whether an all-zero state is the most sensible fallback
// should be confirmed; callers are expected to pass a valid state.
//
// The returned record is released with delete_search().
SEARCH_RECORD *new_search(CHUNKS_RECORD *chunks_record,
                          int num_joints,
                          WERD_CHOICE *best_choice,
                          WERD_CHOICE *raw_choice,
                          STATE *state) {
  SEARCH_RECORD *this_search;

  this_search = (SEARCH_RECORD *) memalloc (sizeof (SEARCH_RECORD));

  this_search->open_states = MakeHeap (wordrec_num_seg_states * 20);
  this_search->closed_states = new_hash_table ();

  if (state) {
    this_search->this_state = new_state (state);
  } else {
    cprintf ("error: bad initial state in new_search\n");
    STATE empty_state;
    empty_state.part1 = 0;
    empty_state.part2 = 0;
    this_search->this_state = new_state (&empty_state);
  }

  this_search->first_state = new_state (this_search->this_state);
  this_search->best_state = new_state (this_search->this_state);

  this_search->best_choice = best_choice;
  this_search->raw_choice = raw_choice;

  this_search->num_joints = num_joints;
  this_search->num_states = 0;
  this_search->before_best = 0;
  this_search->segcost_bias = 0;

  return (this_search);
}
|
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* pop_queue
|
|
|
|
*
|
|
|
|
* Get this state from the priority queue. It should be the state that
|
|
|
|
* has the greatest urgency to be evaluated.
|
|
|
|
**********************************************************************/
|
2007-08-31 02:18:35 +08:00
|
|
|
// Removes the top entry of the open-state heap and returns the state stored
// in it, or NULL when no entry could be fetched.
STATE *pop_queue(HEAP *queue) {
  HEAPENTRY top;

  if (GetTopOfHeap(queue, &top) != OK)
    return NULL;

#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations) {
    cprintf ("eval state: %8.3f ", top.Key);
    print_state("", (STATE *) top.Data, num_joints);
  }
#endif
  return (STATE *) top.Data;
}
|
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* push_queue
|
|
|
|
*
|
|
|
|
* Add this state into the priority queue.
|
|
|
|
**********************************************************************/
|
2007-08-31 02:18:35 +08:00
|
|
|
// Stores a copy of `state` on the open-state heap keyed by `priority`.
// Nothing is stored unless priority is below worst_priority, or when the
// heap has already reached its maximum size.
void push_queue(HEAP *queue, STATE *state, FLOAT32 worst_priority,
                FLOAT32 priority) {
  if (!(priority < worst_priority))
    return;

  if (SizeOfHeap(queue) >= MaxSizeOfHeap(queue)) {
    if (segment_debug) tprintf("Heap is Full\n");
    return;
  }

  if (segment_debug)
    tprintf("\tpushing %d node  %f\n", num_pushed, priority);

  HEAPENTRY new_entry;
  new_entry.Key = priority;
  new_entry.Data = (char *) new_state(state);  // the heap gets its own copy
  num_pushed++;
  HeapStore(queue, &new_entry);
}
|
|
|
|
|
|
|
|
|
|
|
|
/**********************************************************************
|
|
|
|
* replace_char_widths
|
|
|
|
*
|
|
|
|
* Replace the value of the char_width field in the chunks_record with
|
|
|
|
* the updated width measurements from the last_segmentation.
|
|
|
|
**********************************************************************/
|
2007-08-31 02:18:35 +08:00
|
|
|
// Frees chunks_record->char_widths and replaces it with a fresh record
// built from last_segmentation: state[0] + 1 characters, each width
// followed by its gap except for the last character.
void replace_char_widths(CHUNKS_RECORD *chunks_record, SEARCH_STATE state) {
  free_widths(chunks_record->char_widths);

  const int num_blobs = state[0] + 1;
  WIDTH_RECORD *fresh_widths =
      (WIDTH_RECORD *) memalloc(sizeof (int) * num_blobs * 2);
  fresh_widths->num_chars = num_blobs;

  const int final_blob = num_blobs - 1;
  for (int blob = 0; blob < num_blobs; blob++) {
    fresh_widths->widths[2 * blob] = last_segmentation[blob].width;
    // No gap is stored after the final character.
    if (blob != final_blob)
      fresh_widths->widths[2 * blob + 1] = last_segmentation[blob].gap;
  }
  chunks_record->char_widths = fresh_widths;
}
|
2009-07-11 11:05:57 +08:00
|
|
|
|
|
|
|
namespace tesseract {
|
|
|
|
// Produces the choice list for the piece made of blobs x..y.
//
// - Single blob, no ratings matrix, old_choices available: the list is
//   taken out of old_choices (its slot is set to NULL) and returned.
// - Otherwise a deep copy of the ratings entry (x, y) is used when that
//   entry is classified.
// - When x != y the pieces are joined, the now redundant blobs x+1..y are
//   unlinked and discarded, and the merged blob is classified if no
//   ratings entry was found.
//
// May return NULL for a single blob that has no usable ratings entry.
BLOB_CHOICE_LIST *Wordrec::join_blobs_and_classify(
    TBLOB *blobs, SEAMS seam_list,
    int x, int y, int fx, const MATRIX *ratings,
    BLOB_CHOICE_LIST_VECTOR *old_choices) {
  const bool single_blob = (x == y);

  // First check to see if we can look up the classification
  // in old_choices (if there is no need to merge blobs).
  if (single_blob && old_choices != NULL && ratings == NULL) {
    BLOB_CHOICE_LIST *reused = old_choices->get(x);
    old_choices->set(NULL, x);
    return reused;
  }

  // The ratings matrix filled in by the associator will contain the most
  // up-to-date classification info. Thus we look up the classification there
  // first, and only call classify_blob() if the classification is not found.
  BLOB_CHOICE_LIST *choices = NULL;
  if (ratings != NULL) {
    BLOB_CHOICE_LIST *rated = ratings->get(x, y);
    if (rated != NOT_CLASSIFIED) {
      choices = new BLOB_CHOICE_LIST();
      choices->deep_copy(rated, &BLOB_CHOICE::deep_copy);
    }
  }
  if (single_blob)
    return choices;

  join_pieces(blobs, seam_list, x, y);

  // Walk to blob x, remembering its predecessor.
  TBLOB *prev_blob = NULL;
  TBLOB *merged_blob = blobs;
  int index = 0;
  while (index < x) {
    prev_blob = merged_blob;
    merged_blob = merged_blob->next;
    index++;
  }
  // Unlink and discard the blobs that were absorbed into blob x.
  for (; index < y; index++) {
    TBLOB *dead_blob = merged_blob->next;
    merged_blob->next = dead_blob->next;
    oldblob(dead_blob);  // junk dead blobs
  }
  if (choices == NULL) {
    choices = classify_blob(prev_blob, merged_blob, merged_blob->next,
                            NULL, "rebuild", Orange);
  }
  return choices;
}
|
|
|
|
} // namespace tesseract
|