tesseract/wordrec/heuristic.cpp

/* -*-C-*-
 ********************************************************************************
 *
 * File:        heuristic.c  (Formerly heuristic.c)
 * Description:
 * Author:       Mark Seaman, OCR Technology
 * Created:      Fri Oct 16 14:37:00 1987
 * Modified:     Wed Jul 10 14:15:08 1991 (Mark Seaman) marks@hpgrlt
 * Language:     C
 * Package:      N/A
 * Status:       Reusable Software Component
 *
 * (c) Copyright 1987, Hewlett-Packard Company.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 *********************************************************************************/
/*----------------------------------------------------------------------
              I n c l u d e s
----------------------------------------------------------------------*/
#include "heuristic.h"
#include "baseline.h"
#include "metrics.h"
#include "freelist.h"
#include <math.h>

/*----------------------------------------------------------------------
              M a c r o s
----------------------------------------------------------------------*/
#define MAX_SQUAT       2.0      /* Width ratio */
#define BAD_RATING   1000.0      /* No valid blob */

/*----------------------------------------------------------------------
              F u n c t i o n s
----------------------------------------------------------------------*/
/**********************************************************************
 * prioritize_state
 *
 * Compute the priority of the segmentation state held in
 * the_search->this_state.  The priority is the sum of two parts:
 *
 *   - the value returned by rating_priority() for that state, and
 *   - the value returned by width_priority() for that state,
 *     multiplied by 1000.
 *
 * Both parts are handed to record_priorities() together with old_state
 * before the total is returned.
 **********************************************************************/
FLOAT32 prioritize_state(CHUNKS_RECORD *chunks_record,
                         SEARCH_RECORD *the_search,
                         STATE *old_state) {
  /* Rating component is evaluated first, then the width component. */
  const FLOAT32 rating_part =
    rating_priority(chunks_record, the_search->this_state,
                    old_state, the_search->num_joints);

  const FLOAT32 width_part =
    width_priority(chunks_record, the_search->this_state,
                   the_search->num_joints) * 1000.0;

  record_priorities(the_search, old_state, rating_part, width_part);

  return width_part + rating_part;
}


/**********************************************************************
 * rating_priority
 *
 * Assign a segmentation priority based on the ratings of the blobs
 * (in that segmentation) that have been classified.  The average
 * "goodness" (i.e. rating / weight) for each blob is used to indicate
 * the segmentation priority.
 *
 * The state is expanded with bin_to_pieces() into a zero-terminated
 * list of chunk counts, one per blob.  For each blob the entry for
 * (first_chunk, last_chunk) is fetched from chunks_record->ratings:
 *
 *   - NIL             the function returns BAD_RATING immediately;
 *   - NOT_CLASSIFIED  the blob is skipped;
 *   - otherwise       best_probability() of the entry is added to the
 *                     rating total and the weights of all of the
 *                     blob's chunks are added to the weight total.
 *
 * Each term is truncated to INT16 before it is added, but the running
 * totals are kept in plain ints so that a word with many classified
 * blobs cannot wrap a 16-bit sum.
 *
 * old_state is accepted for interface compatibility; it is not used.
 *
 * Returns rating total / weight total, with a weight total of zero or
 * less replaced by 1 so the division is always defined.
 **********************************************************************/
FLOAT32 rating_priority(CHUNKS_RECORD *chunks_record,
                        STATE *state,
                        STATE *old_state,
                        int num_joints) {
  PIECES_STATE blob_chunks;
  INT16 x;
  INT16 y;
  CHOICES this_choice;
  INT16 first_chunk = 0;
  INT16 last_chunk;
  int ratings = 0;               /* wide accumulator: avoids INT16 wrap */
  int weights = 0;               /* wide accumulator: avoids INT16 wrap */

  bin_to_pieces(state, num_joints, blob_chunks);

  for (x = 0; blob_chunks[x]; x++) {
                                 // Iterate each blob
    last_chunk = first_chunk + blob_chunks[x] - 1;

    this_choice = matrix_get (chunks_record->ratings,
      first_chunk, last_chunk);

    if (this_choice == NIL)
      return (BAD_RATING);

    if (this_choice != NOT_CLASSIFIED) {
      /* Per-term truncation to INT16 is kept as in the original. */
      ratings += (INT16) best_probability (this_choice);
      for (y = first_chunk; y <= last_chunk; y++) {
        weights += (INT16) (chunks_record->weights[y]);
      }
    }
    first_chunk += blob_chunks[x];
  }

  /* Nothing classified (or non-positive weights): avoid dividing by 0. */
  if (weights <= 0)
    weights = 1;
  return ((FLOAT32) ratings / weights);
}


/**********************************************************************
 * state_char_widths
 *
 * Build the width record describing the blobs that the given
 * segmentation state produces.
 *
 * The state is converted with bin_to_chunks().  Element 0 of that
 * result is read as a count; the record holds one more blob than that
 * count.  The record is laid out as
 *
 *   widths[2 * k]      width of blob k, from chunks_width()
 *   widths[2 * k + 1]  gap after blob k, from chunks_gap()
 *                      (stored for every blob except the last)
 *
 * The last blob always ends at chunk index num_joints.
 *
 * The converted state is handed back through *search_state and the
 * record is returned; both were allocated here and must be released
 * by the caller (width_priority() uses memfree() and free_widths()).
 **********************************************************************/
WIDTH_RECORD *state_char_widths(WIDTH_RECORD *chunk_widths,
                                STATE *state,
                                int num_joints,
                                SEARCH_STATE *search_state) {
  SEARCH_STATE new_chunks = bin_to_chunks (state, num_joints);
  int num_blobs = new_chunks[0] + 1;
  WIDTH_RECORD *width_record;
  int first;                     /* first chunk of the current blob */
  int last;                      /* last chunk of the current blob */
  int blob;

  width_record = (WIDTH_RECORD *) memalloc (sizeof (int) * num_blobs * 2);
  width_record->num_chars = num_blobs;

  first = 0;
  for (blob = 0; blob < num_blobs; blob++) {
    /* Every blob but the final one is followed by a gap entry. */
    int has_gap = (blob < new_chunks[0]);

    if (has_gap)
      last = first + new_chunks[blob + 1];
    else
      last = num_joints;

    width_record->widths[2 * blob] = chunks_width (chunk_widths, first, last);

    if (has_gap)
      width_record->widths[2 * blob + 1] = chunks_gap (chunk_widths, last);

    first = last + 1;
  }

  *search_state = new_chunks;
  return (width_record);
}


/**********************************************************************
 * width_priority
 *
 * Score a word segmentation by how wide its blobs are.
 *
 * The blob widths for the state are obtained from state_char_widths().
 * Each width is divided by BASELINE_SCALE when baseline_enable is set,
 * otherwise by chunks_record->row->lineheight.  Whenever the resulting
 * ratio exceeds MAX_SQUAT, the excess over MAX_SQUAT is added to the
 * score.
 *
 * The temporary state and width record are released before returning.
 * Returns the accumulated excess (0.0 when no blob is too wide).
 **********************************************************************/
FLOAT32 width_priority(CHUNKS_RECORD *chunks_record,
                       STATE *state,
                       int num_joints) {
  SEARCH_STATE chunk_counts;
  WIDTH_RECORD *char_widths =
    state_char_widths (chunks_record->chunk_widths, state, num_joints,
                       &chunk_counts);
  FLOAT32 penalty = 0.0;
  int blob;

  for (blob = 0; blob < char_widths->num_chars; blob++) {
    /* Even slots of widths[] hold blob widths; odd slots hold gaps. */
    FLOAT32 squat = char_widths->widths[2 * blob];

    if (baseline_enable) {
      squat /= BASELINE_SCALE;
    }
    else {
      squat /= chunks_record->row->lineheight;
    }

    if (squat > MAX_SQUAT)
      penalty += squat - MAX_SQUAT;
  }

  memfree(chunk_counts);
  free_widths(char_widths);

  return (penalty);
}
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`/* -*-C-*-`
			`********************************************************************************`
			`*`
			`* File: heuristic.c (Formerly heuristic.c)`
			`* Description:`
			`* Author: Mark Seaman, OCR Technology`
			`* Created: Fri Oct 16 14:37:00 1987`
			`* Modified: Wed Jul 10 14:15:08 1991 (Mark Seaman) marks@hpgrlt`
			`* Language: C`
			`* Package: N/A`
			`* Status: Reusable Software Component`
			`*`
			`* (c) Copyright 1987, Hewlett-Packard Company.`
			`** Licensed under the Apache License, Version 2.0 (the "License");`
			`** you may not use this file except in compliance with the License.`
			`** You may obtain a copy of the License at`
			`** http://www.apache.org/licenses/LICENSE-2.0`
			`** Unless required by applicable law or agreed to in writing, software`
			`** distributed under the License is distributed on an "AS IS" BASIS,`
			`** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`** See the License for the specific language governing permissions and`
			`** limitations under the License.`
			`*`
			`*********************************************************************************/`
			`/*----------------------------------------------------------------------`
			`I n c l u d e s`
			`----------------------------------------------------------------------*/`
			`#include "heuristic.h"`
			`#include "baseline.h"`
			`#include "metrics.h"`
			`#include "freelist.h"`
			`#include <math.h>`

			`/*----------------------------------------------------------------------`
			`M a c r o s`
			`----------------------------------------------------------------------*/`
			`#define MAX_SQUAT 2.0 /* Width ratio */`
			`#define BAD_RATING 1000.0 /* No valid blob */`

			`/*----------------------------------------------------------------------`
			`F u n c t i o n s`
			`----------------------------------------------------------------------*/`
			`/**********************************************************************`
			`* prioritize_state`
			`*`
			`* Create a priority for this state. It represents the urgency of`
			`* checking this state.`
			`**********************************************************************/`
			`FLOAT32 prioritize_state(CHUNKS_RECORD *chunks_record,`
			`SEARCH_RECORD *the_search,`
			`STATE *old_state) {`
			`FLOAT32 width_pri;`
			`FLOAT32 match_pri;`

			`match_pri = rating_priority (chunks_record, the_search->this_state,`
			`old_state, the_search->num_joints);`

			`width_pri = width_priority (chunks_record, the_search->this_state,`
			`the_search->num_joints) * 1000.0;`

			`record_priorities(the_search, old_state, match_pri, width_pri);`

			`return (width_pri + match_pri);`
			`}`


			`/**********************************************************************`
			`* rating_priority`
			`*`
			`* Assign a segmentation priority based on the ratings of the blobs`
			`* (in that segmentation) that have been classified. The average`
			`* "goodness" (i.e. rating / weight) for each blob is used to indicate`
			`* the segmentation priority.`
			`**********************************************************************/`
			`FLOAT32 rating_priority(CHUNKS_RECORD *chunks_record,`
			`STATE *state,`
			`STATE *old_state,`
			`int num_joints) {`
			`PIECES_STATE blob_chunks;`
			`INT16 x;`
			`INT16 y;`
			`CHOICES this_choice;`
			`INT16 first_chunk = 0;`
			`INT16 last_chunk;`
			`INT16 ratings = 0;`
			`INT16 weights = 0;`

			`bin_to_pieces(state, num_joints, blob_chunks);`

			`for (x = 0; blob_chunks[x]; x++) {`
			`// Iterate each blob`
			`last_chunk = first_chunk + blob_chunks[x] - 1;`

			`this_choice = matrix_get (chunks_record->ratings,`
			`first_chunk, last_chunk);`

			`if (this_choice == NIL)`
			`return (BAD_RATING);`

			`if (this_choice != NOT_CLASSIFIED) {`

			`ratings += (INT16) best_probability (this_choice);`
			`for (y = first_chunk; y <= last_chunk; y++) {`
			`weights += (INT16) (chunks_record->weights[y]);`
			`}`
			`}`
			`first_chunk += blob_chunks[x];`
			`}`
			`if (weights <= 0)`
			`weights = 1;`
			`return ((FLOAT32) ratings / weights);`
			`}`


			`/**********************************************************************`
			`* state_char_widths`
			`*`
			`* Return a character width record corresponding to the character`
			`* width that will be generated in this segmentation state.`
			`**********************************************************************/`
			`WIDTH_RECORD *state_char_widths(WIDTH_RECORD *chunk_widths,`
			`STATE *state,`
			`int num_joints,`
			`SEARCH_STATE *search_state) {`
			`WIDTH_RECORD *width_record;`
			`int num_blobs;`
			`int x;`
			`int y;`
			`int i;`
			`SEARCH_STATE new_chunks;`

			`new_chunks = bin_to_chunks (state, num_joints);`

			`num_blobs = new_chunks[0] + 1;`
			`width_record = (WIDTH_RECORD *) memalloc (sizeof (int) * num_blobs * 2);`
			`width_record->num_chars = num_blobs;`

			`x = 0;`
			`for (i = 1; i <= new_chunks[0] + 1; i++) {`
			`if (i > new_chunks[0])`
			`y = num_joints;`
			`else`
			`y = x + new_chunks[i];`

			`width_record->widths[2 * i - 2] = chunks_width (chunk_widths, x, y);`

			`if (i <= new_chunks[0])`
			`width_record->widths[2 * i - 1] = chunks_gap (chunk_widths, y);`

			`x = y + 1;`
			`}`

			`*search_state = new_chunks;`
			`return (width_record);`
			`}`


			`/**********************************************************************`
			`* width_priority`
			`*`
			`* Return a priority value for this word segmentation based on the`
			`* character widths present in the new segmentation.`
			`**********************************************************************/`
			`FLOAT32 width_priority(CHUNKS_RECORD *chunks_record,`
			`STATE *state,`
			`int num_joints) {`
			`SEARCH_STATE new_chunks;`
			`FLOAT32 result = 0.0;`
			`WIDTH_RECORD *width_record;`
			`FLOAT32 squat;`
			`int x;`

			`width_record = state_char_widths (chunks_record->chunk_widths,`
			`state, num_joints, &new_chunks);`
			`for (x = 0; x < width_record->num_chars; x++) {`

			`squat = width_record->widths[2 * x];`
			`if (!baseline_enable) {`
			`squat /= chunks_record->row->lineheight;`
			`}`
			`else {`
			`squat /= BASELINE_SCALE;`
			`}`

			`if (squat > MAX_SQUAT)`
			`result += squat - MAX_SQUAT;`

			`}`

			`memfree(new_chunks);`
			`free_widths(width_record);`

			`return (result);`
			`}`