mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 06:30:14 +08:00
3a13d80d24
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@293 d0cd1f9f-072b-0410-8dd7-cf729c803f20
121 lines
4.7 KiB
C++
121 lines
4.7 KiB
C++
///////////////////////////////////////////////////////////////////////
// File:        conversion.cpp
// Description: Collection of utility functions for A_CHOICE conversions.
//              TODO(daria): delete this file when conversion to unichar_ids
//              is finished and all permuters are completely updated/replaced.
// Author:      Daria Antonova
// Created:     Mon Jun 23 11:26:43 PDT 2008
//
// (C) Copyright 2007, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
|
|
|
|
#include "conversion.h"
|
|
|
|
#include "callcpp.h"
|
|
#include "choicearr.h"
|
|
#include "choices.h"
|
|
#include "dict.h"
|
|
#include "ratngs.h"
|
|
#include "stopper.h"
|
|
#include "unicharset.h"
|
|
|
|
namespace tesseract {
|
|
int Dict::valid_word(const char *string) {
|
|
WERD_CHOICE word(string, getUnicharset());
|
|
return valid_word(word);
|
|
}
|
|
|
|
void Dict::LogNewWordChoice(A_CHOICE *a_choice,
|
|
FLOAT32 adjust_factor,
|
|
const float certainties[],
|
|
const UNICHARSET &unicharset) {
|
|
WERD_CHOICE word_choice(strlen(a_choice->lengths));
|
|
convert_to_word_choice(a_choice, unicharset, &word_choice);
|
|
LogNewChoice(word_choice, adjust_factor, certainties, false);
|
|
}
|
|
} // namespace tesseract
|
|
|
|
// Fills in the given WERD_CHOICE with information from the given A_CHOICE.
|
|
// Assumes that word_choice pointer is not NULL.
|
|
void convert_to_word_choice(const A_CHOICE *a_choice,
|
|
const UNICHARSET ¤t_unicharset,
|
|
WERD_CHOICE *word_choice) {
|
|
if (a_choice == NULL) return;
|
|
const char *string = a_choice->string;
|
|
const char *lengths = a_choice->lengths;
|
|
const char *fragment_lengths = a_choice->fragment_lengths;
|
|
int offset = 0;
|
|
for (int x = 0; x < strlen(a_choice->lengths); ++x) {
|
|
UNICHAR_ID unichar_id =
|
|
current_unicharset.unichar_to_id(string + offset, lengths[x]);
|
|
word_choice->append_unichar_id(unichar_id, fragment_lengths[x], 0.0, 0.0);
|
|
offset += lengths[x];
|
|
}
|
|
word_choice->set_rating(a_choice->rating);
|
|
word_choice->set_certainty(a_choice->certainty);
|
|
word_choice->set_permuter(a_choice->permuter);
|
|
word_choice->set_fragment_mark(a_choice->fragment_mark);
|
|
}
|
|
|
|
// Returns the best of two choices and deletes the other (worse) choice.
|
|
// A choice is better if it has a non-empty string and has a lower
|
|
// rating than the other choice. If the ratings are the same,
|
|
// a_choice is preferred over choice.
|
|
// If the best choice is in the A_CHOICE form, copies it to a new
|
|
// WERD_CHOICE and deletes A_CHOICE.
|
|
WERD_CHOICE *get_best_delete_other(const UNICHARSET ¤t_unicharset,
|
|
WERD_CHOICE *choice,
|
|
A_CHOICE *a_choice) {
|
|
if (!a_choice) return choice;
|
|
if (choice != NULL &&
|
|
(choice->rating() < a_choice->rating || a_choice->string == NULL)) {
|
|
free_choice(a_choice);
|
|
return choice;
|
|
} else {
|
|
delete choice;
|
|
WERD_CHOICE *word_choice = new WERD_CHOICE();
|
|
convert_to_word_choice(a_choice, current_unicharset, word_choice);
|
|
free_choice(a_choice);
|
|
return word_choice;
|
|
}
|
|
}
|
|
|
|
// Convert BLOB_CHOICE_LIST_VECTOR to CHOICES_LIST.
|
|
// The caller is responsible for deleting the returned CHOICES_LIST.
|
|
CHOICES_LIST convert_to_choices_list(
|
|
const BLOB_CHOICE_LIST_VECTOR &char_choices,
|
|
const UNICHARSET ¤t_unicharset) {
|
|
CHOICES_LIST old_char_choices = new_choice_list();
|
|
int x;
|
|
BLOB_CHOICE_IT it;
|
|
BLOB_CHOICE *blob_choice;
|
|
char choice_lengths[2] = {0, 0};
|
|
char unichar[UNICHAR_LEN + 1];
|
|
for (x = 0; x < char_choices.length(); ++x) {
|
|
it.set_to_list(char_choices.get(x));
|
|
LIST result = NIL;
|
|
for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
|
|
blob_choice = it.data();
|
|
strcpy(unichar,
|
|
current_unicharset.id_to_unichar(blob_choice->unichar_id()));
|
|
choice_lengths[0] = strlen(unichar);
|
|
result = append_char_choice(result, unichar, choice_lengths,
|
|
blob_choice->rating(),
|
|
blob_choice->certainty(),
|
|
blob_choice->config(), NULL);
|
|
}
|
|
old_char_choices = array_push(old_char_choices, result);
|
|
}
|
|
return old_char_choices;
|
|
}
|