mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-27 20:59:36 +08:00
73adf693d5
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@654 d0cd1f9f-072b-0410-8dd7-cf729c803f20
90 lines
2.8 KiB
C++
90 lines
2.8 KiB
C++
/**********************************************************************
 * File:        char_bigrams.h
 * Description: Declaration of a Character Bigrams Class
 * Author:      Ahmad Abdulkader
 * Created:     2007
 *
 * (C) Copyright 2008, Google Inc.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/
|
|
|
|
// The CharBigrams class represents the interface to the character bigram
// table used by Cube.
// A CharBigrams object can be constructed from the Char Bigrams file.
// Given a sequence of characters, the "Cost" method returns the Char Bigram
// cost of the string according to the table.
|
|
|
|
#ifndef CHAR_BIGRAMS_H
|
|
#define CHAR_BIGRAMS_H
|
|
|
|
#include <string>
|
|
#include "char_set.h"
|
|
|
|
namespace tesseract {
|
|
|
|
// A single bigram value: one entry of the character bigram table.
// Kept as a plain aggregate (no constructors or default member initializers)
// so existing brace-initialization and value-initialization keep working.
struct Bigram {
  // Count associated with this bigram.
  // NOTE(review): presumably the number of occurrences seen in the bigram
  // data file -- confirm against the loader in the implementation file.
  int cnt;
  // Cost associated with this bigram.
  // NOTE(review): presumably derived from cnt when the table is built --
  // confirm against the implementation file.
  int cost;
};
|
|
|
|
// structure representing the char bigram array of characters
|
|
// following a specific character
|
|
struct CharBigram {
|
|
int total_cnt;
|
|
char_32 max_char;
|
|
Bigram *bigram;
|
|
};
|
|
|
|
// structure representing the whole bigram table
|
|
struct CharBigramTable {
|
|
int total_cnt;
|
|
int worst_cost;
|
|
char_32 max_char;
|
|
CharBigram *char_bigram;
|
|
};
|
|
|
|
class CharBigrams {
|
|
public:
|
|
CharBigrams();
|
|
~CharBigrams();
|
|
// Construct the CharBigrams class from a file
|
|
static CharBigrams *Create(const string &data_file_path,
|
|
const string &lang);
|
|
// Top-level function to return the mean character bigram cost of a
|
|
// sequence of characters. If char_set is not NULL, use
|
|
// tesseract functions to return a case-invariant cost.
|
|
// This avoids unnecessarily penalizing all-one-case words or
|
|
// capitalized words (first-letter upper-case and remaining letters
|
|
// lower-case).
|
|
int Cost(const char_32 *str, CharSet *char_set) const;
|
|
|
|
protected:
|
|
// Returns the character bigram cost of two characters.
|
|
int PairCost(char_32 ch1, char_32 ch2) const;
|
|
// Returns the mean character bigram cost of a sequence of
|
|
// characters. Adds a space at the beginning and end to account for
|
|
// cost of starting and ending characters.
|
|
int MeanCostWithSpaces(const char_32 *char_32_ptr) const;
|
|
|
|
private:
|
|
// Only words this length or greater qualify for case-invariant character
|
|
// bigram cost.
|
|
static const int kMinLengthCaseInvariant = 4;
|
|
|
|
|
|
CharBigramTable bigram_table_;
|
|
};
|
|
}
|
|
|
|
#endif // CHAR_BIGRAMS_H
|