mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-07 10:17:50 +08:00
4523ce9f7d
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20
98 lines
4.3 KiB
C++
98 lines
4.3 KiB
C++
/**********************************************************************
|
|
* File: cube_utils.h
|
|
* Description: Declaration of the Cube Utilities Class
|
|
* Author: Ahmad Abdulkader
|
|
* Created: 2008
|
|
*
|
|
*(C) Copyright 2008, Google Inc.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
*
|
|
**********************************************************************/
|
|
|
|
// The CubeUtils class provides miscellaneous utility and helper functions
|
|
// to the rest of the Cube Engine
|
|
|
|
#ifndef CUBE_UTILS_H
|
|
#define CUBE_UTILS_H
|
|
|
|
#include <vector>
|
|
#include <string>
|
|
|
|
#include "allheaders.h"
|
|
#include "const.h"
|
|
#include "char_set.h"
|
|
#include "char_samp.h"
|
|
#include "img.h"
|
|
|
|
namespace tesseract {
|
|
class CubeUtils {
|
|
public:
|
|
CubeUtils();
|
|
~CubeUtils();
|
|
|
|
// Converts a probability value to a cost by getting the -log() of the
|
|
// probability value to a known base
|
|
static int Prob2Cost(double prob_val);
|
|
// Converts a cost to probability by getting the exp(-normalized cost)
|
|
static double Cost2Prob(int cost);
|
|
// Computes the length of a 32-bit char buffer
|
|
static int StrLen(const char_32 *str);
|
|
// Compares two 32-bit char buffers
|
|
static int StrCmp(const char_32 *str1, const char_32 *str2);
|
|
// Duplicates a 32-bit char buffer
|
|
static char_32 *StrDup(const char_32 *str);
|
|
// Creates a CharSamp from an IMAGE and a bounding box
|
|
static CharSamp *CharSampleFromImg(IMAGE *img,
|
|
int left, int top, int wid, int hgt);
|
|
// Creates a CharSamp from an Pix and a bounding box
|
|
static CharSamp *CharSampleFromPix(Pix *pix,
|
|
int left, int top, int wid, int hgt);
|
|
// Creates an IMAGE from a CharSamp
|
|
static IMAGE *ImageFromCharSample(CharSamp *char_samp);
|
|
// Creates a Pix from a CharSamp
|
|
static Pix *PixFromCharSample(CharSamp *char_samp);
|
|
// read the contents of a file to a string
|
|
static bool ReadFileToString(const string &file_name, string *str);
|
|
// split a string into vectors using any of the specified delimiters
|
|
static void SplitStringUsing(const string &str, const string &delims,
|
|
vector<string> *str_vec);
|
|
// UTF-8 to UTF-32 convesion functions
|
|
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32);
|
|
static void UTF32ToUTF8(const char_32 *utf32_str, string *str);
|
|
// Returns true if input word has either 1) all-one-case, or 2)
|
|
// first character upper-case, and remaining characters lower-case.
|
|
// If char_set and unicharset are not NULL, uses tesseract's unicharset
|
|
// functions to determine case properties. Otherwise, uses
|
|
// C-locale-dependent functions, which may be unreliable on
|
|
// non-ASCII characters.
|
|
static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set,
|
|
UNICHARSET *unicharset);
|
|
// Returns char_32 pointer to the lower-case-transformed version of
|
|
// the input string or NULL on error. If char_set or unicharset are
|
|
// NULL, or tesseract and cube do not share unicharsets, returns
|
|
// NULL. Return array must be freed by caller.
|
|
static char_32 *ToLower(const char_32 *str32, CharSet *char_set,
|
|
UNICHARSET *unicharset);
|
|
// Returns char_32 pointer to the upper-case-transformed version of
|
|
// the input string or NULL on error. If char_set or unicharset are
|
|
// NULL, or tesseract and cube do not share unicharsets, returns
|
|
// NULL. Return array must be freed by caller.
|
|
static char_32 *ToUpper(const char_32 *str32, CharSet *char_set,
|
|
UNICHARSET *unicharset);
|
|
private:
|
|
static unsigned char *GetImageData(IMAGE *img,
|
|
int left, int top, int wid, int hgt);
|
|
static unsigned char *GetImageData(Pix *pix,
|
|
int left, int top, int wid, int hgt);
|
|
};
|
|
} // namespace tesseract
|
|
#endif // CUBE_UTILS_H
|