mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-07 18:27:48 +08:00
32d179e0a6
Signed-off-by: Stefan Weil <sw@weilnetz.de>
167 lines
5.8 KiB
C++
167 lines
5.8 KiB
C++
/**********************************************************************
|
|
* File: char_samp.h
|
|
* Description: Declaration of a Character Bitmap Sample Class
|
|
* Author: Ahmad Abdulkader
|
|
* Created: 2007
|
|
*
|
|
* (C) Copyright 2008, Google Inc.
|
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
|
** you may not use this file except in compliance with the License.
|
|
** You may obtain a copy of the License at
|
|
** http://www.apache.org/licenses/LICENSE-2.0
|
|
** Unless required by applicable law or agreed to in writing, software
|
|
** distributed under the License is distributed on an "AS IS" BASIS,
|
|
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
** See the License for the specific language governing permissions and
|
|
** limitations under the License.
|
|
*
|
|
**********************************************************************/
|
|
|
|
// The CharSamp inherits the Bmp8 class that represents images of
|
|
// words, characters and segments throughout Cube
|
|
// CharSamp adds more data members to hold the physical location of the image
|
|
// in a page, page number in a book if available.
|
|
// It also holds the label (GT) of the image that might correspond to a single
|
|
// character or a word
|
|
// It also provides methods for segmenting, scaling and cropping of the sample
|
|
|
|
#ifndef CHAR_SAMP_H
|
|
#define CHAR_SAMP_H
|
|
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <string>
|
|
#include "bmp_8.h"
|
|
#include "string_32.h"
|
|
|
|
namespace tesseract {
|
|
|
|
class CharSamp : public Bmp8 {
|
|
public:
|
|
CharSamp();
|
|
CharSamp(int wid, int hgt);
|
|
CharSamp(int left, int top, int wid, int hgt);
|
|
~CharSamp();
|
|
// accessor methods
|
|
unsigned short Left() const { return left_; }
|
|
unsigned short Right() const { return left_ + wid_; }
|
|
unsigned short Top() const { return top_; }
|
|
unsigned short Bottom() const { return top_ + hgt_; }
|
|
unsigned short Page() const { return page_; }
|
|
unsigned short NormTop() const { return norm_top_; }
|
|
unsigned short NormBottom() const { return norm_bottom_; }
|
|
unsigned short NormAspectRatio() const { return norm_aspect_ratio_; }
|
|
unsigned short FirstChar() const { return first_char_; }
|
|
unsigned short LastChar() const { return last_char_; }
|
|
char_32 Label() const {
|
|
if (label32_ == NULL || LabelLen() != 1) {
|
|
return 0;
|
|
}
|
|
return label32_[0];
|
|
}
|
|
char_32 * StrLabel() const { return label32_; }
|
|
string stringLabel() const;
|
|
|
|
void SetLeft(unsigned short left) { left_ = left; }
|
|
void SetTop(unsigned short top) { top_ = top; }
|
|
void SetPage(unsigned short page) { page_ = page; }
|
|
void SetLabel(char_32 label) {
|
|
if (label32_ != NULL) {
|
|
delete []label32_;
|
|
}
|
|
label32_ = new char_32[2];
|
|
if (label32_ != NULL) {
|
|
label32_[0] = label;
|
|
label32_[1] = 0;
|
|
}
|
|
}
|
|
void SetLabel(const char_32 *label32) {
|
|
if (label32_ != NULL) {
|
|
delete []label32_;
|
|
label32_ = NULL;
|
|
}
|
|
if (label32 != NULL) {
|
|
// remove any byte order marks if any
|
|
if (label32[0] == 0xfeff) {
|
|
label32++;
|
|
}
|
|
int len = LabelLen(label32);
|
|
label32_ = new char_32[len + 1];
|
|
if (label32_ != NULL) {
|
|
memcpy(label32_, label32, len * sizeof(*label32));
|
|
label32_[len] = 0;
|
|
}
|
|
}
|
|
}
|
|
void SetLabel(string str);
|
|
void SetNormTop(unsigned short norm_top) { norm_top_ = norm_top; }
|
|
void SetNormBottom(unsigned short norm_bottom) {
|
|
norm_bottom_ = norm_bottom;
|
|
}
|
|
void SetNormAspectRatio(unsigned short norm_aspect_ratio) {
|
|
norm_aspect_ratio_ = norm_aspect_ratio;
|
|
}
|
|
void SetFirstChar(unsigned short first_char) {
|
|
first_char_ = first_char;
|
|
}
|
|
void SetLastChar(unsigned short last_char) {
|
|
last_char_ = last_char;
|
|
}
|
|
|
|
// Saves the charsamp to a dump file
|
|
bool Save2CharDumpFile(FILE *fp) const;
|
|
// Crops the underlying image and returns a new CharSamp with the
|
|
// same character information but new dimensions. Warning: does not
|
|
// necessarily set the normalized top and bottom correctly since
|
|
// those depend on its location within the word (or CubeSearchObject).
|
|
CharSamp *Crop();
|
|
// Computes the connected components of the char sample
|
|
ConComp **Segment(int *seg_cnt, bool right_2_left, int max_hist_wnd,
|
|
int min_con_comp_size) const;
|
|
// returns a copy of the charsamp that is scaled to the
|
|
// specified width and height
|
|
CharSamp *Scale(int wid, int hgt, bool isotropic = true);
|
|
// returns a Clone of the charsample
|
|
CharSamp *Clone() const;
|
|
// computes the features corresponding to the char sample
|
|
bool ComputeFeatures(int conv_grid_size, float *features);
|
|
// Load a Char Samp from a dump file
|
|
static CharSamp *FromCharDumpFile(CachedFile *fp);
|
|
static CharSamp *FromCharDumpFile(FILE *fp);
|
|
static CharSamp *FromCharDumpFile(unsigned char **raw_data);
|
|
static CharSamp *FromRawData(int left, int top, int wid, int hgt,
|
|
unsigned char *data);
|
|
static CharSamp *FromConComps(ConComp **concomp_array,
|
|
int strt_concomp, int seg_flags_size,
|
|
int *seg_flags, bool *left_most,
|
|
bool *right_most, int word_hgt);
|
|
static int AuxFeatureCnt() { return (5); }
|
|
// Return the length of the label string
|
|
int LabelLen() const { return LabelLen(label32_); }
|
|
static int LabelLen(const char_32 *label32) {
|
|
if (label32 == NULL) {
|
|
return 0;
|
|
}
|
|
int len = 0;
|
|
while (label32[++len] != 0);
|
|
return len;
|
|
}
|
|
private:
|
|
char_32 * label32_;
|
|
unsigned short page_;
|
|
unsigned short left_;
|
|
unsigned short top_;
|
|
// top of sample normalized to a word height of 255
|
|
unsigned short norm_top_;
|
|
// bottom of sample normalized to a word height of 255
|
|
unsigned short norm_bottom_;
|
|
// 255 * ratio of character width to (width + height)
|
|
unsigned short norm_aspect_ratio_;
|
|
unsigned short first_char_;
|
|
unsigned short last_char_;
|
|
};
|
|
|
|
}
|
|
|
|
#endif // CHAR_SAMP_H
|