mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-03 00:49:01 +08:00
Replace remaining STRING by std::string in src/ccmain
Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
parent
d7823a71c2
commit
db9f963411
@ -1658,7 +1658,7 @@ ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const UNICHARSET &char_se
|
||||
|
||||
/* Single Leading punctuation char*/
|
||||
|
||||
if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
|
||||
if (s[offset] != '\0' && chs_leading_punct.contains(s[offset]))
|
||||
offset += lengths[i++];
|
||||
leading_punct_count = i;
|
||||
|
||||
@ -1705,10 +1705,10 @@ Allow a single hyphen in a lower case word
|
||||
}
|
||||
|
||||
/* Up to two different, constrained trailing punctuation chars */
|
||||
if (lengths[i] == 1 && s[offset] != '\0' && STRING(chs_trailing_punct1).contains(s[offset]))
|
||||
if (lengths[i] == 1 && s[offset] != '\0' && chs_trailing_punct1.contains(s[offset]))
|
||||
offset += lengths[i++];
|
||||
if (lengths[i] == 1 && s[offset] != '\0' && i > 0 && s[offset - lengths[i - 1]] != s[offset] &&
|
||||
STRING(chs_trailing_punct2).contains(s[offset]))
|
||||
chs_trailing_punct2.contains(s[offset]))
|
||||
offset += lengths[i++];
|
||||
|
||||
if (s[offset] != '\0')
|
||||
|
@ -107,9 +107,9 @@ void Tesseract::unrej_good_chs(WERD_RES *word) {
|
||||
int16_t Tesseract::count_outline_errs(char c, int16_t outline_count) {
|
||||
int expected_outline_count;
|
||||
|
||||
if (STRING(outlines_odd).contains(c))
|
||||
if (outlines_odd.contains(c))
|
||||
return 0; // Don't use this char
|
||||
else if (STRING(outlines_2).contains(c))
|
||||
else if (outlines_2.contains(c))
|
||||
expected_outline_count = 2;
|
||||
else
|
||||
expected_outline_count = 1;
|
||||
|
@ -216,7 +216,7 @@ void EquationDetect::IdentifySpecialText(BLOBNBOX *blobnbox, const int height_th
|
||||
|
||||
BlobSpecialTextType EquationDetect::EstimateTypeForUnichar(const UNICHARSET &unicharset,
|
||||
const UNICHAR_ID id) const {
|
||||
const STRING s = unicharset.id_to_unichar(id);
|
||||
const std::string s = unicharset.id_to_unichar(id);
|
||||
if (unicharset.get_isalpha(id)) {
|
||||
return BSTT_NONE;
|
||||
}
|
||||
@ -237,8 +237,8 @@ BlobSpecialTextType EquationDetect::EstimateTypeForUnichar(const UNICHARSET &uni
|
||||
|
||||
// Check if it is digit. In addition to the isdigit attribute, we also check
|
||||
// if this character belongs to those likely to be confused with a digit.
|
||||
static const STRING kDigitsChars = "|";
|
||||
if (unicharset.get_isdigit(id) || (s.length() == 1 && kDigitsChars.contains(s[0]))) {
|
||||
static const char kDigitsChars[] = "|";
|
||||
if (unicharset.get_isdigit(id) || (s.length() == 1 && strchr(kDigitsChars, s[0]) != nullptr)) {
|
||||
return BSTT_DIGIT;
|
||||
} else {
|
||||
return BSTT_MATH;
|
||||
@ -286,8 +286,8 @@ void EquationDetect::IdentifySpecialText() {
|
||||
lang_tesseract_->classify_integer_matcher_multiplier.set_value(classify_integer_matcher);
|
||||
|
||||
if (equationdetect_save_spt_image) { // For debug.
|
||||
STRING outfile;
|
||||
GetOutputTiffName("_spt", &outfile);
|
||||
std::string outfile;
|
||||
GetOutputTiffName("_spt", outfile);
|
||||
PaintSpecialTexts(outfile);
|
||||
}
|
||||
}
|
||||
@ -351,11 +351,11 @@ int EquationDetect::FindEquationParts(ColPartitionGrid *part_grid, ColPartitionS
|
||||
part_grid_ = part_grid;
|
||||
best_columns_ = best_columns;
|
||||
resolution_ = lang_tesseract_->source_resolution();
|
||||
STRING outfile;
|
||||
std::string outfile;
|
||||
page_count_++;
|
||||
|
||||
if (equationdetect_save_bi_image) {
|
||||
GetOutputTiffName("_bi", &outfile);
|
||||
GetOutputTiffName("_bi", outfile);
|
||||
pixWrite(outfile.c_str(), lang_tesseract_->pix_binary(), IFF_TIFF_G4);
|
||||
}
|
||||
|
||||
@ -371,7 +371,7 @@ int EquationDetect::FindEquationParts(ColPartitionGrid *part_grid, ColPartitionS
|
||||
IdentifyInlineParts();
|
||||
|
||||
if (equationdetect_save_seed_image) {
|
||||
GetOutputTiffName("_seed", &outfile);
|
||||
GetOutputTiffName("_seed", outfile);
|
||||
PaintColParts(outfile);
|
||||
}
|
||||
|
||||
@ -396,7 +396,7 @@ int EquationDetect::FindEquationParts(ColPartitionGrid *part_grid, ColPartitionS
|
||||
ProcessMathBlockSatelliteParts();
|
||||
|
||||
if (equationdetect_save_merged_image) { // For debug.
|
||||
GetOutputTiffName("_merged", &outfile);
|
||||
GetOutputTiffName("_merged", outfile);
|
||||
PaintColParts(outfile);
|
||||
}
|
||||
|
||||
@ -1383,14 +1383,14 @@ bool EquationDetect::IsNearMathNeighbor(const int y_gap, const ColPartition *nei
|
||||
return neighbor->type() == PT_EQUATION && y_gap <= kYGapTh;
|
||||
}
|
||||
|
||||
void EquationDetect::GetOutputTiffName(const char *name, STRING *image_name) const {
|
||||
ASSERT_HOST(image_name && name);
|
||||
void EquationDetect::GetOutputTiffName(const char *name, std::string &image_name) const {
|
||||
ASSERT_HOST(name);
|
||||
char page[50];
|
||||
snprintf(page, sizeof(page), "%04d", page_count_);
|
||||
*image_name = STRING(lang_tesseract_->imagebasename) + page + name + ".tif";
|
||||
image_name = (lang_tesseract_->imagebasename) + page + name + ".tif";
|
||||
}
|
||||
|
||||
void EquationDetect::PaintSpecialTexts(const STRING &outfile) const {
|
||||
void EquationDetect::PaintSpecialTexts(const std::string &outfile) const {
|
||||
Pix *pix = nullptr, *pixBi = lang_tesseract_->pix_binary();
|
||||
pix = pixConvertTo32(pixBi);
|
||||
ColPartitionGridSearch gsearch(part_grid_);
|
||||
@ -1407,7 +1407,7 @@ void EquationDetect::PaintSpecialTexts(const STRING &outfile) const {
|
||||
pixDestroy(&pix);
|
||||
}
|
||||
|
||||
void EquationDetect::PaintColParts(const STRING &outfile) const {
|
||||
void EquationDetect::PaintColParts(const std::string &outfile) const {
|
||||
Pix *pix = pixConvertTo32(lang_tesseract_->BestPix());
|
||||
ColPartitionGridSearch gsearch(part_grid_);
|
||||
gsearch.StartFullSearch();
|
||||
|
@ -202,16 +202,16 @@ protected:
|
||||
bool IsNearMathNeighbor(const int y_gap, const ColPartition *neighbor) const;
|
||||
|
||||
// Generate the tiff file name for output/debug file.
|
||||
void GetOutputTiffName(const char *name, STRING *image_name) const;
|
||||
void GetOutputTiffName(const char *name, std::string &image_name) const;
|
||||
|
||||
// Debugger function that renders ColPartitions on the input image, where:
|
||||
// parts labeled as PT_EQUATION will be painted in red, PT_INLINE_EQUATION
|
||||
// will be painted in green, and other parts will be painted in blue.
|
||||
void PaintColParts(const STRING &outfile) const;
|
||||
void PaintColParts(const std::string &outfile) const;
|
||||
|
||||
// Debugger function that renders the blobs in part_grid_ over the input
|
||||
// image.
|
||||
void PaintSpecialTexts(const STRING &outfile) const;
|
||||
void PaintSpecialTexts(const std::string &outfile) const;
|
||||
|
||||
// Debugger function that prints the math blob density values for a
|
||||
// ColPartition object.
|
||||
|
@ -37,7 +37,6 @@
|
||||
|
||||
#include <tesseract/ocrclass.h> // for ETEXT_DESC
|
||||
#include <tesseract/unichar.h> // for UNICHAR_ID
|
||||
#include "strngs.h" // for STRING
|
||||
|
||||
#include <cstdint> // for INT16_MAX, int16_t, int32_t
|
||||
|
||||
@ -288,7 +287,7 @@ int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
|
||||
((word_done && word->best_choice->unichar_lengths().c_str()[0] == 1 &&
|
||||
word->best_choice->unichar_string()[0] == '1') ||
|
||||
(!word_done &&
|
||||
STRING(conflict_set_I_l_1).contains(word->best_choice->unichar_string()[0])))))) {
|
||||
conflict_set_I_l_1.contains(word->best_choice->unichar_string()[0])))))) {
|
||||
total_score += prev_word_score;
|
||||
if (prev_word_done)
|
||||
done_word_count++;
|
||||
@ -330,7 +329,7 @@ int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
|
||||
prev_char_1 =
|
||||
((word_done && (word->best_choice->unichar_string()[offset] == '1')) ||
|
||||
(!word_done &&
|
||||
STRING(conflict_set_I_l_1).contains(word->best_choice->unichar_string()[offset])));
|
||||
conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])));
|
||||
}
|
||||
/* Find next word */
|
||||
do {
|
||||
@ -356,7 +355,7 @@ bool Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
|
||||
word->uch_set->get_isdigit(word->best_choice->unichar_string().c_str() + offset,
|
||||
word->best_choice->unichar_lengths()[i]) ||
|
||||
(word->best_choice->permuter() == NUMBER_PERM &&
|
||||
STRING(numeric_punctuation).contains(word->best_choice->unichar_string().c_str()[offset])));
|
||||
numeric_punctuation.contains(word->best_choice->unichar_string().c_str()[offset])));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -24,8 +24,6 @@
|
||||
|
||||
#include <allheaders.h>
|
||||
|
||||
#include "strngs.h"
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
LTRResultIterator::LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
|
||||
@ -46,7 +44,7 @@ LTRResultIterator::~LTRResultIterator() = default;
|
||||
char *LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const {
|
||||
if (it_->word() == nullptr)
|
||||
return nullptr; // Already at the end!
|
||||
STRING text;
|
||||
std::string text;
|
||||
PAGE_RES_IT res_it(*it_);
|
||||
WERD_CHOICE *best_choice = res_it.word()->best_choice;
|
||||
ASSERT_HOST(best_choice != nullptr);
|
||||
@ -306,7 +304,7 @@ bool LTRResultIterator::EquivalentToTruth(const char *str) const {
|
||||
char *LTRResultIterator::WordTruthUTF8Text() const {
|
||||
if (!HasTruthString())
|
||||
return nullptr;
|
||||
STRING truth_text = it_->word()->blamer_bundle->TruthString();
|
||||
std::string truth_text = it_->word()->blamer_bundle->TruthString();
|
||||
int length = truth_text.length() + 1;
|
||||
char *result = new char[length];
|
||||
strncpy(result, truth_text.c_str(), length);
|
||||
@ -318,7 +316,7 @@ char *LTRResultIterator::WordTruthUTF8Text() const {
|
||||
char *LTRResultIterator::WordNormedUTF8Text() const {
|
||||
if (it_->word() == nullptr)
|
||||
return nullptr; // Already at the end!
|
||||
STRING ocr_text;
|
||||
std::string ocr_text;
|
||||
WERD_CHOICE *best_choice = it_->word()->best_choice;
|
||||
const UNICHARSET *unicharset = it_->word()->uch_set;
|
||||
ASSERT_HOST(best_choice != nullptr);
|
||||
|
@ -32,7 +32,6 @@
|
||||
#include "ratngs.h" // for WERD_CHOICE
|
||||
#include "rect.h" // for TBOX
|
||||
#include "statistc.h" // for STATS
|
||||
#include "strngs.h" // for STRING
|
||||
#include "tprintf.h" // for tprintf
|
||||
#include "unicharset.h" // for UNICHARSET
|
||||
#include "werd.h" // for WERD, W_REP_CHAR
|
||||
@ -91,16 +90,9 @@ static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *fun
|
||||
|
||||
// =============================== Debug Code ================================
|
||||
|
||||
// Convert an integer to a decimal string.
|
||||
static STRING StrOf(int num) {
|
||||
char buffer[30];
|
||||
snprintf(buffer, sizeof(buffer), "%d", num);
|
||||
return STRING(buffer);
|
||||
}
|
||||
|
||||
// Given a row-major matrix of unicode text and a column separator, print
|
||||
// a formatted table. For ASCII, we get good column alignment.
|
||||
static void PrintTable(const std::vector<std::vector<STRING>> &rows, const STRING &colsep) {
|
||||
static void PrintTable(const std::vector<std::vector<std::string>> &rows, const char *colsep) {
|
||||
std::vector<int> max_col_widths;
|
||||
for (const auto &row : rows) {
|
||||
int num_columns = row.size();
|
||||
@ -119,56 +111,56 @@ static void PrintTable(const std::vector<std::vector<STRING>> &rows, const STRIN
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<STRING> col_width_patterns;
|
||||
std::vector<std::string> col_width_patterns;
|
||||
for (int c = 0; c < max_col_widths.size(); c++) {
|
||||
col_width_patterns.push_back(STRING("%-") + StrOf(max_col_widths[c]) + "s");
|
||||
col_width_patterns.push_back(std::string("%-") + std::to_string(max_col_widths[c]) + "s");
|
||||
}
|
||||
|
||||
for (int r = 0; r < rows.size(); r++) {
|
||||
for (int c = 0; c < rows[r].size(); c++) {
|
||||
if (c > 0)
|
||||
tprintf("%s", colsep.c_str());
|
||||
tprintf("%s", colsep);
|
||||
tprintf(col_width_patterns[c].c_str(), rows[r][c].c_str());
|
||||
}
|
||||
tprintf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
static STRING RtlEmbed(const STRING &word, bool rtlify) {
|
||||
static std::string RtlEmbed(const std::string &word, bool rtlify) {
|
||||
if (rtlify)
|
||||
return STRING(kRLE) + word + STRING(kPDF);
|
||||
return std::string(kRLE) + word + std::string(kPDF);
|
||||
return word;
|
||||
}
|
||||
|
||||
// Print the current thoughts of the paragraph detector.
|
||||
static void PrintDetectorState(const ParagraphTheory &theory,
|
||||
const GenericVector<RowScratchRegisters> &rows) {
|
||||
std::vector<std::vector<STRING>> output;
|
||||
output.push_back(std::vector<STRING>());
|
||||
std::vector<std::vector<std::string>> output;
|
||||
output.push_back(std::vector<std::string>());
|
||||
output.back().push_back("#row");
|
||||
output.back().push_back("space");
|
||||
output.back().push_back("..");
|
||||
output.back().push_back("lword[widthSEL]");
|
||||
output.back().push_back("rword[widthSEL]");
|
||||
RowScratchRegisters::AppendDebugHeaderFields(&output.back());
|
||||
RowScratchRegisters::AppendDebugHeaderFields(output.back());
|
||||
output.back().push_back("text");
|
||||
|
||||
for (int i = 0; i < rows.size(); i++) {
|
||||
output.push_back(std::vector<STRING>());
|
||||
std::vector<STRING> &row = output.back();
|
||||
output.push_back(std::vector<std::string>());
|
||||
std::vector<std::string> &row = output.back();
|
||||
const RowInfo &ri = *rows[i].ri_;
|
||||
row.push_back(StrOf(i));
|
||||
row.push_back(StrOf(ri.average_interword_space));
|
||||
row.push_back(std::to_string(i));
|
||||
row.push_back(std::to_string(ri.average_interword_space));
|
||||
row.push_back(ri.has_leaders ? ".." : " ");
|
||||
row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) + "[" + StrOf(ri.lword_box.width()) +
|
||||
row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) + "[" + std::to_string(ri.lword_box.width()) +
|
||||
(ri.lword_likely_starts_idea ? "S" : "s") +
|
||||
(ri.lword_likely_ends_idea ? "E" : "e") +
|
||||
(ri.lword_indicates_list_item ? "L" : "l") + "]");
|
||||
row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) + "[" + StrOf(ri.rword_box.width()) +
|
||||
row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) + "[" + std::to_string(ri.rword_box.width()) +
|
||||
(ri.rword_likely_starts_idea ? "S" : "s") +
|
||||
(ri.rword_likely_ends_idea ? "E" : "e") +
|
||||
(ri.rword_indicates_list_item ? "L" : "l") + "]");
|
||||
rows[i].AppendDebugInfo(theory, &row);
|
||||
rows[i].AppendDebugInfo(theory, row);
|
||||
row.push_back(RtlEmbed(ri.text, !ri.ltr));
|
||||
}
|
||||
PrintTable(output, " ");
|
||||
@ -180,11 +172,11 @@ static void PrintDetectorState(const ParagraphTheory &theory,
|
||||
}
|
||||
}
|
||||
|
||||
static void DebugDump(bool should_print, const STRING &phase, const ParagraphTheory &theory,
|
||||
static void DebugDump(bool should_print, const char *phase, const ParagraphTheory &theory,
|
||||
const GenericVector<RowScratchRegisters> &rows) {
|
||||
if (!should_print)
|
||||
return;
|
||||
tprintf("# %s\n", phase.c_str());
|
||||
tprintf("# %s\n", phase);
|
||||
PrintDetectorState(theory, rows);
|
||||
}
|
||||
|
||||
@ -240,7 +232,7 @@ static const char *SkipOne(const char *str, const char *toskip) {
|
||||
// Return whether it is very likely that this is a numeral marker that could
|
||||
// start a list item. Some examples include:
|
||||
// A I iii. VI (2) 3.5. [C-4]
|
||||
static bool LikelyListNumeral(const STRING &word) {
|
||||
static bool LikelyListNumeral(const std::string &word) {
|
||||
const char *kRomans = "ivxlmdIVXLMD";
|
||||
const char *kDigits = "012345789";
|
||||
const char *kOpen = "[{(";
|
||||
@ -274,12 +266,12 @@ static bool LikelyListNumeral(const STRING &word) {
|
||||
return *pos == '\0';
|
||||
}
|
||||
|
||||
static bool LikelyListMark(const STRING &word) {
|
||||
static bool LikelyListMark(const std::string &word) {
|
||||
const char *kListMarks = "0Oo*.,+.";
|
||||
return word.size() == 1 && strchr(kListMarks, word[0]) != nullptr;
|
||||
}
|
||||
|
||||
bool AsciiLikelyListItem(const STRING &word) {
|
||||
bool AsciiLikelyListItem(const std::string &word) {
|
||||
return LikelyListMark(word) || LikelyListNumeral(word);
|
||||
}
|
||||
|
||||
@ -348,7 +340,7 @@ int UnicodeSpanSkipper::SkipAlpha(int pos) {
|
||||
|
||||
static bool LikelyListMarkUnicode(int ch) {
|
||||
if (ch < 0x80) {
|
||||
STRING single_ch;
|
||||
std::string single_ch;
|
||||
single_ch += ch;
|
||||
return LikelyListMark(single_ch);
|
||||
}
|
||||
@ -413,7 +405,7 @@ static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
|
||||
// is_list - this word might be a list number or bullet.
|
||||
// starts_idea - this word is likely to start a sentence.
|
||||
// ends_idea - this word is likely to end a sentence.
|
||||
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8,
|
||||
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
|
||||
bool *is_list, bool *starts_idea, bool *ends_idea) {
|
||||
*is_list = false;
|
||||
*starts_idea = false;
|
||||
@ -459,7 +451,7 @@ void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, c
|
||||
// is_list - this word might be a list number or bullet.
|
||||
// starts_idea - this word is likely to start a sentence.
|
||||
// ends_idea - this word is likely to end a sentence.
|
||||
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8,
|
||||
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
|
||||
bool *is_list, bool *starts_idea, bool *ends_idea) {
|
||||
*is_list = false;
|
||||
*starts_idea = false;
|
||||
@ -492,17 +484,17 @@ void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
|
||||
|
||||
// =============== Implementation of RowScratchRegisters =====================
|
||||
/* static */
|
||||
void RowScratchRegisters::AppendDebugHeaderFields(std::vector<STRING> *header) {
|
||||
header->push_back("[lmarg,lind;rind,rmarg]");
|
||||
header->push_back("model");
|
||||
void RowScratchRegisters::AppendDebugHeaderFields(std::vector<std::string> &header) {
|
||||
header.push_back("[lmarg,lind;rind,rmarg]");
|
||||
header.push_back("model");
|
||||
}
|
||||
|
||||
void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,
|
||||
std::vector<STRING> *dbg) const {
|
||||
std::vector<std::string> &dbg) const {
|
||||
char s[30];
|
||||
snprintf(s, sizeof(s), "[%3d,%3d;%3d,%3d]", lmargin_, lindent_, rindent_, rmargin_);
|
||||
dbg->push_back(s);
|
||||
STRING model_string;
|
||||
dbg.push_back(s);
|
||||
std::string model_string;
|
||||
model_string += static_cast<char>(GetLineType());
|
||||
model_string += ":";
|
||||
|
||||
@ -513,7 +505,7 @@ void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,
|
||||
if (model_numbers > 0)
|
||||
model_string += ",";
|
||||
if (StrongModel(hypotheses_[h].model)) {
|
||||
model_string += StrOf(1 + theory.IndexOf(hypotheses_[h].model));
|
||||
model_string += std::to_string(1 + theory.IndexOf(hypotheses_[h].model));
|
||||
} else if (hypotheses_[h].model == kCrownLeft) {
|
||||
model_string += "CrL";
|
||||
} else if (hypotheses_[h].model == kCrownRight) {
|
||||
@ -524,7 +516,7 @@ void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,
|
||||
if (model_numbers == 0)
|
||||
model_string += "0";
|
||||
|
||||
dbg->push_back(model_string);
|
||||
dbg.push_back(model_string);
|
||||
}
|
||||
|
||||
void RowScratchRegisters::Init(const RowInfo &row) {
|
||||
@ -2323,7 +2315,7 @@ void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
|
||||
|
||||
static void InitializeTextAndBoxesPreRecognition(const MutableIterator &it, RowInfo *info) {
|
||||
// Set up text, lword_text, and rword_text (mostly for debug printing).
|
||||
STRING fake_text;
|
||||
std::string fake_text;
|
||||
PageIterator pit(static_cast<const PageIterator &>(it));
|
||||
bool first_word = true;
|
||||
if (!pit.Empty(RIL_WORD)) {
|
||||
|
@ -21,8 +21,8 @@
|
||||
#define TESSERACT_CCMAIN_PARAGRAPHS_H_
|
||||
|
||||
#include <list>
|
||||
#include <string>
|
||||
#include "rect.h" // for TBOX
|
||||
#include "strngs.h" // for STRING
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
@ -41,7 +41,7 @@ class GenericVector;
|
||||
class RowInfo {
|
||||
public:
|
||||
// Constant data derived from Tesseract output.
|
||||
STRING text; // the full UTF-8 text of the line.
|
||||
std::string text; // the full UTF-8 text of the line.
|
||||
bool ltr; // whether the majority of the text is left-to-right
|
||||
// TODO(eger) make this more fine-grained.
|
||||
|
||||
@ -56,8 +56,8 @@ public:
|
||||
TBOX lword_box; // in normalized (horiz text rows) space
|
||||
TBOX rword_box; // in normalized (horiz text rows) space
|
||||
|
||||
STRING lword_text; // the UTF-8 text of the leftmost werd
|
||||
STRING rword_text; // the UTF-8 text of the rightmost werd
|
||||
std::string lword_text; // the UTF-8 text of the leftmost werd
|
||||
std::string rword_text; // the UTF-8 text of the rightmost werd
|
||||
|
||||
// The text of a paragraph typically starts with the start of an idea and
|
||||
// ends with the end of an idea. Here we define paragraph as something that
|
||||
|
@ -32,7 +32,7 @@ class WERD_CHOICE;
|
||||
|
||||
// Return whether the given word is likely to be a list item start word.
|
||||
TESS_API
|
||||
bool AsciiLikelyListItem(const STRING &word);
|
||||
bool AsciiLikelyListItem(const std::string &word);
|
||||
|
||||
// Return the first Unicode Codepoint from werd[pos].
|
||||
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
|
||||
@ -40,12 +40,12 @@ int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
|
||||
// Set right word attributes given either a unicharset and werd or a utf8
|
||||
// string.
|
||||
TESS_API
|
||||
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8,
|
||||
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
|
||||
bool *is_list, bool *starts_idea, bool *ends_idea);
|
||||
|
||||
// Set left word attributes given either a unicharset and werd or a utf8 string.
|
||||
TESS_API
|
||||
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8,
|
||||
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
|
||||
bool *is_list, bool *starts_idea, bool *ends_idea);
|
||||
|
||||
enum LineType {
|
||||
@ -171,10 +171,10 @@ public:
|
||||
}
|
||||
|
||||
// Append header fields to a vector of row headings.
|
||||
static void AppendDebugHeaderFields(std::vector<STRING> *header);
|
||||
static void AppendDebugHeaderFields(std::vector<std::string> &header);
|
||||
|
||||
// Append data for this row to a vector of debug strings.
|
||||
void AppendDebugInfo(const ParagraphTheory &theory, std::vector<STRING> *dbg) const;
|
||||
void AppendDebugInfo(const ParagraphTheory &theory, std::vector<std::string> &dbg) const;
|
||||
|
||||
const RowInfo *ri_;
|
||||
|
||||
|
@ -151,11 +151,7 @@ std::string ParamContent::GetValue() const {
|
||||
} else if (param_type_ == VT_DOUBLE) {
|
||||
result += std::to_string(*dIt);
|
||||
} else if (param_type_ == VT_STRING) {
|
||||
if (STRING(*(sIt)).c_str() != nullptr) {
|
||||
result = sIt->c_str();
|
||||
} else {
|
||||
result = "Null";
|
||||
}
|
||||
result = sIt->c_str();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
@ -183,8 +179,8 @@ void ParamContent::SetValue(const char *val) {
|
||||
|
||||
// Gets up to the first 3 prefixes from s (split by _).
|
||||
// For example, tesseract_foo_bar will be split into tesseract, foo and bar.
|
||||
void ParamsEditor::GetPrefixes(const char *s, STRING *level_one, STRING *level_two,
|
||||
STRING *level_three) {
|
||||
void ParamsEditor::GetPrefixes(const char *s, std::string *level_one, std::string *level_two,
|
||||
std::string *level_three) {
|
||||
std::unique_ptr<char[]> p(new char[1024]);
|
||||
GetFirstWords(s, 1, p.get());
|
||||
*level_one = p.get();
|
||||
@ -234,9 +230,9 @@ SVMenuNode *ParamsEditor::BuildListOfAllLeaves(tesseract::Tesseract *tess) {
|
||||
// Count the # of entries starting with a specific prefix.
|
||||
for (vc_it.mark_cycle_pt(); !vc_it.cycled_list(); vc_it.forward()) {
|
||||
ParamContent *vc = vc_it.data();
|
||||
STRING tag;
|
||||
STRING tag2;
|
||||
STRING tag3;
|
||||
std::string tag;
|
||||
std::string tag2;
|
||||
std::string tag3;
|
||||
|
||||
GetPrefixes(vc->GetName(), &tag, &tag2, &tag3);
|
||||
amount[tag.c_str()]++;
|
||||
@ -252,9 +248,9 @@ SVMenuNode *ParamsEditor::BuildListOfAllLeaves(tesseract::Tesseract *tess) {
|
||||
vc_it.move_to_first();
|
||||
for (vc_it.mark_cycle_pt(); !vc_it.cycled_list(); vc_it.forward()) {
|
||||
ParamContent *vc = vc_it.data();
|
||||
STRING tag;
|
||||
STRING tag2;
|
||||
STRING tag3;
|
||||
std::string tag;
|
||||
std::string tag2;
|
||||
std::string tag3;
|
||||
GetPrefixes(vc->GetName(), &tag, &tag2, &tag3);
|
||||
|
||||
if (amount[tag.c_str()] == 1) {
|
||||
@ -304,7 +300,7 @@ ParamsEditor::ParamsEditor(tesseract::Tesseract *tess, ScrollView *sv) {
|
||||
|
||||
SVMenuNode *svMenuRoot = BuildListOfAllLeaves(tess);
|
||||
|
||||
STRING paramfile;
|
||||
std::string paramfile;
|
||||
paramfile = tess->datadir;
|
||||
paramfile += VARDIR; // parameters dir
|
||||
paramfile += "edited"; // actual name
|
||||
|
@ -25,7 +25,6 @@
|
||||
|
||||
# include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
|
||||
# include "scrollview.h" // for ScrollView (ptr only), SVEvent (ptr only)
|
||||
# include "strngs.h" // for STRING
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
@ -107,7 +106,7 @@ public:
|
||||
private:
|
||||
// Gets up to the first 3 prefixes from s (split by _).
|
||||
// For example, tesseract_foo_bar will be split into tesseract, foo and bar.
|
||||
void GetPrefixes(const char *s, STRING *level_one, STRING *level_two, STRING *level_three);
|
||||
void GetPrefixes(const char *s, std::string *level_one, std::string *level_two, std::string *level_three);
|
||||
|
||||
// Gets the first n words (split by _) and puts them in t.
|
||||
// For example, tesseract_foo_bar with N=2 will yield tesseract_foo_.
|
||||
|
@ -794,8 +794,8 @@ bool Tesseract::word_display(PAGE_RES_IT *pr_it) {
|
||||
}
|
||||
|
||||
// Display correct text and blamer information.
|
||||
STRING text;
|
||||
STRING blame;
|
||||
std::string text;
|
||||
std::string blame;
|
||||
if (word->display_flag(DF_TEXT) && word->text() != nullptr) {
|
||||
text = word->text();
|
||||
}
|
||||
@ -810,7 +810,7 @@ bool Tesseract::word_display(PAGE_RES_IT *pr_it) {
|
||||
text = blamer_bundle->TruthString();
|
||||
}
|
||||
text += " -> ";
|
||||
STRING best_choice_str;
|
||||
std::string best_choice_str;
|
||||
if (word_res->best_choice == nullptr) {
|
||||
best_choice_str = "NULL";
|
||||
} else {
|
||||
|
@ -98,7 +98,7 @@ void Tesseract::recog_training_segmented(const char *filename, PAGE_RES *page_re
|
||||
PAGE_RES_IT page_res_it;
|
||||
page_res_it.page_res = page_res;
|
||||
page_res_it.restart_page();
|
||||
STRING label;
|
||||
std::string label;
|
||||
|
||||
// Process all the words on this page.
|
||||
TBOX tbox; // tesseract-identified box
|
||||
@ -108,14 +108,14 @@ void Tesseract::recog_training_segmented(const char *filename, PAGE_RES *page_re
|
||||
int examined_words = 0;
|
||||
do {
|
||||
keep_going = read_t(&page_res_it, &tbox);
|
||||
keep_going &= ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
|
||||
keep_going &= ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
|
||||
// Align bottom left points of the TBOXes.
|
||||
while (keep_going && !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
|
||||
if (bbox.bottom() < tbox.bottom()) {
|
||||
page_res_it.forward();
|
||||
keep_going = read_t(&page_res_it, &tbox);
|
||||
} else {
|
||||
keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
|
||||
keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
|
||||
}
|
||||
}
|
||||
while (keep_going && !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
|
||||
@ -123,7 +123,7 @@ void Tesseract::recog_training_segmented(const char *filename, PAGE_RES *page_re
|
||||
page_res_it.forward();
|
||||
keep_going = read_t(&page_res_it, &tbox);
|
||||
} else {
|
||||
keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
|
||||
keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
|
||||
}
|
||||
}
|
||||
// OCR the word if top right points of the TBOXes are similar.
|
||||
|
@ -53,9 +53,6 @@ int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
|
||||
|
||||
namespace tesseract {
|
||||
|
||||
CLISTIZEH(STRING)
|
||||
CLISTIZE(STRING)
|
||||
|
||||
/*************************************************************************
|
||||
* set_done()
|
||||
*
|
||||
@ -196,7 +193,7 @@ void Tesseract::reject_I_1_L(WERD_RES *word) {
|
||||
|
||||
for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
|
||||
offset += word->best_choice->unichar_lengths()[i], i += 1) {
|
||||
if (STRING(conflict_set_I_l_1).contains(word->best_choice->unichar_string()[offset])) {
|
||||
if (conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])) {
|
||||
// rej 1Il conflict
|
||||
word->reject_map[i].setrej_1Il_conflict();
|
||||
}
|
||||
@ -316,7 +313,7 @@ bool Tesseract::one_ell_conflict(WERD_RES *word_res, bool update_map) {
|
||||
offset += lengths[i++])
|
||||
non_conflict_set_char = (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
|
||||
word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
|
||||
!STRING(conflict_set_I_l_1).contains(word[offset]);
|
||||
!conflict_set_I_l_1.contains(word[offset]);
|
||||
if (!non_conflict_set_char) {
|
||||
if (update_map)
|
||||
reject_I_1_L(word_res);
|
||||
@ -409,7 +406,7 @@ bool Tesseract::one_ell_conflict(WERD_RES *word_res, bool update_map) {
|
||||
for (i = 0, offset = 0; word[offset] != '\0';
|
||||
offset += word_res->best_choice->unichar_lengths()[i++]) {
|
||||
if ((!allow_1s || (word[offset] != '1')) &&
|
||||
STRING(conflict_set_I_l_1).contains(word[offset])) {
|
||||
conflict_set_I_l_1.contains(word[offset])) {
|
||||
if (update_map)
|
||||
word_res->reject_map[i].setrej_1Il_conflict();
|
||||
conflict = true;
|
||||
@ -425,7 +422,7 @@ bool Tesseract::one_ell_conflict(WERD_RES *word_res, bool update_map) {
|
||||
if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
|
||||
first_alphanum_index_ = first_alphanum_index(word, lengths);
|
||||
first_alphanum_offset_ = first_alphanum_offset(word, lengths);
|
||||
if (STRING(conflict_set_I_l_1).contains(word[first_alphanum_offset_])) {
|
||||
if (conflict_set_I_l_1.contains(word[first_alphanum_offset_])) {
|
||||
if (update_map)
|
||||
word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
|
||||
return true;
|
||||
@ -502,7 +499,7 @@ void Tesseract::dont_allow_1Il(WERD_RES *word) {
|
||||
|
||||
for (i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
|
||||
if (word->reject_map[i].accepted()) {
|
||||
if (STRING(conflict_set_I_l_1).contains(s[offset])) {
|
||||
if (conflict_set_I_l_1.contains(s[offset])) {
|
||||
accepted_1Il = true;
|
||||
} else {
|
||||
if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
|
||||
@ -515,7 +512,7 @@ void Tesseract::dont_allow_1Il(WERD_RES *word) {
|
||||
return; // Nothing to worry about
|
||||
|
||||
for (i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
|
||||
if (STRING(conflict_set_I_l_1).contains(s[offset]) && word->reject_map[i].accepted())
|
||||
if (conflict_set_I_l_1.contains(s[offset]) && word->reject_map[i].accepted())
|
||||
word->reject_map[i].setrej_postNN_1Il();
|
||||
}
|
||||
}
|
||||
@ -549,7 +546,7 @@ bool Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
|
||||
if (word->best_choice->unichar_lengths().length() <= 1)
|
||||
return false;
|
||||
|
||||
if (!STRING(ok_repeated_ch_non_alphanum_wds).contains(word->best_choice->unichar_string()[0]))
|
||||
if (!ok_repeated_ch_non_alphanum_wds.contains(word->best_choice->unichar_string()[0]))
|
||||
return false;
|
||||
|
||||
UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
|
||||
|
@ -440,7 +440,7 @@ Dict &Tesseract::getDict() {
|
||||
}
|
||||
|
||||
void Tesseract::Clear() {
|
||||
STRING debug_name = imagebasename + "_debug.pdf";
|
||||
std::string debug_name = imagebasename + "_debug.pdf";
|
||||
pixa_debug_.WritePDF(debug_name.c_str());
|
||||
pixDestroy(&pix_binary_);
|
||||
pixDestroy(&pix_grey_);
|
||||
|
@ -98,9 +98,9 @@ bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool
|
||||
int num_boxes = 0;
|
||||
for (int i = 0; i < lines.size(); ++i) {
|
||||
int page = 0;
|
||||
STRING utf8_str;
|
||||
std::string utf8_str;
|
||||
TBOX box;
|
||||
if (!ParseBoxFileStr(lines[i].c_str(), &page, &utf8_str, &box)) {
|
||||
if (!ParseBoxFileStr(lines[i].c_str(), &page, utf8_str, &box)) {
|
||||
if (continue_on_failure)
|
||||
continue;
|
||||
else
|
||||
@ -137,14 +137,14 @@ bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool
|
||||
// for valid utf-8 and allows space or tab between fields.
|
||||
// utf8_str is set with the unichar string, and bounding box with the box.
|
||||
// If there are page numbers in the file, it reads them all.
|
||||
bool ReadNextBox(int *line_number, FILE *box_file, STRING *utf8_str, TBOX *bounding_box) {
|
||||
bool ReadNextBox(int *line_number, FILE *box_file, std::string &utf8_str, TBOX *bounding_box) {
|
||||
return ReadNextBox(-1, line_number, box_file, utf8_str, bounding_box);
|
||||
}
|
||||
|
||||
// As ReadNextBox above, but get a specific page number. (0-based)
|
||||
// Use -1 to read any page number. Files without page numbers are all
|
||||
// read as if they are page 0.
|
||||
bool ReadNextBox(int target_page, int *line_number, FILE *box_file, STRING *utf8_str,
|
||||
bool ReadNextBox(int target_page, int *line_number, FILE *box_file, std::string &utf8_str,
|
||||
TBOX *bounding_box) {
|
||||
int page = 0;
|
||||
char buff[kBoxReadBufSize]; // boxfile read buffer
|
||||
@ -185,10 +185,10 @@ bool ReadNextBox(int target_page, int *line_number, FILE *box_file, STRING *utf8
|
||||
// and for word/line-level boxes:
|
||||
// WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
|
||||
// See applybox.cpp for more information.
|
||||
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, STRING *utf8_str,
|
||||
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, std::string &utf8_str,
|
||||
TBOX *bounding_box) {
|
||||
*bounding_box = TBOX(); // Initialize it to empty.
|
||||
*utf8_str = "";
|
||||
utf8_str = "";
|
||||
char uch[kBoxReadBufSize];
|
||||
const char *buffptr = boxfile_str;
|
||||
// Read the unichar without messing up on Tibetan.
|
||||
@ -245,7 +245,7 @@ bool ParseBoxFileStr(const char *boxfile_str, int *page_number, STRING *utf8_str
|
||||
}
|
||||
used += new_used;
|
||||
}
|
||||
*utf8_str = uch;
|
||||
utf8_str = uch;
|
||||
if (x_min > x_max)
|
||||
std::swap(x_min, x_max);
|
||||
if (y_min > y_max)
|
||||
|
@ -64,18 +64,18 @@ bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool
|
||||
// utf8_str is set with the unichar string, and bounding box with the box.
|
||||
// If there are page numbers in the file, it reads them all.
|
||||
TESS_API
|
||||
bool ReadNextBox(int *line_number, FILE *box_file, STRING *utf8_str, TBOX *bounding_box);
|
||||
bool ReadNextBox(int *line_number, FILE *box_file, std::string &utf8_str, TBOX *bounding_box);
|
||||
// As ReadNextBox above, but get a specific page number. (0-based)
|
||||
// Use -1 to read any page number. Files without page numbers are all
|
||||
// read as if they are page 0.
|
||||
TESS_API
|
||||
bool ReadNextBox(int target_page, int *line_number, FILE *box_file, STRING *utf8_str,
|
||||
bool ReadNextBox(int target_page, int *line_number, FILE *box_file, std::string &utf8_str,
|
||||
TBOX *bounding_box);
|
||||
|
||||
// Parses the given box file string into a page_number, utf8_str, and
|
||||
// bounding_box. Returns true on a successful parse.
|
||||
TESS_API
|
||||
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, STRING *utf8_str,
|
||||
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, std::string &utf8_str,
|
||||
TBOX *bounding_box);
|
||||
|
||||
// Creates a box file string from a unichar string, TBOX and page number.
|
||||
|
@ -91,7 +91,7 @@ public:
|
||||
}
|
||||
}
|
||||
}
|
||||
// Fetches the value of the named param as a STRING. Returns false if not
|
||||
// Fetches the value of the named param as a string. Returns false if not
|
||||
// found.
|
||||
static bool GetParamAsString(const char *name, const ParamsVectors *member_params,
|
||||
std::string *value);
|
||||
@ -242,6 +242,9 @@ public:
|
||||
// Returns the result of value_.c_str(), i.e. the stored value as a C string.
// Const member: the call does not modify the parameter object.
const char *c_str() const {
|
||||
return value_.c_str();
|
||||
}
|
||||
bool contains(char c) {
|
||||
return value_.find(c) != std::string::npos;
|
||||
}
|
||||
bool empty() {
|
||||
return value_.length() <= 0;
|
||||
}
|
||||
|
@ -96,10 +96,6 @@ bool STRING::SkipDeSerialize(TFile *fp) {
|
||||
return fp->Skip(len);
|
||||
}
|
||||
|
||||
bool STRING::contains(const char c) const {
|
||||
return (c != '\0') && (strchr(c_str(), c) != nullptr);
|
||||
}
|
||||
|
||||
void STRING::split(const char c, std::vector<STRING> *splited) {
|
||||
int start_index = 0;
|
||||
const int len = length();
|
||||
|
@ -59,9 +59,6 @@ public:
|
||||
TESS_API
|
||||
static bool SkipDeSerialize(tesseract::TFile *fp);
|
||||
|
||||
TESS_API
|
||||
bool contains(char c) const;
|
||||
|
||||
TESS_API
|
||||
void split(char c, std::vector<STRING> *splited);
|
||||
};
|
||||
|
@ -149,9 +149,9 @@ void MasterTrainer::ReadTrainingSamples(const char *page_name,
|
||||
if (font_id < 0)
|
||||
font_id = 0;
|
||||
int page_number;
|
||||
STRING unichar;
|
||||
std::string unichar;
|
||||
TBOX bounding_box;
|
||||
if (!ParseBoxFileStr(space, &page_number, &unichar, &bounding_box)) {
|
||||
if (!ParseBoxFileStr(space, &page_number, unichar, &bounding_box)) {
|
||||
tprintf("Bad format in tr file, reading box coords\n");
|
||||
continue;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user