mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-03 00:49:01 +08:00
Replace remaining STRING by std::string in src/ccmain
Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
parent
d7823a71c2
commit
db9f963411
@ -1658,7 +1658,7 @@ ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const UNICHARSET &char_se
|
|||||||
|
|
||||||
/* Single Leading punctuation char*/
|
/* Single Leading punctuation char*/
|
||||||
|
|
||||||
if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
|
if (s[offset] != '\0' && chs_leading_punct.contains(s[offset]))
|
||||||
offset += lengths[i++];
|
offset += lengths[i++];
|
||||||
leading_punct_count = i;
|
leading_punct_count = i;
|
||||||
|
|
||||||
@ -1705,10 +1705,10 @@ Allow a single hyphen in a lower case word
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Up to two different, constrained trailing punctuation chars */
|
/* Up to two different, constrained trailing punctuation chars */
|
||||||
if (lengths[i] == 1 && s[offset] != '\0' && STRING(chs_trailing_punct1).contains(s[offset]))
|
if (lengths[i] == 1 && s[offset] != '\0' && chs_trailing_punct1.contains(s[offset]))
|
||||||
offset += lengths[i++];
|
offset += lengths[i++];
|
||||||
if (lengths[i] == 1 && s[offset] != '\0' && i > 0 && s[offset - lengths[i - 1]] != s[offset] &&
|
if (lengths[i] == 1 && s[offset] != '\0' && i > 0 && s[offset - lengths[i - 1]] != s[offset] &&
|
||||||
STRING(chs_trailing_punct2).contains(s[offset]))
|
chs_trailing_punct2.contains(s[offset]))
|
||||||
offset += lengths[i++];
|
offset += lengths[i++];
|
||||||
|
|
||||||
if (s[offset] != '\0')
|
if (s[offset] != '\0')
|
||||||
|
@ -107,9 +107,9 @@ void Tesseract::unrej_good_chs(WERD_RES *word) {
|
|||||||
int16_t Tesseract::count_outline_errs(char c, int16_t outline_count) {
|
int16_t Tesseract::count_outline_errs(char c, int16_t outline_count) {
|
||||||
int expected_outline_count;
|
int expected_outline_count;
|
||||||
|
|
||||||
if (STRING(outlines_odd).contains(c))
|
if (outlines_odd.contains(c))
|
||||||
return 0; // Don't use this char
|
return 0; // Don't use this char
|
||||||
else if (STRING(outlines_2).contains(c))
|
else if (outlines_2.contains(c))
|
||||||
expected_outline_count = 2;
|
expected_outline_count = 2;
|
||||||
else
|
else
|
||||||
expected_outline_count = 1;
|
expected_outline_count = 1;
|
||||||
|
@ -216,7 +216,7 @@ void EquationDetect::IdentifySpecialText(BLOBNBOX *blobnbox, const int height_th
|
|||||||
|
|
||||||
BlobSpecialTextType EquationDetect::EstimateTypeForUnichar(const UNICHARSET &unicharset,
|
BlobSpecialTextType EquationDetect::EstimateTypeForUnichar(const UNICHARSET &unicharset,
|
||||||
const UNICHAR_ID id) const {
|
const UNICHAR_ID id) const {
|
||||||
const STRING s = unicharset.id_to_unichar(id);
|
const std::string s = unicharset.id_to_unichar(id);
|
||||||
if (unicharset.get_isalpha(id)) {
|
if (unicharset.get_isalpha(id)) {
|
||||||
return BSTT_NONE;
|
return BSTT_NONE;
|
||||||
}
|
}
|
||||||
@ -237,8 +237,8 @@ BlobSpecialTextType EquationDetect::EstimateTypeForUnichar(const UNICHARSET &uni
|
|||||||
|
|
||||||
// Check if it is digit. In addition to the isdigit attribute, we also check
|
// Check if it is digit. In addition to the isdigit attribute, we also check
|
||||||
// if this character belongs to those likely to be confused with a digit.
|
// if this character belongs to those likely to be confused with a digit.
|
||||||
static const STRING kDigitsChars = "|";
|
static const char kDigitsChars[] = "|";
|
||||||
if (unicharset.get_isdigit(id) || (s.length() == 1 && kDigitsChars.contains(s[0]))) {
|
if (unicharset.get_isdigit(id) || (s.length() == 1 && strchr(kDigitsChars, s[0]) != nullptr)) {
|
||||||
return BSTT_DIGIT;
|
return BSTT_DIGIT;
|
||||||
} else {
|
} else {
|
||||||
return BSTT_MATH;
|
return BSTT_MATH;
|
||||||
@ -286,8 +286,8 @@ void EquationDetect::IdentifySpecialText() {
|
|||||||
lang_tesseract_->classify_integer_matcher_multiplier.set_value(classify_integer_matcher);
|
lang_tesseract_->classify_integer_matcher_multiplier.set_value(classify_integer_matcher);
|
||||||
|
|
||||||
if (equationdetect_save_spt_image) { // For debug.
|
if (equationdetect_save_spt_image) { // For debug.
|
||||||
STRING outfile;
|
std::string outfile;
|
||||||
GetOutputTiffName("_spt", &outfile);
|
GetOutputTiffName("_spt", outfile);
|
||||||
PaintSpecialTexts(outfile);
|
PaintSpecialTexts(outfile);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -351,11 +351,11 @@ int EquationDetect::FindEquationParts(ColPartitionGrid *part_grid, ColPartitionS
|
|||||||
part_grid_ = part_grid;
|
part_grid_ = part_grid;
|
||||||
best_columns_ = best_columns;
|
best_columns_ = best_columns;
|
||||||
resolution_ = lang_tesseract_->source_resolution();
|
resolution_ = lang_tesseract_->source_resolution();
|
||||||
STRING outfile;
|
std::string outfile;
|
||||||
page_count_++;
|
page_count_++;
|
||||||
|
|
||||||
if (equationdetect_save_bi_image) {
|
if (equationdetect_save_bi_image) {
|
||||||
GetOutputTiffName("_bi", &outfile);
|
GetOutputTiffName("_bi", outfile);
|
||||||
pixWrite(outfile.c_str(), lang_tesseract_->pix_binary(), IFF_TIFF_G4);
|
pixWrite(outfile.c_str(), lang_tesseract_->pix_binary(), IFF_TIFF_G4);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -371,7 +371,7 @@ int EquationDetect::FindEquationParts(ColPartitionGrid *part_grid, ColPartitionS
|
|||||||
IdentifyInlineParts();
|
IdentifyInlineParts();
|
||||||
|
|
||||||
if (equationdetect_save_seed_image) {
|
if (equationdetect_save_seed_image) {
|
||||||
GetOutputTiffName("_seed", &outfile);
|
GetOutputTiffName("_seed", outfile);
|
||||||
PaintColParts(outfile);
|
PaintColParts(outfile);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -396,7 +396,7 @@ int EquationDetect::FindEquationParts(ColPartitionGrid *part_grid, ColPartitionS
|
|||||||
ProcessMathBlockSatelliteParts();
|
ProcessMathBlockSatelliteParts();
|
||||||
|
|
||||||
if (equationdetect_save_merged_image) { // For debug.
|
if (equationdetect_save_merged_image) { // For debug.
|
||||||
GetOutputTiffName("_merged", &outfile);
|
GetOutputTiffName("_merged", outfile);
|
||||||
PaintColParts(outfile);
|
PaintColParts(outfile);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1383,14 +1383,14 @@ bool EquationDetect::IsNearMathNeighbor(const int y_gap, const ColPartition *nei
|
|||||||
return neighbor->type() == PT_EQUATION && y_gap <= kYGapTh;
|
return neighbor->type() == PT_EQUATION && y_gap <= kYGapTh;
|
||||||
}
|
}
|
||||||
|
|
||||||
void EquationDetect::GetOutputTiffName(const char *name, STRING *image_name) const {
|
void EquationDetect::GetOutputTiffName(const char *name, std::string &image_name) const {
|
||||||
ASSERT_HOST(image_name && name);
|
ASSERT_HOST(name);
|
||||||
char page[50];
|
char page[50];
|
||||||
snprintf(page, sizeof(page), "%04d", page_count_);
|
snprintf(page, sizeof(page), "%04d", page_count_);
|
||||||
*image_name = STRING(lang_tesseract_->imagebasename) + page + name + ".tif";
|
image_name = (lang_tesseract_->imagebasename) + page + name + ".tif";
|
||||||
}
|
}
|
||||||
|
|
||||||
void EquationDetect::PaintSpecialTexts(const STRING &outfile) const {
|
void EquationDetect::PaintSpecialTexts(const std::string &outfile) const {
|
||||||
Pix *pix = nullptr, *pixBi = lang_tesseract_->pix_binary();
|
Pix *pix = nullptr, *pixBi = lang_tesseract_->pix_binary();
|
||||||
pix = pixConvertTo32(pixBi);
|
pix = pixConvertTo32(pixBi);
|
||||||
ColPartitionGridSearch gsearch(part_grid_);
|
ColPartitionGridSearch gsearch(part_grid_);
|
||||||
@ -1407,7 +1407,7 @@ void EquationDetect::PaintSpecialTexts(const STRING &outfile) const {
|
|||||||
pixDestroy(&pix);
|
pixDestroy(&pix);
|
||||||
}
|
}
|
||||||
|
|
||||||
void EquationDetect::PaintColParts(const STRING &outfile) const {
|
void EquationDetect::PaintColParts(const std::string &outfile) const {
|
||||||
Pix *pix = pixConvertTo32(lang_tesseract_->BestPix());
|
Pix *pix = pixConvertTo32(lang_tesseract_->BestPix());
|
||||||
ColPartitionGridSearch gsearch(part_grid_);
|
ColPartitionGridSearch gsearch(part_grid_);
|
||||||
gsearch.StartFullSearch();
|
gsearch.StartFullSearch();
|
||||||
|
@ -202,16 +202,16 @@ protected:
|
|||||||
bool IsNearMathNeighbor(const int y_gap, const ColPartition *neighbor) const;
|
bool IsNearMathNeighbor(const int y_gap, const ColPartition *neighbor) const;
|
||||||
|
|
||||||
// Generate the tiff file name for output/debug file.
|
// Generate the tiff file name for output/debug file.
|
||||||
void GetOutputTiffName(const char *name, STRING *image_name) const;
|
void GetOutputTiffName(const char *name, std::string &image_name) const;
|
||||||
|
|
||||||
// Debugger function that renders ColPartitions on the input image, where:
|
// Debugger function that renders ColPartitions on the input image, where:
|
||||||
// parts labeled as PT_EQUATION will be painted in red, PT_INLINE_EQUATION
|
// parts labeled as PT_EQUATION will be painted in red, PT_INLINE_EQUATION
|
||||||
// will be painted in green, and other parts will be painted in blue.
|
// will be painted in green, and other parts will be painted in blue.
|
||||||
void PaintColParts(const STRING &outfile) const;
|
void PaintColParts(const std::string &outfile) const;
|
||||||
|
|
||||||
// Debugger function that renders the blobs in part_grid_ over the input
|
// Debugger function that renders the blobs in part_grid_ over the input
|
||||||
// image.
|
// image.
|
||||||
void PaintSpecialTexts(const STRING &outfile) const;
|
void PaintSpecialTexts(const std::string &outfile) const;
|
||||||
|
|
||||||
// Debugger function that print the math blobs density values for a
|
// Debugger function that print the math blobs density values for a
|
||||||
// ColPartition object.
|
// ColPartition object.
|
||||||
|
@ -37,7 +37,6 @@
|
|||||||
|
|
||||||
#include <tesseract/ocrclass.h> // for ETEXT_DESC
|
#include <tesseract/ocrclass.h> // for ETEXT_DESC
|
||||||
#include <tesseract/unichar.h> // for UNICHAR_ID
|
#include <tesseract/unichar.h> // for UNICHAR_ID
|
||||||
#include "strngs.h" // for STRING
|
|
||||||
|
|
||||||
#include <cstdint> // for INT16_MAX, int16_t, int32_t
|
#include <cstdint> // for INT16_MAX, int16_t, int32_t
|
||||||
|
|
||||||
@ -288,7 +287,7 @@ int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
|
|||||||
((word_done && word->best_choice->unichar_lengths().c_str()[0] == 1 &&
|
((word_done && word->best_choice->unichar_lengths().c_str()[0] == 1 &&
|
||||||
word->best_choice->unichar_string()[0] == '1') ||
|
word->best_choice->unichar_string()[0] == '1') ||
|
||||||
(!word_done &&
|
(!word_done &&
|
||||||
STRING(conflict_set_I_l_1).contains(word->best_choice->unichar_string()[0])))))) {
|
conflict_set_I_l_1.contains(word->best_choice->unichar_string()[0])))))) {
|
||||||
total_score += prev_word_score;
|
total_score += prev_word_score;
|
||||||
if (prev_word_done)
|
if (prev_word_done)
|
||||||
done_word_count++;
|
done_word_count++;
|
||||||
@ -330,7 +329,7 @@ int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
|
|||||||
prev_char_1 =
|
prev_char_1 =
|
||||||
((word_done && (word->best_choice->unichar_string()[offset] == '1')) ||
|
((word_done && (word->best_choice->unichar_string()[offset] == '1')) ||
|
||||||
(!word_done &&
|
(!word_done &&
|
||||||
STRING(conflict_set_I_l_1).contains(word->best_choice->unichar_string()[offset])));
|
conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])));
|
||||||
}
|
}
|
||||||
/* Find next word */
|
/* Find next word */
|
||||||
do {
|
do {
|
||||||
@ -356,7 +355,7 @@ bool Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
|
|||||||
word->uch_set->get_isdigit(word->best_choice->unichar_string().c_str() + offset,
|
word->uch_set->get_isdigit(word->best_choice->unichar_string().c_str() + offset,
|
||||||
word->best_choice->unichar_lengths()[i]) ||
|
word->best_choice->unichar_lengths()[i]) ||
|
||||||
(word->best_choice->permuter() == NUMBER_PERM &&
|
(word->best_choice->permuter() == NUMBER_PERM &&
|
||||||
STRING(numeric_punctuation).contains(word->best_choice->unichar_string().c_str()[offset])));
|
numeric_punctuation.contains(word->best_choice->unichar_string().c_str()[offset])));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -24,8 +24,6 @@
|
|||||||
|
|
||||||
#include <allheaders.h>
|
#include <allheaders.h>
|
||||||
|
|
||||||
#include "strngs.h"
|
|
||||||
|
|
||||||
namespace tesseract {
|
namespace tesseract {
|
||||||
|
|
||||||
LTRResultIterator::LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
|
LTRResultIterator::LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
|
||||||
@ -46,7 +44,7 @@ LTRResultIterator::~LTRResultIterator() = default;
|
|||||||
char *LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const {
|
char *LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const {
|
||||||
if (it_->word() == nullptr)
|
if (it_->word() == nullptr)
|
||||||
return nullptr; // Already at the end!
|
return nullptr; // Already at the end!
|
||||||
STRING text;
|
std::string text;
|
||||||
PAGE_RES_IT res_it(*it_);
|
PAGE_RES_IT res_it(*it_);
|
||||||
WERD_CHOICE *best_choice = res_it.word()->best_choice;
|
WERD_CHOICE *best_choice = res_it.word()->best_choice;
|
||||||
ASSERT_HOST(best_choice != nullptr);
|
ASSERT_HOST(best_choice != nullptr);
|
||||||
@ -306,7 +304,7 @@ bool LTRResultIterator::EquivalentToTruth(const char *str) const {
|
|||||||
char *LTRResultIterator::WordTruthUTF8Text() const {
|
char *LTRResultIterator::WordTruthUTF8Text() const {
|
||||||
if (!HasTruthString())
|
if (!HasTruthString())
|
||||||
return nullptr;
|
return nullptr;
|
||||||
STRING truth_text = it_->word()->blamer_bundle->TruthString();
|
std::string truth_text = it_->word()->blamer_bundle->TruthString();
|
||||||
int length = truth_text.length() + 1;
|
int length = truth_text.length() + 1;
|
||||||
char *result = new char[length];
|
char *result = new char[length];
|
||||||
strncpy(result, truth_text.c_str(), length);
|
strncpy(result, truth_text.c_str(), length);
|
||||||
@ -318,7 +316,7 @@ char *LTRResultIterator::WordTruthUTF8Text() const {
|
|||||||
char *LTRResultIterator::WordNormedUTF8Text() const {
|
char *LTRResultIterator::WordNormedUTF8Text() const {
|
||||||
if (it_->word() == nullptr)
|
if (it_->word() == nullptr)
|
||||||
return nullptr; // Already at the end!
|
return nullptr; // Already at the end!
|
||||||
STRING ocr_text;
|
std::string ocr_text;
|
||||||
WERD_CHOICE *best_choice = it_->word()->best_choice;
|
WERD_CHOICE *best_choice = it_->word()->best_choice;
|
||||||
const UNICHARSET *unicharset = it_->word()->uch_set;
|
const UNICHARSET *unicharset = it_->word()->uch_set;
|
||||||
ASSERT_HOST(best_choice != nullptr);
|
ASSERT_HOST(best_choice != nullptr);
|
||||||
|
@ -32,7 +32,6 @@
|
|||||||
#include "ratngs.h" // for WERD_CHOICE
|
#include "ratngs.h" // for WERD_CHOICE
|
||||||
#include "rect.h" // for TBOX
|
#include "rect.h" // for TBOX
|
||||||
#include "statistc.h" // for STATS
|
#include "statistc.h" // for STATS
|
||||||
#include "strngs.h" // for STRING
|
|
||||||
#include "tprintf.h" // for tprintf
|
#include "tprintf.h" // for tprintf
|
||||||
#include "unicharset.h" // for UNICHARSET
|
#include "unicharset.h" // for UNICHARSET
|
||||||
#include "werd.h" // for WERD, W_REP_CHAR
|
#include "werd.h" // for WERD, W_REP_CHAR
|
||||||
@ -91,16 +90,9 @@ static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *fun
|
|||||||
|
|
||||||
// =============================== Debug Code ================================
|
// =============================== Debug Code ================================
|
||||||
|
|
||||||
// Convert an integer to a decimal string.
|
|
||||||
static STRING StrOf(int num) {
|
|
||||||
char buffer[30];
|
|
||||||
snprintf(buffer, sizeof(buffer), "%d", num);
|
|
||||||
return STRING(buffer);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Given a row-major matrix of unicode text and a column separator, print
|
// Given a row-major matrix of unicode text and a column separator, print
|
||||||
// a formatted table. For ASCII, we get good column alignment.
|
// a formatted table. For ASCII, we get good column alignment.
|
||||||
static void PrintTable(const std::vector<std::vector<STRING>> &rows, const STRING &colsep) {
|
static void PrintTable(const std::vector<std::vector<std::string>> &rows, const char *colsep) {
|
||||||
std::vector<int> max_col_widths;
|
std::vector<int> max_col_widths;
|
||||||
for (const auto &row : rows) {
|
for (const auto &row : rows) {
|
||||||
int num_columns = row.size();
|
int num_columns = row.size();
|
||||||
@ -119,56 +111,56 @@ static void PrintTable(const std::vector<std::vector<STRING>> &rows, const STRIN
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<STRING> col_width_patterns;
|
std::vector<std::string> col_width_patterns;
|
||||||
for (int c = 0; c < max_col_widths.size(); c++) {
|
for (int c = 0; c < max_col_widths.size(); c++) {
|
||||||
col_width_patterns.push_back(STRING("%-") + StrOf(max_col_widths[c]) + "s");
|
col_width_patterns.push_back(std::string("%-") + std::to_string(max_col_widths[c]) + "s");
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int r = 0; r < rows.size(); r++) {
|
for (int r = 0; r < rows.size(); r++) {
|
||||||
for (int c = 0; c < rows[r].size(); c++) {
|
for (int c = 0; c < rows[r].size(); c++) {
|
||||||
if (c > 0)
|
if (c > 0)
|
||||||
tprintf("%s", colsep.c_str());
|
tprintf("%s", colsep);
|
||||||
tprintf(col_width_patterns[c].c_str(), rows[r][c].c_str());
|
tprintf(col_width_patterns[c].c_str(), rows[r][c].c_str());
|
||||||
}
|
}
|
||||||
tprintf("\n");
|
tprintf("\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static STRING RtlEmbed(const STRING &word, bool rtlify) {
|
static std::string RtlEmbed(const std::string &word, bool rtlify) {
|
||||||
if (rtlify)
|
if (rtlify)
|
||||||
return STRING(kRLE) + word + STRING(kPDF);
|
return std::string(kRLE) + word + std::string(kPDF);
|
||||||
return word;
|
return word;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Print the current thoughts of the paragraph detector.
|
// Print the current thoughts of the paragraph detector.
|
||||||
static void PrintDetectorState(const ParagraphTheory &theory,
|
static void PrintDetectorState(const ParagraphTheory &theory,
|
||||||
const GenericVector<RowScratchRegisters> &rows) {
|
const GenericVector<RowScratchRegisters> &rows) {
|
||||||
std::vector<std::vector<STRING>> output;
|
std::vector<std::vector<std::string>> output;
|
||||||
output.push_back(std::vector<STRING>());
|
output.push_back(std::vector<std::string>());
|
||||||
output.back().push_back("#row");
|
output.back().push_back("#row");
|
||||||
output.back().push_back("space");
|
output.back().push_back("space");
|
||||||
output.back().push_back("..");
|
output.back().push_back("..");
|
||||||
output.back().push_back("lword[widthSEL]");
|
output.back().push_back("lword[widthSEL]");
|
||||||
output.back().push_back("rword[widthSEL]");
|
output.back().push_back("rword[widthSEL]");
|
||||||
RowScratchRegisters::AppendDebugHeaderFields(&output.back());
|
RowScratchRegisters::AppendDebugHeaderFields(output.back());
|
||||||
output.back().push_back("text");
|
output.back().push_back("text");
|
||||||
|
|
||||||
for (int i = 0; i < rows.size(); i++) {
|
for (int i = 0; i < rows.size(); i++) {
|
||||||
output.push_back(std::vector<STRING>());
|
output.push_back(std::vector<std::string>());
|
||||||
std::vector<STRING> &row = output.back();
|
std::vector<std::string> &row = output.back();
|
||||||
const RowInfo &ri = *rows[i].ri_;
|
const RowInfo &ri = *rows[i].ri_;
|
||||||
row.push_back(StrOf(i));
|
row.push_back(std::to_string(i));
|
||||||
row.push_back(StrOf(ri.average_interword_space));
|
row.push_back(std::to_string(ri.average_interword_space));
|
||||||
row.push_back(ri.has_leaders ? ".." : " ");
|
row.push_back(ri.has_leaders ? ".." : " ");
|
||||||
row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) + "[" + StrOf(ri.lword_box.width()) +
|
row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) + "[" + std::to_string(ri.lword_box.width()) +
|
||||||
(ri.lword_likely_starts_idea ? "S" : "s") +
|
(ri.lword_likely_starts_idea ? "S" : "s") +
|
||||||
(ri.lword_likely_ends_idea ? "E" : "e") +
|
(ri.lword_likely_ends_idea ? "E" : "e") +
|
||||||
(ri.lword_indicates_list_item ? "L" : "l") + "]");
|
(ri.lword_indicates_list_item ? "L" : "l") + "]");
|
||||||
row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) + "[" + StrOf(ri.rword_box.width()) +
|
row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) + "[" + std::to_string(ri.rword_box.width()) +
|
||||||
(ri.rword_likely_starts_idea ? "S" : "s") +
|
(ri.rword_likely_starts_idea ? "S" : "s") +
|
||||||
(ri.rword_likely_ends_idea ? "E" : "e") +
|
(ri.rword_likely_ends_idea ? "E" : "e") +
|
||||||
(ri.rword_indicates_list_item ? "L" : "l") + "]");
|
(ri.rword_indicates_list_item ? "L" : "l") + "]");
|
||||||
rows[i].AppendDebugInfo(theory, &row);
|
rows[i].AppendDebugInfo(theory, row);
|
||||||
row.push_back(RtlEmbed(ri.text, !ri.ltr));
|
row.push_back(RtlEmbed(ri.text, !ri.ltr));
|
||||||
}
|
}
|
||||||
PrintTable(output, " ");
|
PrintTable(output, " ");
|
||||||
@ -180,11 +172,11 @@ static void PrintDetectorState(const ParagraphTheory &theory,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void DebugDump(bool should_print, const STRING &phase, const ParagraphTheory &theory,
|
static void DebugDump(bool should_print, const char *phase, const ParagraphTheory &theory,
|
||||||
const GenericVector<RowScratchRegisters> &rows) {
|
const GenericVector<RowScratchRegisters> &rows) {
|
||||||
if (!should_print)
|
if (!should_print)
|
||||||
return;
|
return;
|
||||||
tprintf("# %s\n", phase.c_str());
|
tprintf("# %s\n", phase);
|
||||||
PrintDetectorState(theory, rows);
|
PrintDetectorState(theory, rows);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -240,7 +232,7 @@ static const char *SkipOne(const char *str, const char *toskip) {
|
|||||||
// Return whether it is very likely that this is a numeral marker that could
|
// Return whether it is very likely that this is a numeral marker that could
|
||||||
// start a list item. Some examples include:
|
// start a list item. Some examples include:
|
||||||
// A I iii. VI (2) 3.5. [C-4]
|
// A I iii. VI (2) 3.5. [C-4]
|
||||||
static bool LikelyListNumeral(const STRING &word) {
|
static bool LikelyListNumeral(const std::string &word) {
|
||||||
const char *kRomans = "ivxlmdIVXLMD";
|
const char *kRomans = "ivxlmdIVXLMD";
|
||||||
const char *kDigits = "012345789";
|
const char *kDigits = "012345789";
|
||||||
const char *kOpen = "[{(";
|
const char *kOpen = "[{(";
|
||||||
@ -274,12 +266,12 @@ static bool LikelyListNumeral(const STRING &word) {
|
|||||||
return *pos == '\0';
|
return *pos == '\0';
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool LikelyListMark(const STRING &word) {
|
static bool LikelyListMark(const std::string &word) {
|
||||||
const char *kListMarks = "0Oo*.,+.";
|
const char *kListMarks = "0Oo*.,+.";
|
||||||
return word.size() == 1 && strchr(kListMarks, word[0]) != nullptr;
|
return word.size() == 1 && strchr(kListMarks, word[0]) != nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool AsciiLikelyListItem(const STRING &word) {
|
bool AsciiLikelyListItem(const std::string &word) {
|
||||||
return LikelyListMark(word) || LikelyListNumeral(word);
|
return LikelyListMark(word) || LikelyListNumeral(word);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -348,7 +340,7 @@ int UnicodeSpanSkipper::SkipAlpha(int pos) {
|
|||||||
|
|
||||||
static bool LikelyListMarkUnicode(int ch) {
|
static bool LikelyListMarkUnicode(int ch) {
|
||||||
if (ch < 0x80) {
|
if (ch < 0x80) {
|
||||||
STRING single_ch;
|
std::string single_ch;
|
||||||
single_ch += ch;
|
single_ch += ch;
|
||||||
return LikelyListMark(single_ch);
|
return LikelyListMark(single_ch);
|
||||||
}
|
}
|
||||||
@ -413,7 +405,7 @@ static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
|
|||||||
// is_list - this word might be a list number or bullet.
|
// is_list - this word might be a list number or bullet.
|
||||||
// starts_idea - this word is likely to start a sentence.
|
// starts_idea - this word is likely to start a sentence.
|
||||||
// ends_idea - this word is likely to end a sentence.
|
// ends_idea - this word is likely to end a sentence.
|
||||||
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8,
|
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
|
||||||
bool *is_list, bool *starts_idea, bool *ends_idea) {
|
bool *is_list, bool *starts_idea, bool *ends_idea) {
|
||||||
*is_list = false;
|
*is_list = false;
|
||||||
*starts_idea = false;
|
*starts_idea = false;
|
||||||
@ -459,7 +451,7 @@ void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, c
|
|||||||
// is_list - this word might be a list number or bullet.
|
// is_list - this word might be a list number or bullet.
|
||||||
// starts_idea - this word is likely to start a sentence.
|
// starts_idea - this word is likely to start a sentence.
|
||||||
// ends_idea - this word is likely to end a sentence.
|
// ends_idea - this word is likely to end a sentence.
|
||||||
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8,
|
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
|
||||||
bool *is_list, bool *starts_idea, bool *ends_idea) {
|
bool *is_list, bool *starts_idea, bool *ends_idea) {
|
||||||
*is_list = false;
|
*is_list = false;
|
||||||
*starts_idea = false;
|
*starts_idea = false;
|
||||||
@ -492,17 +484,17 @@ void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
|
|||||||
|
|
||||||
// =============== Implementation of RowScratchRegisters =====================
|
// =============== Implementation of RowScratchRegisters =====================
|
||||||
/* static */
|
/* static */
|
||||||
void RowScratchRegisters::AppendDebugHeaderFields(std::vector<STRING> *header) {
|
void RowScratchRegisters::AppendDebugHeaderFields(std::vector<std::string> &header) {
|
||||||
header->push_back("[lmarg,lind;rind,rmarg]");
|
header.push_back("[lmarg,lind;rind,rmarg]");
|
||||||
header->push_back("model");
|
header.push_back("model");
|
||||||
}
|
}
|
||||||
|
|
||||||
void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,
|
void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,
|
||||||
std::vector<STRING> *dbg) const {
|
std::vector<std::string> &dbg) const {
|
||||||
char s[30];
|
char s[30];
|
||||||
snprintf(s, sizeof(s), "[%3d,%3d;%3d,%3d]", lmargin_, lindent_, rindent_, rmargin_);
|
snprintf(s, sizeof(s), "[%3d,%3d;%3d,%3d]", lmargin_, lindent_, rindent_, rmargin_);
|
||||||
dbg->push_back(s);
|
dbg.push_back(s);
|
||||||
STRING model_string;
|
std::string model_string;
|
||||||
model_string += static_cast<char>(GetLineType());
|
model_string += static_cast<char>(GetLineType());
|
||||||
model_string += ":";
|
model_string += ":";
|
||||||
|
|
||||||
@ -513,7 +505,7 @@ void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,
|
|||||||
if (model_numbers > 0)
|
if (model_numbers > 0)
|
||||||
model_string += ",";
|
model_string += ",";
|
||||||
if (StrongModel(hypotheses_[h].model)) {
|
if (StrongModel(hypotheses_[h].model)) {
|
||||||
model_string += StrOf(1 + theory.IndexOf(hypotheses_[h].model));
|
model_string += std::to_string(1 + theory.IndexOf(hypotheses_[h].model));
|
||||||
} else if (hypotheses_[h].model == kCrownLeft) {
|
} else if (hypotheses_[h].model == kCrownLeft) {
|
||||||
model_string += "CrL";
|
model_string += "CrL";
|
||||||
} else if (hypotheses_[h].model == kCrownRight) {
|
} else if (hypotheses_[h].model == kCrownRight) {
|
||||||
@ -524,7 +516,7 @@ void RowScratchRegisters::AppendDebugInfo(const ParagraphTheory &theory,
|
|||||||
if (model_numbers == 0)
|
if (model_numbers == 0)
|
||||||
model_string += "0";
|
model_string += "0";
|
||||||
|
|
||||||
dbg->push_back(model_string);
|
dbg.push_back(model_string);
|
||||||
}
|
}
|
||||||
|
|
||||||
void RowScratchRegisters::Init(const RowInfo &row) {
|
void RowScratchRegisters::Init(const RowInfo &row) {
|
||||||
@ -2323,7 +2315,7 @@ void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
|
|||||||
|
|
||||||
static void InitializeTextAndBoxesPreRecognition(const MutableIterator &it, RowInfo *info) {
|
static void InitializeTextAndBoxesPreRecognition(const MutableIterator &it, RowInfo *info) {
|
||||||
// Set up text, lword_text, and rword_text (mostly for debug printing).
|
// Set up text, lword_text, and rword_text (mostly for debug printing).
|
||||||
STRING fake_text;
|
std::string fake_text;
|
||||||
PageIterator pit(static_cast<const PageIterator &>(it));
|
PageIterator pit(static_cast<const PageIterator &>(it));
|
||||||
bool first_word = true;
|
bool first_word = true;
|
||||||
if (!pit.Empty(RIL_WORD)) {
|
if (!pit.Empty(RIL_WORD)) {
|
||||||
|
@ -21,8 +21,8 @@
|
|||||||
#define TESSERACT_CCMAIN_PARAGRAPHS_H_
|
#define TESSERACT_CCMAIN_PARAGRAPHS_H_
|
||||||
|
|
||||||
#include <list>
|
#include <list>
|
||||||
|
#include <string>
|
||||||
#include "rect.h" // for TBOX
|
#include "rect.h" // for TBOX
|
||||||
#include "strngs.h" // for STRING
|
|
||||||
|
|
||||||
namespace tesseract {
|
namespace tesseract {
|
||||||
|
|
||||||
@ -41,7 +41,7 @@ class GenericVector;
|
|||||||
class RowInfo {
|
class RowInfo {
|
||||||
public:
|
public:
|
||||||
// Constant data derived from Tesseract output.
|
// Constant data derived from Tesseract output.
|
||||||
STRING text; // the full UTF-8 text of the line.
|
std::string text; // the full UTF-8 text of the line.
|
||||||
bool ltr; // whether the majority of the text is left-to-right
|
bool ltr; // whether the majority of the text is left-to-right
|
||||||
// TODO(eger) make this more fine-grained.
|
// TODO(eger) make this more fine-grained.
|
||||||
|
|
||||||
@ -56,8 +56,8 @@ public:
|
|||||||
TBOX lword_box; // in normalized (horiz text rows) space
|
TBOX lword_box; // in normalized (horiz text rows) space
|
||||||
TBOX rword_box; // in normalized (horiz text rows) space
|
TBOX rword_box; // in normalized (horiz text rows) space
|
||||||
|
|
||||||
STRING lword_text; // the UTF-8 text of the leftmost werd
|
std::string lword_text; // the UTF-8 text of the leftmost werd
|
||||||
STRING rword_text; // the UTF-8 text of the rightmost werd
|
std::string rword_text; // the UTF-8 text of the rightmost werd
|
||||||
|
|
||||||
// The text of a paragraph typically starts with the start of an idea and
|
// The text of a paragraph typically starts with the start of an idea and
|
||||||
// ends with the end of an idea. Here we define paragraph as something that
|
// ends with the end of an idea. Here we define paragraph as something that
|
||||||
|
@ -32,7 +32,7 @@ class WERD_CHOICE;
|
|||||||
|
|
||||||
// Return whether the given word is likely to be a list item start word.
|
// Return whether the given word is likely to be a list item start word.
|
||||||
TESS_API
|
TESS_API
|
||||||
bool AsciiLikelyListItem(const STRING &word);
|
bool AsciiLikelyListItem(const std::string &word);
|
||||||
|
|
||||||
// Return the first Unicode Codepoint from werd[pos].
|
// Return the first Unicode Codepoint from werd[pos].
|
||||||
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
|
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
|
||||||
@ -40,12 +40,12 @@ int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);
|
|||||||
// Set right word attributes given either a unicharset and werd or a utf8
|
// Set right word attributes given either a unicharset and werd or a utf8
|
||||||
// string.
|
// string.
|
||||||
TESS_API
|
TESS_API
|
||||||
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8,
|
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
|
||||||
bool *is_list, bool *starts_idea, bool *ends_idea);
|
bool *is_list, bool *starts_idea, bool *ends_idea);
|
||||||
|
|
||||||
// Set left word attributes given either a unicharset and werd or a utf8 string.
|
// Set left word attributes given either a unicharset and werd or a utf8 string.
|
||||||
TESS_API
|
TESS_API
|
||||||
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8,
|
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
|
||||||
bool *is_list, bool *starts_idea, bool *ends_idea);
|
bool *is_list, bool *starts_idea, bool *ends_idea);
|
||||||
|
|
||||||
enum LineType {
|
enum LineType {
|
||||||
@ -171,10 +171,10 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Append header fields to a vector of row headings.
|
// Append header fields to a vector of row headings.
|
||||||
static void AppendDebugHeaderFields(std::vector<STRING> *header);
|
static void AppendDebugHeaderFields(std::vector<std::string> &header);
|
||||||
|
|
||||||
// Append data for this row to a vector of debug strings.
|
// Append data for this row to a vector of debug strings.
|
||||||
void AppendDebugInfo(const ParagraphTheory &theory, std::vector<STRING> *dbg) const;
|
void AppendDebugInfo(const ParagraphTheory &theory, std::vector<std::string> &dbg) const;
|
||||||
|
|
||||||
const RowInfo *ri_;
|
const RowInfo *ri_;
|
||||||
|
|
||||||
|
@ -151,11 +151,7 @@ std::string ParamContent::GetValue() const {
|
|||||||
} else if (param_type_ == VT_DOUBLE) {
|
} else if (param_type_ == VT_DOUBLE) {
|
||||||
result += std::to_string(*dIt);
|
result += std::to_string(*dIt);
|
||||||
} else if (param_type_ == VT_STRING) {
|
} else if (param_type_ == VT_STRING) {
|
||||||
if (STRING(*(sIt)).c_str() != nullptr) {
|
result = sIt->c_str();
|
||||||
result = sIt->c_str();
|
|
||||||
} else {
|
|
||||||
result = "Null";
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
@ -183,8 +179,8 @@ void ParamContent::SetValue(const char *val) {
|
|||||||
|
|
||||||
// Gets up to the first 3 prefixes from s (split by _).
|
// Gets up to the first 3 prefixes from s (split by _).
|
||||||
// For example, tesseract_foo_bar will be split into tesseract, foo and bar.
|
// For example, tesseract_foo_bar will be split into tesseract, foo and bar.
|
||||||
void ParamsEditor::GetPrefixes(const char *s, STRING *level_one, STRING *level_two,
|
void ParamsEditor::GetPrefixes(const char *s, std::string *level_one, std::string *level_two,
|
||||||
STRING *level_three) {
|
std::string *level_three) {
|
||||||
std::unique_ptr<char[]> p(new char[1024]);
|
std::unique_ptr<char[]> p(new char[1024]);
|
||||||
GetFirstWords(s, 1, p.get());
|
GetFirstWords(s, 1, p.get());
|
||||||
*level_one = p.get();
|
*level_one = p.get();
|
||||||
@ -234,9 +230,9 @@ SVMenuNode *ParamsEditor::BuildListOfAllLeaves(tesseract::Tesseract *tess) {
|
|||||||
// Count the # of entries starting with a specific prefix.
|
// Count the # of entries starting with a specific prefix.
|
||||||
for (vc_it.mark_cycle_pt(); !vc_it.cycled_list(); vc_it.forward()) {
|
for (vc_it.mark_cycle_pt(); !vc_it.cycled_list(); vc_it.forward()) {
|
||||||
ParamContent *vc = vc_it.data();
|
ParamContent *vc = vc_it.data();
|
||||||
STRING tag;
|
std::string tag;
|
||||||
STRING tag2;
|
std::string tag2;
|
||||||
STRING tag3;
|
std::string tag3;
|
||||||
|
|
||||||
GetPrefixes(vc->GetName(), &tag, &tag2, &tag3);
|
GetPrefixes(vc->GetName(), &tag, &tag2, &tag3);
|
||||||
amount[tag.c_str()]++;
|
amount[tag.c_str()]++;
|
||||||
@ -252,9 +248,9 @@ SVMenuNode *ParamsEditor::BuildListOfAllLeaves(tesseract::Tesseract *tess) {
|
|||||||
vc_it.move_to_first();
|
vc_it.move_to_first();
|
||||||
for (vc_it.mark_cycle_pt(); !vc_it.cycled_list(); vc_it.forward()) {
|
for (vc_it.mark_cycle_pt(); !vc_it.cycled_list(); vc_it.forward()) {
|
||||||
ParamContent *vc = vc_it.data();
|
ParamContent *vc = vc_it.data();
|
||||||
STRING tag;
|
std::string tag;
|
||||||
STRING tag2;
|
std::string tag2;
|
||||||
STRING tag3;
|
std::string tag3;
|
||||||
GetPrefixes(vc->GetName(), &tag, &tag2, &tag3);
|
GetPrefixes(vc->GetName(), &tag, &tag2, &tag3);
|
||||||
|
|
||||||
if (amount[tag.c_str()] == 1) {
|
if (amount[tag.c_str()] == 1) {
|
||||||
@ -304,7 +300,7 @@ ParamsEditor::ParamsEditor(tesseract::Tesseract *tess, ScrollView *sv) {
|
|||||||
|
|
||||||
SVMenuNode *svMenuRoot = BuildListOfAllLeaves(tess);
|
SVMenuNode *svMenuRoot = BuildListOfAllLeaves(tess);
|
||||||
|
|
||||||
STRING paramfile;
|
std::string paramfile;
|
||||||
paramfile = tess->datadir;
|
paramfile = tess->datadir;
|
||||||
paramfile += VARDIR; // parameters dir
|
paramfile += VARDIR; // parameters dir
|
||||||
paramfile += "edited"; // actual name
|
paramfile += "edited"; // actual name
|
||||||
|
@ -25,7 +25,6 @@
|
|||||||
|
|
||||||
# include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
|
# include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
|
||||||
# include "scrollview.h" // for ScrollView (ptr only), SVEvent (ptr only)
|
# include "scrollview.h" // for ScrollView (ptr only), SVEvent (ptr only)
|
||||||
# include "strngs.h" // for STRING
|
|
||||||
|
|
||||||
namespace tesseract {
|
namespace tesseract {
|
||||||
|
|
||||||
@ -107,7 +106,7 @@ public:
|
|||||||
private:
|
private:
|
||||||
// Gets up to the first 3 prefixes from s (split by _).
|
// Gets up to the first 3 prefixes from s (split by _).
|
||||||
// For example, tesseract_foo_bar will be split into tesseract, foo and bar.
|
// For example, tesseract_foo_bar will be split into tesseract, foo and bar.
|
||||||
void GetPrefixes(const char *s, STRING *level_one, STRING *level_two, STRING *level_three);
|
void GetPrefixes(const char *s, std::string *level_one, std::string *level_two, std::string *level_three);
|
||||||
|
|
||||||
// Gets the first n words (split by _) and puts them in t.
|
// Gets the first n words (split by _) and puts them in t.
|
||||||
// For example, tesseract_foo_bar with N=2 will yield tesseract_foo_.
|
// For example, tesseract_foo_bar with N=2 will yield tesseract_foo_.
|
||||||
|
@ -794,8 +794,8 @@ bool Tesseract::word_display(PAGE_RES_IT *pr_it) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Display correct text and blamer information.
|
// Display correct text and blamer information.
|
||||||
STRING text;
|
std::string text;
|
||||||
STRING blame;
|
std::string blame;
|
||||||
if (word->display_flag(DF_TEXT) && word->text() != nullptr) {
|
if (word->display_flag(DF_TEXT) && word->text() != nullptr) {
|
||||||
text = word->text();
|
text = word->text();
|
||||||
}
|
}
|
||||||
@ -810,7 +810,7 @@ bool Tesseract::word_display(PAGE_RES_IT *pr_it) {
|
|||||||
text = blamer_bundle->TruthString();
|
text = blamer_bundle->TruthString();
|
||||||
}
|
}
|
||||||
text += " -> ";
|
text += " -> ";
|
||||||
STRING best_choice_str;
|
std::string best_choice_str;
|
||||||
if (word_res->best_choice == nullptr) {
|
if (word_res->best_choice == nullptr) {
|
||||||
best_choice_str = "NULL";
|
best_choice_str = "NULL";
|
||||||
} else {
|
} else {
|
||||||
|
@ -98,7 +98,7 @@ void Tesseract::recog_training_segmented(const char *filename, PAGE_RES *page_re
|
|||||||
PAGE_RES_IT page_res_it;
|
PAGE_RES_IT page_res_it;
|
||||||
page_res_it.page_res = page_res;
|
page_res_it.page_res = page_res;
|
||||||
page_res_it.restart_page();
|
page_res_it.restart_page();
|
||||||
STRING label;
|
std::string label;
|
||||||
|
|
||||||
// Process all the words on this page.
|
// Process all the words on this page.
|
||||||
TBOX tbox; // tesseract-identified box
|
TBOX tbox; // tesseract-identified box
|
||||||
@ -108,14 +108,14 @@ void Tesseract::recog_training_segmented(const char *filename, PAGE_RES *page_re
|
|||||||
int examined_words = 0;
|
int examined_words = 0;
|
||||||
do {
|
do {
|
||||||
keep_going = read_t(&page_res_it, &tbox);
|
keep_going = read_t(&page_res_it, &tbox);
|
||||||
keep_going &= ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
|
keep_going &= ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
|
||||||
// Align bottom left points of the TBOXes.
|
// Align bottom left points of the TBOXes.
|
||||||
while (keep_going && !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
|
while (keep_going && !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
|
||||||
if (bbox.bottom() < tbox.bottom()) {
|
if (bbox.bottom() < tbox.bottom()) {
|
||||||
page_res_it.forward();
|
page_res_it.forward();
|
||||||
keep_going = read_t(&page_res_it, &tbox);
|
keep_going = read_t(&page_res_it, &tbox);
|
||||||
} else {
|
} else {
|
||||||
keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
|
keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
while (keep_going && !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
|
while (keep_going && !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
|
||||||
@ -123,7 +123,7 @@ void Tesseract::recog_training_segmented(const char *filename, PAGE_RES *page_re
|
|||||||
page_res_it.forward();
|
page_res_it.forward();
|
||||||
keep_going = read_t(&page_res_it, &tbox);
|
keep_going = read_t(&page_res_it, &tbox);
|
||||||
} else {
|
} else {
|
||||||
keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
|
keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// OCR the word if top right points of the TBOXes are similar.
|
// OCR the word if top right points of the TBOXes are similar.
|
||||||
|
@ -53,9 +53,6 @@ int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {
|
|||||||
|
|
||||||
namespace tesseract {
|
namespace tesseract {
|
||||||
|
|
||||||
CLISTIZEH(STRING)
|
|
||||||
CLISTIZE(STRING)
|
|
||||||
|
|
||||||
/*************************************************************************
|
/*************************************************************************
|
||||||
* set_done()
|
* set_done()
|
||||||
*
|
*
|
||||||
@ -196,7 +193,7 @@ void Tesseract::reject_I_1_L(WERD_RES *word) {
|
|||||||
|
|
||||||
for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
|
for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
|
||||||
offset += word->best_choice->unichar_lengths()[i], i += 1) {
|
offset += word->best_choice->unichar_lengths()[i], i += 1) {
|
||||||
if (STRING(conflict_set_I_l_1).contains(word->best_choice->unichar_string()[offset])) {
|
if (conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])) {
|
||||||
// rej 1Il conflict
|
// rej 1Il conflict
|
||||||
word->reject_map[i].setrej_1Il_conflict();
|
word->reject_map[i].setrej_1Il_conflict();
|
||||||
}
|
}
|
||||||
@ -316,7 +313,7 @@ bool Tesseract::one_ell_conflict(WERD_RES *word_res, bool update_map) {
|
|||||||
offset += lengths[i++])
|
offset += lengths[i++])
|
||||||
non_conflict_set_char = (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
|
non_conflict_set_char = (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
|
||||||
word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
|
word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
|
||||||
!STRING(conflict_set_I_l_1).contains(word[offset]);
|
!conflict_set_I_l_1.contains(word[offset]);
|
||||||
if (!non_conflict_set_char) {
|
if (!non_conflict_set_char) {
|
||||||
if (update_map)
|
if (update_map)
|
||||||
reject_I_1_L(word_res);
|
reject_I_1_L(word_res);
|
||||||
@ -409,7 +406,7 @@ bool Tesseract::one_ell_conflict(WERD_RES *word_res, bool update_map) {
|
|||||||
for (i = 0, offset = 0; word[offset] != '\0';
|
for (i = 0, offset = 0; word[offset] != '\0';
|
||||||
offset += word_res->best_choice->unichar_lengths()[i++]) {
|
offset += word_res->best_choice->unichar_lengths()[i++]) {
|
||||||
if ((!allow_1s || (word[offset] != '1')) &&
|
if ((!allow_1s || (word[offset] != '1')) &&
|
||||||
STRING(conflict_set_I_l_1).contains(word[offset])) {
|
conflict_set_I_l_1.contains(word[offset])) {
|
||||||
if (update_map)
|
if (update_map)
|
||||||
word_res->reject_map[i].setrej_1Il_conflict();
|
word_res->reject_map[i].setrej_1Il_conflict();
|
||||||
conflict = true;
|
conflict = true;
|
||||||
@ -425,7 +422,7 @@ bool Tesseract::one_ell_conflict(WERD_RES *word_res, bool update_map) {
|
|||||||
if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
|
if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
|
||||||
first_alphanum_index_ = first_alphanum_index(word, lengths);
|
first_alphanum_index_ = first_alphanum_index(word, lengths);
|
||||||
first_alphanum_offset_ = first_alphanum_offset(word, lengths);
|
first_alphanum_offset_ = first_alphanum_offset(word, lengths);
|
||||||
if (STRING(conflict_set_I_l_1).contains(word[first_alphanum_offset_])) {
|
if (conflict_set_I_l_1.contains(word[first_alphanum_offset_])) {
|
||||||
if (update_map)
|
if (update_map)
|
||||||
word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
|
word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
|
||||||
return true;
|
return true;
|
||||||
@ -502,7 +499,7 @@ void Tesseract::dont_allow_1Il(WERD_RES *word) {
|
|||||||
|
|
||||||
for (i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
|
for (i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
|
||||||
if (word->reject_map[i].accepted()) {
|
if (word->reject_map[i].accepted()) {
|
||||||
if (STRING(conflict_set_I_l_1).contains(s[offset])) {
|
if (conflict_set_I_l_1.contains(s[offset])) {
|
||||||
accepted_1Il = true;
|
accepted_1Il = true;
|
||||||
} else {
|
} else {
|
||||||
if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
|
if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
|
||||||
@ -515,7 +512,7 @@ void Tesseract::dont_allow_1Il(WERD_RES *word) {
|
|||||||
return; // Nothing to worry about
|
return; // Nothing to worry about
|
||||||
|
|
||||||
for (i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
|
for (i = 0, offset = 0; i < word_len; offset += word->best_choice->unichar_lengths()[i++]) {
|
||||||
if (STRING(conflict_set_I_l_1).contains(s[offset]) && word->reject_map[i].accepted())
|
if (conflict_set_I_l_1.contains(s[offset]) && word->reject_map[i].accepted())
|
||||||
word->reject_map[i].setrej_postNN_1Il();
|
word->reject_map[i].setrej_postNN_1Il();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -549,7 +546,7 @@ bool Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
|
|||||||
if (word->best_choice->unichar_lengths().length() <= 1)
|
if (word->best_choice->unichar_lengths().length() <= 1)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
if (!STRING(ok_repeated_ch_non_alphanum_wds).contains(word->best_choice->unichar_string()[0]))
|
if (!ok_repeated_ch_non_alphanum_wds.contains(word->best_choice->unichar_string()[0]))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
|
UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
|
||||||
|
@ -440,7 +440,7 @@ Dict &Tesseract::getDict() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void Tesseract::Clear() {
|
void Tesseract::Clear() {
|
||||||
STRING debug_name = imagebasename + "_debug.pdf";
|
std::string debug_name = imagebasename + "_debug.pdf";
|
||||||
pixa_debug_.WritePDF(debug_name.c_str());
|
pixa_debug_.WritePDF(debug_name.c_str());
|
||||||
pixDestroy(&pix_binary_);
|
pixDestroy(&pix_binary_);
|
||||||
pixDestroy(&pix_grey_);
|
pixDestroy(&pix_grey_);
|
||||||
|
@ -98,9 +98,9 @@ bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool
|
|||||||
int num_boxes = 0;
|
int num_boxes = 0;
|
||||||
for (int i = 0; i < lines.size(); ++i) {
|
for (int i = 0; i < lines.size(); ++i) {
|
||||||
int page = 0;
|
int page = 0;
|
||||||
STRING utf8_str;
|
std::string utf8_str;
|
||||||
TBOX box;
|
TBOX box;
|
||||||
if (!ParseBoxFileStr(lines[i].c_str(), &page, &utf8_str, &box)) {
|
if (!ParseBoxFileStr(lines[i].c_str(), &page, utf8_str, &box)) {
|
||||||
if (continue_on_failure)
|
if (continue_on_failure)
|
||||||
continue;
|
continue;
|
||||||
else
|
else
|
||||||
@ -137,14 +137,14 @@ bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool
|
|||||||
// for valid utf-8 and allows space or tab between fields.
|
// for valid utf-8 and allows space or tab between fields.
|
||||||
// utf8_str is set with the unichar string, and bounding box with the box.
|
// utf8_str is set with the unichar string, and bounding box with the box.
|
||||||
// If there are page numbers in the file, it reads them all.
|
// If there are page numbers in the file, it reads them all.
|
||||||
bool ReadNextBox(int *line_number, FILE *box_file, STRING *utf8_str, TBOX *bounding_box) {
|
bool ReadNextBox(int *line_number, FILE *box_file, std::string &utf8_str, TBOX *bounding_box) {
|
||||||
return ReadNextBox(-1, line_number, box_file, utf8_str, bounding_box);
|
return ReadNextBox(-1, line_number, box_file, utf8_str, bounding_box);
|
||||||
}
|
}
|
||||||
|
|
||||||
// As ReadNextBox above, but get a specific page number. (0-based)
|
// As ReadNextBox above, but get a specific page number. (0-based)
|
||||||
// Use -1 to read any page number. Files without page number all
|
// Use -1 to read any page number. Files without page number all
|
||||||
// read as if they are page 0.
|
// read as if they are page 0.
|
||||||
bool ReadNextBox(int target_page, int *line_number, FILE *box_file, STRING *utf8_str,
|
bool ReadNextBox(int target_page, int *line_number, FILE *box_file, std::string &utf8_str,
|
||||||
TBOX *bounding_box) {
|
TBOX *bounding_box) {
|
||||||
int page = 0;
|
int page = 0;
|
||||||
char buff[kBoxReadBufSize]; // boxfile read buffer
|
char buff[kBoxReadBufSize]; // boxfile read buffer
|
||||||
@ -185,10 +185,10 @@ bool ReadNextBox(int target_page, int *line_number, FILE *box_file, STRING *utf8
|
|||||||
// and for word/line-level boxes:
|
// and for word/line-level boxes:
|
||||||
// WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
|
// WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
|
||||||
// See applybox.cpp for more information.
|
// See applybox.cpp for more information.
|
||||||
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, STRING *utf8_str,
|
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, std::string &utf8_str,
|
||||||
TBOX *bounding_box) {
|
TBOX *bounding_box) {
|
||||||
*bounding_box = TBOX(); // Initialize it to empty.
|
*bounding_box = TBOX(); // Initialize it to empty.
|
||||||
*utf8_str = "";
|
utf8_str = "";
|
||||||
char uch[kBoxReadBufSize];
|
char uch[kBoxReadBufSize];
|
||||||
const char *buffptr = boxfile_str;
|
const char *buffptr = boxfile_str;
|
||||||
// Read the unichar without messing up on Tibetan.
|
// Read the unichar without messing up on Tibetan.
|
||||||
@ -245,7 +245,7 @@ bool ParseBoxFileStr(const char *boxfile_str, int *page_number, STRING *utf8_str
|
|||||||
}
|
}
|
||||||
used += new_used;
|
used += new_used;
|
||||||
}
|
}
|
||||||
*utf8_str = uch;
|
utf8_str = uch;
|
||||||
if (x_min > x_max)
|
if (x_min > x_max)
|
||||||
std::swap(x_min, x_max);
|
std::swap(x_min, x_max);
|
||||||
if (y_min > y_max)
|
if (y_min > y_max)
|
||||||
|
@ -64,18 +64,18 @@ bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool
|
|||||||
// utf8_str is set with the unichar string, and bounding box with the box.
|
// utf8_str is set with the unichar string, and bounding box with the box.
|
||||||
// If there are page numbers in the file, it reads them all.
|
// If there are page numbers in the file, it reads them all.
|
||||||
TESS_API
|
TESS_API
|
||||||
bool ReadNextBox(int *line_number, FILE *box_file, STRING *utf8_str, TBOX *bounding_box);
|
bool ReadNextBox(int *line_number, FILE *box_file, std::string &utf8_str, TBOX *bounding_box);
|
||||||
// As ReadNextBox above, but get a specific page number. (0-based)
|
// As ReadNextBox above, but get a specific page number. (0-based)
|
||||||
// Use -1 to read any page number. Files without page number all
|
// Use -1 to read any page number. Files without page number all
|
||||||
// read as if they are page 0.
|
// read as if they are page 0.
|
||||||
TESS_API
|
TESS_API
|
||||||
bool ReadNextBox(int target_page, int *line_number, FILE *box_file, STRING *utf8_str,
|
bool ReadNextBox(int target_page, int *line_number, FILE *box_file, std::string &utf8_str,
|
||||||
TBOX *bounding_box);
|
TBOX *bounding_box);
|
||||||
|
|
||||||
// Parses the given box file string into a page_number, utf8_str, and
|
// Parses the given box file string into a page_number, utf8_str, and
|
||||||
// bounding_box. Returns true on a successful parse.
|
// bounding_box. Returns true on a successful parse.
|
||||||
TESS_API
|
TESS_API
|
||||||
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, STRING *utf8_str,
|
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, std::string &utf8_str,
|
||||||
TBOX *bounding_box);
|
TBOX *bounding_box);
|
||||||
|
|
||||||
// Creates a box file string from a unichar string, TBOX and page number.
|
// Creates a box file string from a unichar string, TBOX and page number.
|
||||||
|
@ -91,7 +91,7 @@ public:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Fetches the value of the named param as a STRING. Returns false if not
|
// Fetches the value of the named param as a string. Returns false if not
|
||||||
// found.
|
// found.
|
||||||
static bool GetParamAsString(const char *name, const ParamsVectors *member_params,
|
static bool GetParamAsString(const char *name, const ParamsVectors *member_params,
|
||||||
std::string *value);
|
std::string *value);
|
||||||
@ -242,6 +242,9 @@ public:
|
|||||||
const char *c_str() const {
|
const char *c_str() const {
|
||||||
return value_.c_str();
|
return value_.c_str();
|
||||||
}
|
}
|
||||||
|
bool contains(char c) {
|
||||||
|
return value_.find(c) != std::string::npos;
|
||||||
|
}
|
||||||
bool empty() {
|
bool empty() {
|
||||||
return value_.length() <= 0;
|
return value_.length() <= 0;
|
||||||
}
|
}
|
||||||
|
@ -96,10 +96,6 @@ bool STRING::SkipDeSerialize(TFile *fp) {
|
|||||||
return fp->Skip(len);
|
return fp->Skip(len);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool STRING::contains(const char c) const {
|
|
||||||
return (c != '\0') && (strchr(c_str(), c) != nullptr);
|
|
||||||
}
|
|
||||||
|
|
||||||
void STRING::split(const char c, std::vector<STRING> *splited) {
|
void STRING::split(const char c, std::vector<STRING> *splited) {
|
||||||
int start_index = 0;
|
int start_index = 0;
|
||||||
const int len = length();
|
const int len = length();
|
||||||
|
@ -59,9 +59,6 @@ public:
|
|||||||
TESS_API
|
TESS_API
|
||||||
static bool SkipDeSerialize(tesseract::TFile *fp);
|
static bool SkipDeSerialize(tesseract::TFile *fp);
|
||||||
|
|
||||||
TESS_API
|
|
||||||
bool contains(char c) const;
|
|
||||||
|
|
||||||
TESS_API
|
TESS_API
|
||||||
void split(char c, std::vector<STRING> *splited);
|
void split(char c, std::vector<STRING> *splited);
|
||||||
};
|
};
|
||||||
|
@ -149,9 +149,9 @@ void MasterTrainer::ReadTrainingSamples(const char *page_name,
|
|||||||
if (font_id < 0)
|
if (font_id < 0)
|
||||||
font_id = 0;
|
font_id = 0;
|
||||||
int page_number;
|
int page_number;
|
||||||
STRING unichar;
|
std::string unichar;
|
||||||
TBOX bounding_box;
|
TBOX bounding_box;
|
||||||
if (!ParseBoxFileStr(space, &page_number, &unichar, &bounding_box)) {
|
if (!ParseBoxFileStr(space, &page_number, unichar, &bounding_box)) {
|
||||||
tprintf("Bad format in tr file, reading box coords\n");
|
tprintf("Bad format in tr file, reading box coords\n");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user