mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-30 23:49:05 +08:00
preserve interword spaces patch - Issue 1409
This commit is contained in:
parent
e0441d0c6b
commit
36883b4faf
@ -34,6 +34,12 @@ ResultIterator::ResultIterator(const LTRResultIterator &resit)
|
|||||||
: LTRResultIterator(resit) {
|
: LTRResultIterator(resit) {
|
||||||
in_minor_direction_ = false;
|
in_minor_direction_ = false;
|
||||||
at_beginning_of_minor_run_ = false;
|
at_beginning_of_minor_run_ = false;
|
||||||
|
|
||||||
|
BoolParam *p = ParamUtils::FindParam<BoolParam>(
|
||||||
|
"preserve_interword_spaces", GlobalParams()->bool_params,
|
||||||
|
tesseract_->params()->bool_params);
|
||||||
|
if (p != NULL) preserve_interword_spaces_ = (bool)(*p);
|
||||||
|
|
||||||
current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
|
current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
|
||||||
MoveToLogicalStartOfTextline();
|
MoveToLogicalStartOfTextline();
|
||||||
}
|
}
|
||||||
@ -629,14 +635,16 @@ void ResultIterator::IterateAndAppendUTF8TextlineText(STRING *text) {
|
|||||||
|
|
||||||
int words_appended = 0;
|
int words_appended = 0;
|
||||||
do {
|
do {
|
||||||
|
int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space() : 1;
|
||||||
|
for(int i = 0 ; i < numSpaces ; ++i) {
|
||||||
|
*text += " ";
|
||||||
|
}
|
||||||
AppendUTF8WordText(text);
|
AppendUTF8WordText(text);
|
||||||
words_appended++;
|
words_appended++;
|
||||||
*text += " ";
|
|
||||||
} while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE));
|
} while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE));
|
||||||
if (BidiDebug(1)) {
|
if (BidiDebug(1)) {
|
||||||
tprintf("%d words printed\n", words_appended);
|
tprintf("%d words printed\n", words_appended);
|
||||||
}
|
}
|
||||||
text->truncate_at(text->length() - 1);
|
|
||||||
*text += line_separator_;
|
*text += line_separator_;
|
||||||
// If we just finished a paragraph, add an extra newline.
|
// If we just finished a paragraph, add an extra newline.
|
||||||
if (it_->block() == NULL || IsAtBeginningOf(RIL_PARA))
|
if (it_->block() == NULL || IsAtBeginningOf(RIL_PARA))
|
||||||
|
@ -46,8 +46,8 @@ class TESS_API ResultIterator : public LTRResultIterator {
|
|||||||
virtual ~ResultIterator() {}
|
virtual ~ResultIterator() {}
|
||||||
|
|
||||||
// ============= Moving around within the page ============.
|
// ============= Moving around within the page ============.
|
||||||
/**
|
/**
|
||||||
* Moves the iterator to point to the start of the page to begin
|
* Moves the iterator to point to the start of the page to begin
|
||||||
* an iteration.
|
* an iteration.
|
||||||
*/
|
*/
|
||||||
virtual void Begin();
|
virtual void Begin();
|
||||||
@ -181,7 +181,7 @@ class TESS_API ResultIterator : public LTRResultIterator {
|
|||||||
void MoveToLogicalStartOfTextline();
|
void MoveToLogicalStartOfTextline();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Precondition: current_paragraph_is_ltr_ and in_minor_direction_
|
* Precondition: current_paragraph_is_ltr_ and in_minor_direction_
|
||||||
* are set.
|
* are set.
|
||||||
*/
|
*/
|
||||||
void MoveToLogicalStartOfWord();
|
void MoveToLogicalStartOfWord();
|
||||||
@ -231,6 +231,12 @@ class TESS_API ResultIterator : public LTRResultIterator {
|
|||||||
|
|
||||||
/** Is the currently pointed-at character in a minor-direction sequence? */
|
/** Is the currently pointed-at character in a minor-direction sequence? */
|
||||||
bool in_minor_direction_;
|
bool in_minor_direction_;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should detected inter-word spaces be preserved, or "compressed" to a single
|
||||||
|
* space character (default behavior).
|
||||||
|
*/
|
||||||
|
bool preserve_interword_spaces_ = false;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace tesseract.
|
} // namespace tesseract.
|
||||||
|
@ -440,6 +440,8 @@ Tesseract::Tesseract()
|
|||||||
this->params()),
|
this->params()),
|
||||||
INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible",
|
INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible",
|
||||||
this->params()),
|
this->params()),
|
||||||
|
BOOL_MEMBER(preserve_interword_spaces, false,
|
||||||
|
"Preserve multiple interword spaces", this->params()),
|
||||||
|
|
||||||
// The following parameters were deprecated and removed from their original
|
// The following parameters were deprecated and removed from their original
|
||||||
// locations. The parameters are temporarily kept here to give Tesseract
|
// locations. The parameters are temporarily kept here to give Tesseract
|
||||||
|
@ -1009,6 +1009,7 @@ class Tesseract : public Wordrec {
|
|||||||
double_VAR_H(textord_tabfind_aligned_gap_fraction, 0.75,
|
double_VAR_H(textord_tabfind_aligned_gap_fraction, 0.75,
|
||||||
"Fraction of height used as a minimum gap for aligned blobs.");
|
"Fraction of height used as a minimum gap for aligned blobs.");
|
||||||
INT_VAR_H(tessedit_parallelize, 0, "Run in parallel where possible");
|
INT_VAR_H(tessedit_parallelize, 0, "Run in parallel where possible");
|
||||||
|
BOOL_VAR_H(preserve_interword_spaces, false, "Preserve multiple interword spaces");
|
||||||
|
|
||||||
// The following parameters were deprecated and removed from their original
|
// The following parameters were deprecated and removed from their original
|
||||||
// locations. The parameters are temporarily kept here to give Tesseract
|
// locations. The parameters are temporarily kept here to give Tesseract
|
||||||
|
Loading…
Reference in New Issue
Block a user