preserve interword spaces patch - Issue 1409

This commit is contained in:
Zdenko Podobný 2015-01-27 22:58:04 +01:00
parent e0441d0c6b
commit 36883b4faf
4 changed files with 22 additions and 5 deletions

View File

@@ -34,6 +34,12 @@ ResultIterator::ResultIterator(const LTRResultIterator &resit)
: LTRResultIterator(resit) { : LTRResultIterator(resit) {
in_minor_direction_ = false; in_minor_direction_ = false;
at_beginning_of_minor_run_ = false; at_beginning_of_minor_run_ = false;
BoolParam *p = ParamUtils::FindParam<BoolParam>(
"preserve_interword_spaces", GlobalParams()->bool_params,
tesseract_->params()->bool_params);
if (p != NULL) preserve_interword_spaces_ = (bool)(*p);
current_paragraph_is_ltr_ = CurrentParagraphIsLtr(); current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
MoveToLogicalStartOfTextline(); MoveToLogicalStartOfTextline();
} }
@@ -629,14 +635,16 @@ void ResultIterator::IterateAndAppendUTF8TextlineText(STRING *text) {
int words_appended = 0; int words_appended = 0;
do { do {
int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space() : 1;
for(int i = 0 ; i < numSpaces ; ++i) {
*text += " ";
}
AppendUTF8WordText(text); AppendUTF8WordText(text);
words_appended++; words_appended++;
*text += " ";
} while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE)); } while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE));
if (BidiDebug(1)) { if (BidiDebug(1)) {
tprintf("%d words printed\n", words_appended); tprintf("%d words printed\n", words_appended);
} }
text->truncate_at(text->length() - 1);
*text += line_separator_; *text += line_separator_;
// If we just finished a paragraph, add an extra newline. // If we just finished a paragraph, add an extra newline.
if (it_->block() == NULL || IsAtBeginningOf(RIL_PARA)) if (it_->block() == NULL || IsAtBeginningOf(RIL_PARA))

View File

@@ -46,8 +46,8 @@ class TESS_API ResultIterator : public LTRResultIterator {
virtual ~ResultIterator() {} virtual ~ResultIterator() {}
// ============= Moving around within the page ============. // ============= Moving around within the page ============.
/** /**
* Moves the iterator to point to the start of the page to begin * Moves the iterator to point to the start of the page to begin
* an iteration. * an iteration.
*/ */
virtual void Begin(); virtual void Begin();
@@ -181,7 +181,7 @@ class TESS_API ResultIterator : public LTRResultIterator {
void MoveToLogicalStartOfTextline(); void MoveToLogicalStartOfTextline();
/** /**
* Precondition: current_paragraph_is_ltr_ and in_minor_direction_ * Precondition: current_paragraph_is_ltr_ and in_minor_direction_
* are set. * are set.
*/ */
void MoveToLogicalStartOfWord(); void MoveToLogicalStartOfWord();
@@ -231,6 +231,12 @@ class TESS_API ResultIterator : public LTRResultIterator {
/** Is the currently pointed-at character in a minor-direction sequence? */ /** Is the currently pointed-at character in a minor-direction sequence? */
bool in_minor_direction_; bool in_minor_direction_;
/**
* Should detected inter-word spaces be preserved, or "compressed" to a single
* space character (default behavior).
*/
bool preserve_interword_spaces_ = false;
}; };
} // namespace tesseract. } // namespace tesseract.

View File

@@ -440,6 +440,8 @@ Tesseract::Tesseract()
this->params()), this->params()),
INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible", INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible",
this->params()), this->params()),
BOOL_MEMBER(preserve_interword_spaces, false,
"Preserve multiple interword spaces", this->params()),
// The following parameters were deprecated and removed from their original // The following parameters were deprecated and removed from their original
// locations. The parameters are temporarily kept here to give Tesseract // locations. The parameters are temporarily kept here to give Tesseract

View File

@@ -1009,6 +1009,7 @@ class Tesseract : public Wordrec {
double_VAR_H(textord_tabfind_aligned_gap_fraction, 0.75, double_VAR_H(textord_tabfind_aligned_gap_fraction, 0.75,
"Fraction of height used as a minimum gap for aligned blobs."); "Fraction of height used as a minimum gap for aligned blobs.");
INT_VAR_H(tessedit_parallelize, 0, "Run in parallel where possible"); INT_VAR_H(tessedit_parallelize, 0, "Run in parallel where possible");
BOOL_VAR_H(preserve_interword_spaces, false, "Preserve multiple interword spaces");
// The following parameters were deprecated and removed from their original // The following parameters were deprecated and removed from their original
// locations. The parameters are temporarily kept here to give Tesseract // locations. The parameters are temporarily kept here to give Tesseract