mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-10 20:23:12 +08:00
Fixed bidi handling in PDF output
This commit is contained in:
parent
f927728169
commit
d9699c4099
@ -60,74 +60,22 @@ long dist2(int x1, int y1, int x2, int y2) {
|
|||||||
return (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1);
|
return (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1);
|
||||||
}
|
}
|
||||||
|
|
||||||
char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
|
// Viewers like evince can get really confused during copy-paste when
|
||||||
double width, double height) {
|
// the baseline wanders around. So I've decided to project every word
|
||||||
double ppi = api->GetSourceYResolution();
|
// onto the (straight) line baseline. All numbers are in the native
|
||||||
STRING pdf_str("");
|
// PDF coordinate system, which has the origin in the bottom left and
|
||||||
double old_x = 0.0, old_y = 0.0;
|
// the unit is points, which is 1/72 inch. Tesseract reports baselines
|
||||||
int old_pointsize = 0;
|
// left-to-right no matter what the reading order is. We need the
|
||||||
|
// word baseline in reading order, so we do that conversion here. Returns
|
||||||
// TODO(jbreiden) Slightly cleaner from an abstraction standpoint
|
// the word's baseline origin and length.
|
||||||
// if this were to live inside a separate text object.
|
void GetWordBaseline(int writing_direction, int ppi, int height,
|
||||||
pdf_str += "q ";
|
int word_x1, int word_y1, int word_x2, int word_y2,
|
||||||
pdf_str.add_str_double("", prec(width));
|
int line_x1, int line_y1, int line_x2, int line_y2,
|
||||||
pdf_str += " 0 0 ";
|
double *x0, double *y0, double *length) {
|
||||||
pdf_str.add_str_double("", prec(height));
|
|
||||||
pdf_str += " 0 0 cm /Im1 Do Q\n";
|
|
||||||
|
|
||||||
ResultIterator *res_it = api->GetIterator();
|
|
||||||
|
|
||||||
while (!res_it->Empty(RIL_BLOCK)) {
|
|
||||||
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
|
|
||||||
pdf_str += "BT\n3 Tr\n"; // Begin text object, use invisible ink
|
|
||||||
old_pointsize = 0.0; // Every block will declare its font
|
|
||||||
}
|
|
||||||
|
|
||||||
int line_x1, line_y1, line_x2, line_y2;
|
|
||||||
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
|
||||||
res_it->Baseline(RIL_TEXTLINE,
|
|
||||||
&line_x1, &line_y1, &line_x2, &line_y2);
|
|
||||||
double rise = abs(line_y2 - line_y1) * 72 / ppi;
|
|
||||||
double run = abs(line_x2 - line_x1) * 72 / ppi;
|
|
||||||
// There are some really stupid PDF viewers in the wild, such as
|
|
||||||
// 'Preview' which ships with the Mac. They might do a better
|
|
||||||
// job with text selection and highlighting when given perfectly
|
|
||||||
// straight text instead of very slightly tilted text. I chose
|
|
||||||
// this threshold large enough to absorb noise, but small enough
|
|
||||||
// that lines probably won't cross each other if the whole page
|
|
||||||
// is tilted at almost exactly the clipping threshold.
|
|
||||||
if (rise < 2.0 && 2.0 < run)
|
|
||||||
line_y1 = line_y2 = (line_y1 + line_y2) / 2;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (res_it->Empty(RIL_WORD)) {
|
|
||||||
res_it->Next(RIL_WORD);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
int word_x1, word_y1, word_x2, word_y2;
|
|
||||||
res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
|
|
||||||
|
|
||||||
// The critical one is writing_direction
|
|
||||||
tesseract::Orientation orientation;
|
|
||||||
tesseract::WritingDirection writing_direction;
|
|
||||||
tesseract::TextlineOrder textline_order;
|
|
||||||
float deskew_angle;
|
|
||||||
res_it->Orientation(&orientation, &writing_direction,
|
|
||||||
&textline_order, &deskew_angle);
|
|
||||||
|
|
||||||
// Unlike Tesseract, we always want the word baseline in reading order.
|
|
||||||
if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) {
|
if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) {
|
||||||
Swap(&word_x1, &word_x2);
|
Swap(&word_x1, &word_x2);
|
||||||
Swap(&word_y1, &word_y2);
|
Swap(&word_y1, &word_y2);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Viewers like evince can get really confused during copy-paste
|
|
||||||
// when the baseline wanders around. I've decided to force every
|
|
||||||
// word to match the (straight) baseline. The math below is just
|
|
||||||
// projecting the word origin onto the baseline. All numbers are
|
|
||||||
// in the native PDF coordinate system, which has the origin in
|
|
||||||
// the bottom left and the unit is points, which is 1/72 inch.
|
|
||||||
double word_length;
|
double word_length;
|
||||||
double x, y;
|
double x, y;
|
||||||
{
|
{
|
||||||
@ -149,76 +97,179 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
|
|||||||
x = x * 72 / ppi;
|
x = x * 72 / ppi;
|
||||||
y = height - (y * 72.0 / ppi);
|
y = height - (y * 72.0 / ppi);
|
||||||
}
|
}
|
||||||
|
*x0 = x;
|
||||||
|
*y0 = y;
|
||||||
|
*length = word_length;
|
||||||
|
}
|
||||||
|
|
||||||
int pointsize = 0;
|
// Compute coefficients for an affine matrix describing the rotation
|
||||||
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
// of the text. If the text is right-to-left such as Arabic or Hebrew,
|
||||||
// Calculate the rotation angle in the PDF coordinate system,
|
// we reflect over the Y-axis. This matrix will set the coordinate
|
||||||
// which has the origin in the bottom left. The Tesseract
|
// system for placing text in the PDF file.
|
||||||
// coordinate system has the origin in the upper left.
|
|
||||||
//
|
|
||||||
// PDF is kind of like turtle graphics, and we orient the
|
|
||||||
// turtle (errr... initial cursor position) with an affine
|
|
||||||
// transformation.
|
|
||||||
//
|
|
||||||
// Rotate RTL Translate
|
|
||||||
//
|
|
||||||
// [ x' y' 1 ] = [ x y 1 ] [ cos𝜃 -sin𝜃 0 ] [ -1 0 0 ] [ 1 0 0 ]
|
|
||||||
// [ sin𝜃 cos𝜃 0 ] [ 0 1 0 ] [ 0 1 0 ]
|
|
||||||
// [ 0 0 1 ] [ 0 0 1 ] [ x y 1 ]
|
|
||||||
//
|
//
|
||||||
|
// RTL
|
||||||
|
// [ x' ] = [ a b ][ x ] = [-1 0 ] [ cos sin ][ x ]
|
||||||
|
// [ y' ] [ c d ][ y ] [ 0 1 ] [-sin cos ][ y ]
|
||||||
|
void AffineMatrix(int writing_direction,
|
||||||
|
int line_x1, int line_y1, int line_x2, int line_y2,
|
||||||
|
double *a, double *b, double *c, double *d) {
|
||||||
double theta = atan2(static_cast<double>(line_y1 - line_y2),
|
double theta = atan2(static_cast<double>(line_y1 - line_y2),
|
||||||
static_cast<double>(line_x2 - line_x1));
|
static_cast<double>(line_x2 - line_x1));
|
||||||
double a, b, c, d;
|
*a = cos(theta);
|
||||||
a = cos(theta);
|
*b = sin(theta);
|
||||||
b = sin(theta);
|
*c = -sin(theta);
|
||||||
c = -sin(theta);
|
*d = cos(theta);
|
||||||
d = cos(theta);
|
|
||||||
switch(writing_direction) {
|
switch(writing_direction) {
|
||||||
case WRITING_DIRECTION_RIGHT_TO_LEFT:
|
case WRITING_DIRECTION_RIGHT_TO_LEFT:
|
||||||
a = -a;
|
*a = -*a;
|
||||||
b = -b;
|
*b = -*b;
|
||||||
c = -c;
|
|
||||||
break;
|
break;
|
||||||
case WRITING_DIRECTION_TOP_TO_BOTTOM:
|
case WRITING_DIRECTION_TOP_TO_BOTTOM:
|
||||||
// TODO(jbreiden) Consider switching PDF writing mode to vertical.
|
// TODO(jbreiden) Consider using the vertical PDF writing mode.
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Some PDF viewers in the wild, such as 'Preview' which ships with the
// Mac, do a better job with text selection and highlighting when given a
// perfectly flat baseline instead of a very slightly tilted one. We clip
// small tilts to appease these viewers. The threshold is large enough to
// absorb noise, but small enough that lines probably won't cross each
// other if the whole page is tilted at almost exactly the clipping
// threshold.
//
// Copies the baseline (x1, y1)-(x2, y2) into the line_* outputs. If the
// baseline rises less than 2 points while running more than 2 points,
// both output y values are replaced by the integer midpoint (y1 + y2) / 2.
// ppi is used to convert the coordinate differences to points (72 per
// inch).
void ClipBaseline(int ppi, int x1, int y1, int x2, int y2,
                  int *line_x1, int *line_y1,
                  int *line_x2, int *line_y2) {
  *line_x1 = x1;
  *line_y1 = y1;
  *line_x2 = x2;
  *line_y2 = y2;
  // Compare against 2 * ppi instead of dividing by ppi. Dividing two ints
  // truncated rise and run before the comparison (a run of 2.5 points was
  // treated as 2 and never clipped) and divided by zero when ppi was 0.
  const double rise = abs(y2 - y1) * 72.0;
  const double run = abs(x2 - x1) * 72.0;
  const double threshold = 2.0 * ppi;
  if (rise < threshold && threshold < run)
    *line_y1 = *line_y2 = (y1 + y2) / 2;
}
|
||||||
|
|
||||||
|
char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
|
||||||
|
double width, double height) {
|
||||||
|
STRING pdf_str("");
|
||||||
|
double ppi = api->GetSourceYResolution();
|
||||||
|
|
||||||
|
// These initial conditions are all arbitrary and will be overwritten
|
||||||
|
double old_x = 0.0, old_y = 0.0;
|
||||||
|
int old_fontsize = 0;
|
||||||
|
tesseract::WritingDirection old_writing_direction =
|
||||||
|
WRITING_DIRECTION_LEFT_TO_RIGHT;
|
||||||
|
bool new_block = true;
|
||||||
|
int fontsize = 0;
|
||||||
|
double a = 1;
|
||||||
|
double b = 0;
|
||||||
|
double c = 0;
|
||||||
|
double d = 1;
|
||||||
|
|
||||||
|
// TODO(jbreiden) This marries the text and image together.
|
||||||
|
// Slightly cleaner from an abstraction standpoint if this were to
|
||||||
|
// live inside a separate text object.
|
||||||
|
pdf_str += "q ";
|
||||||
|
pdf_str.add_str_double("", prec(width));
|
||||||
|
pdf_str += " 0 0 ";
|
||||||
|
pdf_str.add_str_double("", prec(height));
|
||||||
|
pdf_str += " 0 0 cm /Im1 Do Q\n";
|
||||||
|
|
||||||
|
ResultIterator *res_it = api->GetIterator();
|
||||||
|
while (!res_it->Empty(RIL_BLOCK)) {
|
||||||
|
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
|
||||||
|
pdf_str += "BT\n3 Tr"; // Begin text object, use invisible ink
|
||||||
|
old_fontsize = 0; // Every block will declare its fontsize
|
||||||
|
new_block = true; // Every block will declare its affine matrix
|
||||||
|
}
|
||||||
|
|
||||||
|
int line_x1, line_y1, line_x2, line_y2;
|
||||||
|
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
||||||
|
int x1, y1, x2, y2;
|
||||||
|
res_it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
|
||||||
|
ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (res_it->Empty(RIL_WORD)) {
|
||||||
|
res_it->Next(RIL_WORD);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Writing direction changes at a per-word granularity
|
||||||
|
tesseract::WritingDirection writing_direction;
|
||||||
|
{
|
||||||
|
tesseract::Orientation orientation;
|
||||||
|
tesseract::TextlineOrder textline_order;
|
||||||
|
float deskew_angle;
|
||||||
|
res_it->Orientation(&orientation, &writing_direction,
|
||||||
|
&textline_order, &deskew_angle);
|
||||||
|
if (writing_direction != WRITING_DIRECTION_TOP_TO_BOTTOM) {
|
||||||
|
switch (res_it->WordDirection()) {
|
||||||
|
case DIR_LEFT_TO_RIGHT:
|
||||||
|
writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
|
||||||
|
break;
|
||||||
|
case DIR_RIGHT_TO_LEFT:
|
||||||
|
writing_direction = WRITING_DIRECTION_RIGHT_TO_LEFT;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
writing_direction = old_writing_direction;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Where is word origin and how long is it?
|
||||||
|
double x, y, word_length;
|
||||||
|
{
|
||||||
|
int word_x1, word_y1, word_x2, word_y2;
|
||||||
|
res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
|
||||||
|
GetWordBaseline(writing_direction, ppi, height,
|
||||||
|
word_x1, word_y1, word_x2, word_y2,
|
||||||
|
line_x1, line_y1, line_x2, line_y2,
|
||||||
|
&x, &y, &word_length);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (writing_direction != old_writing_direction || new_block) {
|
||||||
|
AffineMatrix(writing_direction,
|
||||||
|
line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
|
||||||
pdf_str.add_str_double(" ", prec(a)); // . This affine matrix
|
pdf_str.add_str_double(" ", prec(a)); // . This affine matrix
|
||||||
pdf_str.add_str_double(" ", prec(b)); // . sets the coordinate
|
pdf_str.add_str_double(" ", prec(b)); // . sets the coordinate
|
||||||
pdf_str.add_str_double(" ", prec(c)); // . system for all
|
pdf_str.add_str_double(" ", prec(c)); // . system for all
|
||||||
pdf_str.add_str_double(" ", prec(d)); // . text in the entire
|
pdf_str.add_str_double(" ", prec(d)); // . text that follows.
|
||||||
pdf_str.add_str_double(" ", prec(x)); // . line.
|
pdf_str.add_str_double(" ", prec(x)); // .
|
||||||
pdf_str.add_str_double(" ", prec(y)); // .
|
pdf_str.add_str_double(" ", prec(y)); // .
|
||||||
pdf_str += (" Tm "); // Place cursor absolutely
|
pdf_str += (" Tm "); // Place cursor absolutely
|
||||||
|
new_block = false;
|
||||||
} else {
|
} else {
|
||||||
double offset = sqrt(static_cast<double>(dist2(old_x, old_y, x, y)));
|
double dx = x - old_x;
|
||||||
pdf_str.add_str_double(" ", prec(offset)); // Delta x in pts
|
double dy = y - old_y;
|
||||||
pdf_str.add_str_double(" ", 0); // Delta y in pts
|
pdf_str.add_str_double(" ", prec(dx * a + dy * b));
|
||||||
|
pdf_str.add_str_double(" ", prec(dx * c + dy * d));
|
||||||
pdf_str += (" Td "); // Relative moveto
|
pdf_str += (" Td "); // Relative moveto
|
||||||
}
|
}
|
||||||
old_x = x;
|
old_x = x;
|
||||||
old_y = y;
|
old_y = y;
|
||||||
|
old_writing_direction = writing_direction;
|
||||||
|
|
||||||
// Adjust font size on a per word granularity. Pay attention to
|
// Adjust font size on a per word granularity. Pay attention to
|
||||||
// pointsize, old_pointsize, and pdf_str. We've found that for
|
// fontsize, old_fontsize, and pdf_str. We've found that for
|
||||||
// Arabic, Tesseract will happily return a pointsize of zero,
|
// Arabic, Tesseract will happily return a fontsize of zero,
|
||||||
// so we make up a default number to protect ourselves.
|
// so we make up a default number to protect ourselves.
|
||||||
{
|
{
|
||||||
bool bold, italic, underlined, monospace, serif, smallcaps;
|
bool bold, italic, underlined, monospace, serif, smallcaps;
|
||||||
int font_id;
|
int font_id;
|
||||||
res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
|
res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
|
||||||
&serif, &smallcaps, &pointsize, &font_id);
|
&serif, &smallcaps, &fontsize, &font_id);
|
||||||
const int kDefaultPointSize = 8;
|
const int kDefaultFontsize = 8;
|
||||||
if (pointsize <= 0)
|
if (fontsize <= 0)
|
||||||
pointsize = kDefaultPointSize;
|
fontsize = kDefaultFontsize;
|
||||||
if (pointsize != old_pointsize) {
|
if (fontsize != old_fontsize) {
|
||||||
char textfont[20];
|
char textfont[20];
|
||||||
snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", pointsize);
|
snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", fontsize);
|
||||||
pdf_str += textfont;
|
pdf_str += textfont;
|
||||||
old_pointsize = pointsize;
|
old_fontsize = fontsize;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -243,9 +294,9 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
|
|||||||
delete []grapheme;
|
delete []grapheme;
|
||||||
res_it->Next(RIL_SYMBOL);
|
res_it->Next(RIL_SYMBOL);
|
||||||
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
|
||||||
if (word_length > 0 && pdf_word_len > 0 && pointsize > 0) {
|
if (word_length > 0 && pdf_word_len > 0 && fontsize > 0) {
|
||||||
double h_stretch =
|
double h_stretch =
|
||||||
kCharWidth * prec(100.0 * word_length / (pointsize * pdf_word_len));
|
kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
|
||||||
pdf_str.add_str_double("", h_stretch);
|
pdf_str.add_str_double("", h_stretch);
|
||||||
pdf_str += " Tz"; // horizontal stretch
|
pdf_str += " Tz"; // horizontal stretch
|
||||||
pdf_str += " [ ";
|
pdf_str += " [ ";
|
||||||
@ -449,7 +500,7 @@ bool TessPDFRenderer::imageToPDFObj(Pix *pix,
|
|||||||
|
|
||||||
L_COMP_DATA *cid = NULL;
|
L_COMP_DATA *cid = NULL;
|
||||||
const int kJpegQuality = 85;
|
const int kJpegQuality = 85;
|
||||||
l_generateCIDataForPdf(filename, pix, kJpegQuality, &cid);
|
|
||||||
// TODO(jbreiden) Leptonica 1.71 doesn't correctly handle certain
|
// TODO(jbreiden) Leptonica 1.71 doesn't correctly handle certain
|
||||||
// types of PNG files, especially if there are 2 samples per pixel.
|
// types of PNG files, especially if there are 2 samples per pixel.
|
||||||
// We can get rid of this logic after Leptonica 1.72 is released and
|
// We can get rid of this logic after Leptonica 1.72 is released and
|
||||||
@ -747,5 +798,4 @@ bool TessPDFRenderer::EndDocumentHandler() {
|
|||||||
AppendString(buf);
|
AppendString(buf);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace tesseract
|
} // namespace tesseract
|
||||||
|
BIN
tessdata/pdf.ttf
BIN
tessdata/pdf.ttf
Binary file not shown.
1928
tessdata/pdf.ttx
1928
tessdata/pdf.ttx
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user