// Include automatically generated configuration file if running autoconf. #ifdef HAVE_CONFIG_H #include "config_auto.h" #endif #include "baseapi.h" #include "renderer.h" #include "math.h" #include "strngs.h" #include "cube_utils.h" #include "allheaders.h" #if !defined(VERSION) #include "version.h" #endif #ifdef _MSC_VER #include "mathfix.h" #endif namespace tesseract { // Use for PDF object fragments. Must be large enough // to hold a colormap with 256 colors in the verbose // PDF representation. const int kBasicBufSize = 2048; /********************************************************************** * PDF Renderer interface implementation **********************************************************************/ TessPDFRenderer::TessPDFRenderer(const char *datadir) : TessResultRenderer("PDF", "pdf") { obj_ = 0; datadir_ = datadir; offsets_.push_back(0); } void TessPDFRenderer::AppendPDFObjectDIY(size_t objectsize) { offsets_.push_back(objectsize + offsets_.back()); obj_++; } void TessPDFRenderer::AppendPDFObject(const char *data) { AppendPDFObjectDIY(strlen(data)); AppendString((const char *)data); } // Helper function to prevent us from accidentaly writing // scientific notation to an HOCR or PDF file. Besides, three // decimal points are all you really need. double prec(double x) { double kPrecision = 1000.0; double a = round(x * kPrecision) / kPrecision; if (a == -0) return 0; return a; } long dist2(int x1, int y1, int x2, int y2) { return (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1); } char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api, double width, double height, int page_number) { double ppi = api->GetSourceYResolution(); STRING pdf_str(""); double old_x = 0.0, old_y = 0.0; int old_pointsize = 0; // TODO(jbreiden) Slightly cleaner from an abstraction standpoint // if this were to live inside a separate text object. pdf_str += "q "; pdf_str.add_str_double("", prec(width)); pdf_str += " 0 0 "; pdf_str.add_str_double("", prec(height)); pdf_str += " 0 0 cm /Im1 Do Q\n"; ResultIterator *res_it = api->GetIterator(); while (!res_it->Empty(RIL_BLOCK)) { if (res_it->IsAtBeginningOf(RIL_BLOCK)) { pdf_str += "BT\n3 Tr\n"; // Begin text object, use invisible ink old_pointsize = 0.0; // Let's always declare our fonts at this scope } int line_x1, line_y1, line_x2, line_y2; if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { res_it->Baseline(RIL_TEXTLINE, &line_x1, &line_y1, &line_x2, &line_y2); } if (res_it->Empty(RIL_WORD)) { res_it->Next(RIL_WORD); continue; } int word_x1, word_y1, word_x2, word_y2; res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2); // The critical one is writing_direction tesseract::Orientation orientation; tesseract::WritingDirection writing_direction; tesseract::TextlineOrder textline_order; float deskew_angle; res_it->Orientation(&orientation, &writing_direction, &textline_order, &deskew_angle); // Unlike Tesseract, we always want the word baseline in reading order. if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) { Swap(&word_x1, &word_x2); Swap(&word_y1, &word_y2); } // Viewers like evince can get really confused during copy-paste // when the baseline wanders around. I've decided to force every // word to match the (straight) baseline. The math below is just // projecting the word origin onto the baseline. All numbers are // in the native PDF coordinate system, which has the origin in // the bottom left and the unit is points, which is 1/72 inch. double word_length; double x, y; { int px = word_x1; int py = word_y1; double l2 = dist2(line_x1, line_y1, line_x2, line_y2); if (l2 == 0) { x = line_x1; y = line_y1; } else { double t = ((px - line_x2) * (line_x2 - line_x1) + (py - line_y2) * (line_y2 - line_y1)) / l2; x = line_x2 + t * (line_x2 - line_x1); y = line_y2 + t * (line_y2 - line_y1); } word_length = sqrt(double(dist2(word_x1, word_y1, word_x2, word_y2))); word_length = word_length * 72.0 / ppi; x = x * 72 / ppi; y = height - (y * 72.0 / ppi); } int pointsize; if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { // Calculate the rotation angle in the PDF cooordinate system, // which has the origin in the bottom left. The Tesseract // coordinate system has the origin in the upper left. // // PDF is kind of a like turtle graphics, and we orient the // turtle (errr... initial cursor position) with an affine // transformation. // // Rotate RTL Translate // // [ x' y' 1 ] = [ x y 1 ] [ cos𝜃 -sin𝜃 0 ] [ -1 0 0 ] [ 1 0 0 ] // [ sin𝜃 cos𝜃 0 ] [ 0 1 0 ] [ 0 1 0 ] // [ 0 0 1 ] [ 0 0 1 ] [ x y 1 ] // double theta = atan2(double(line_y1 - line_y2), double(line_x2 - line_x1)); double a, b, c, d; a = cos(theta); b = sin(theta); c = -sin(theta); d = cos(theta); switch(writing_direction) { case WRITING_DIRECTION_RIGHT_TO_LEFT: a = -a; b = -b; c = -c; break; case WRITING_DIRECTION_TOP_TO_BOTTOM: // TODO(jbreiden) Consider switching PDF writing mode to vertical. break; default: break; } const char *font_name; bool bold, italic, underlined, monospace, serif, smallcaps; int font_id; font_name = res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps, &pointsize, &font_id); if (pointsize != old_pointsize) { char textfont[20]; snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf\n", pointsize); pdf_str += textfont; // Custom font old_pointsize = pointsize; } pdf_str.add_str_double("", prec(a)); // . This affine matrix pdf_str.add_str_double(" ", prec(b)); // . sets the coordinate pdf_str.add_str_double(" ", prec(c)); // . system for all pdf_str.add_str_double(" ", prec(d)); // . text in the entire pdf_str.add_str_double(" ", prec(x)); // . line. pdf_str.add_str_double(" ", prec(y)); // . pdf_str += (" Tm "); // Place cursor absolutely } else { double offset = sqrt(double(dist2(old_x, old_y, x, y))); pdf_str.add_str_double(" ", prec(offset)); // Delta x in pts pdf_str.add_str_double(" ", 0); // Delta y in pts pdf_str += (" Td "); // Relative moveto } old_x = x; old_y = y; bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); STRING pdf_word(""); int pdf_word_len = 0; do { const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL); if (grapheme && grapheme[0] != 0) { // TODO(jbreiden) Do a real UTF-16BE conversion // http://en.wikipedia.org/wiki/UTF-16#Example_UTF-16_encoding_procedure string_32 utf32; CubeUtils::UTF8ToUTF32(grapheme, &utf32); char utf16[20]; for (int i = 0; i < utf32.length(); i++) { snprintf(utf16, sizeof(utf16), "<%04X>", utf32[i]); pdf_word += utf16; pdf_word_len++; } } delete []grapheme; res_it->Next(RIL_SYMBOL); } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); if (word_length > 0 && pdf_word_len > 0 && pointsize > 0) { double h_stretch = prec(100.0 * word_length / (pointsize * pdf_word_len)); pdf_str.add_str_double("", h_stretch); pdf_str += " Tz"; // horizontal stretch pdf_str += " [ "; pdf_str += pdf_word; // word in UTF-16BE representation pdf_str += " ] TJ"; // show the text } if (last_word_in_line) { pdf_str += " \n"; } if (last_word_in_block) { pdf_str += "ET\n"; // end the text object } } char *ret = new char[pdf_str.length() + 1]; strcpy(ret, pdf_str.string()); delete res_it; return ret; } bool TessPDFRenderer::BeginDocumentHandler() { char buf[kBasicBufSize]; snprintf(buf, sizeof(buf), "%%PDF-1.5\n" "%%%c%c%c%c\n", 0xDE, 0xAD, 0xBE, 0xEB); AppendPDFObject(buf); // CATALOG long int catalog = obj_; snprintf(buf, sizeof(buf), "1 0 obj\n" "<<\n" " /Type /Catalog\n" " /Pages %ld 0 R\n" ">>\n" "endobj\n", 2L); AppendPDFObject(buf); // We are reserving object #2 for the /Pages // object, which I am going to create and write // at the end of the PDF file. AppendPDFObject(""); // TYPE0 FONT snprintf(buf, sizeof(buf), "3 0 obj\n" "<<\n" " /BaseFont /GlyphLessFont\n" " /DescendantFonts [ %ld 0 R ]\n" " /Encoding /Identity-H\n" " /Subtype /Type0\n" " /ToUnicode %ld 0 R\n" " /Type /Font\n" ">>\n" "endobj\n", 4L, // CIDFontType2 font 5L // ToUnicode ); AppendPDFObject(buf); // CIDFONTTYPE2 snprintf(buf, sizeof(buf), "4 0 obj\n" "<<\n" " /BaseFont /GlyphLessFont\n" " /CIDToGIDMap /Identity\n" " /CIDSystemInfo\n" " <<\n" " /Ordering (Identity)\n" " /Registry (Adobe)\n" " /Supplement 0\n" " >>\n" " /FontDescriptor %ld 0 R\n" " /Subtype /CIDFontType2\n" " /Type /Font\n" " /DW 1000\n" ">>\n" "endobj\n", 6L // Font descriptor ); AppendPDFObject(buf); const char *stream = "/CIDInit /ProcSet findresource begin\n" "12 dict begin\n" "begincmap\n" "/CIDSystemInfo\n" "<<\n" " /Registry (Adobe)\n" " /Ordering (UCS)\n" " /Supplement 0\n" ">> def\n" "/CMapName /Adobe-Identify-UCS def\n" "/CMapType 2 def\n" "1 begincodespacerange\n" "<0000> \n" "endcodespacerange\n" "1 beginbfrange\n" "<0000> <0000>\n" "endbfrange\n" "endcmap\n" "CMapName currentdict /CMap defineresource pop\n" "end\n" "end\n"; // TOUNICODE snprintf(buf, sizeof(buf), "5 0 obj\n" "<< /Length %lu >>\n" "stream\n" "%s" "endstream\n" "endobj\n", (unsigned long) strlen(stream), stream); AppendPDFObject(buf); // TODO(jbreiden) Fix the FontBBox entry. And of course make // the font data match the descriptor. // FONT DESCRIPTOR snprintf(buf, sizeof(buf), "6 0 obj\n" "<<\n" " /Ascent 1000\n" " /CapHeight 1000\n" " /Descent 0\n" // Nothing goes below baseline " /Flags 4\n" " /FontBBox [ 0 0 1000 1000 ]\n" " /FontFile2 %ld 0 R\n" " /FontName /GlyphLessFont\n" " /ItalicAngle 0\n" " /StemV 80\n" " /Type /FontDescriptor\n" ">>\n" "endobj\n", 7L // Font data ); AppendPDFObject(buf); snprintf(buf, sizeof(buf), "%s/pdf.ttf", datadir_); FILE *fp = fopen(buf, "rb"); if (!fp) return false; fseek(fp, 0, SEEK_END); long int size = ftell(fp); fseek(fp, 0, SEEK_SET); char *buffer = new char[size]; fread(buffer, 1, size, fp); fclose(fp); // FONTFILE2 snprintf(buf, sizeof(buf), "7 0 obj\n" "<<\n" " /Length %ld\n" " /Length1 %ld\n" ">>\n" "stream\n", size, size); AppendString(buf); size_t objsize = strlen(buf); AppendData(buffer, size); objsize += size; snprintf(buf, sizeof(buf), "endstream\n" "endobj\n"); AppendString(buf); objsize += strlen(buf); AppendPDFObjectDIY(objsize); return true; } // TODO(jbreiden) I hear that you can pull the flate stream out // of a PNG file and, by mentioning the predictor in the PDF object, // make most of them work without transcoding. If so that's a big win // versus what we do now. Try it out. bool TessPDFRenderer::fileToPDFObj(char *filename, long int objnum, char **pdf_object, long int *pdf_object_size) { char b1[kBasicBufSize]; char b2[kBasicBufSize]; if (!pdf_object_size || !pdf_object) return false; *pdf_object = NULL; *pdf_object_size = 0; if (!filename) return false; FILE *fp = fopen(filename, "rb"); if (!fp) return false; int format; findFileFormatStream(fp, &format); if (format != IFF_JFIF_JPEG) { fclose(fp); return false; } fseek(fp, 0, SEEK_END); long int jpeg_size = ftell(fp); fseek(fp, 0, SEEK_SET); int spp, cmyk, w, h; freadHeaderJpeg(fp, &w, &h, &spp, NULL, &cmyk); const char *colorspace; switch (spp) { case 1: colorspace = "/DeviceGray"; break; case 3: colorspace = "/DeviceRGB"; break; case 4: if (cmyk) colorspace = "/DeviceCMYK"; else return false; break; default: return false; } // IMAGE snprintf(b1, sizeof(b1), "%ld 0 obj\n" "<<\n" " /Length %ld\n" " /Subtype /Image\n" " /ColorSpace %s\n" " /Width %d\n" " /Height %d\n" " /BitsPerComponent 8\n" " /Filter /DCTDecode\n" ">>\n" "stream\n", objnum, jpeg_size, colorspace, w, h); size_t b1_len = strlen(b1); snprintf(b2, sizeof(b2), "\n" "endstream\n" "endobj\n"); size_t b2_len = strlen(b2); *pdf_object_size = b1_len + jpeg_size + b2_len; *pdf_object = new char[*pdf_object_size]; if (!pdf_object) return false; memcpy(*pdf_object, b1, b1_len); if (fread(*pdf_object + b1_len, 1, jpeg_size, fp) != jpeg_size) { delete[] pdf_object; return false; } memcpy(*pdf_object + b1_len + jpeg_size, b2, b2_len); fclose(fp); return true; } bool TessPDFRenderer::pixToPDFObj(Pix *pix, long int objnum, char **pdf_object, long int *pdf_object_size) { if (!pdf_object_size || !pdf_object) return false; *pdf_object = NULL; *pdf_object_size = 0; char b0[kBasicBufSize]; char b1[kBasicBufSize * 2]; char b2[kBasicBufSize]; int encoding_type; if (selectDefaultPdfEncoding(pix, &encoding_type) != 0) return false; #if 0 const int kJpegQuality = 85; L_COMP_DATA *cid; if (pixGenerateCIData(pix, encoding_type, kJpegQuality, 0, &cid) != 0) return false; #endif const char *filter; switch(encoding_type) { case L_FLATE_ENCODE: filter = "/FlateDecode"; break; case L_JPEG_ENCODE: filter = "/DCTDecode"; break; case L_G4_ENCODE: filter = "/CCITTFaxDecode"; break; default: return false; } const char *colorspace = "/DeviceColor"; #if 0 if (cid->ncolors > 0) { snprintf(b0, sizeof(b0), "[ /Indexed /DeviceRGB %d %s ]", cid->ncolors - 1, cid->cmapdatahex); colorspace = b0; } else { switch (cid->spp) { case 1: colorspace = "/DeviceGray"; break; case 3: colorspace = "/DeviceColor"; break; default: return false; } } snprintf(b1, sizeof(b1), "%ld 0 obj\n" "<<\n" " /Length %lu\n" " /Subtype /Image\n" " /ColorSpace %s\n" " /Width %d\n" " /Height %d\n" " /BitsPerComponent %d\n" " /Filter %s\n" " /DecodeParms\n" " <<\n" " /K -1\n" " /Columns %d\n" " >>\n" ">>\n" "stream\n", objnum, (unsigned long) cid->nbytescomp, colorspace, cid->w, cid->h, cid->bps, filter, cid->w); size_t b1_len = strlen(b1); snprintf(b2, sizeof(b2), "\n" "endstream\n" "endobj\n"); size_t b2_len = strlen(b2); *pdf_object_size = b1_len + cid->nbytescomp + b2_len; *pdf_object = new char[*pdf_object_size]; if (!pdf_object) return false; memcpy(*pdf_object, b1, b1_len); memcpy(*pdf_object + b1_len, cid->datacomp, cid->nbytescomp); memcpy(*pdf_object + b1_len + cid->nbytescomp, b2, b2_len); #endif return true; } bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) { char buf[kBasicBufSize]; Pix *pix = api->GetInputImage(); char *filename = (char *)api->GetInputName(); int ppi = api->GetSourceYResolution(); if (!pix || ppi <= 0) return false; double width = pixGetWidth(pix) * 72.0 / ppi; double height = pixGetHeight(pix) * 72.0 / ppi; // PAGE snprintf(buf, sizeof(buf), "%ld 0 obj\n" "<<\n" " /Type /Page\n" " /Parent %ld 0 R\n" " /MediaBox [0 0 %.2f %.2f]\n" " /Contents %ld 0 R\n" " /Resources\n" " <<\n" " /XObject << /Im1 %ld 0 R >>\n" " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n" " /Font << /f-0-0 %ld 0 R >>\n" " >>\n" ">>\n" "endobj\n", obj_, 2L, // Pages object width, height, obj_ + 1, // Contents object obj_ + 2, // Image object 3L); // Type0 Font pages_.push_back(obj_); AppendPDFObject(buf); // CONTENTS char* pdftext = GetPDFTextObjects(api, width, height, imagenum()); long pdftext_len = strlen(pdftext); unsigned char *pdftext_casted = reinterpret_cast(pdftext); size_t len; unsigned char *comp_pdftext = zlibCompress(pdftext_casted, pdftext_len, &len); long comp_pdftext_len = len; snprintf(buf, sizeof(buf), "%ld 0 obj\n" "<<\n" " /Length %ld /Filter /FlateDecode\n" ">>\n" "stream\n", obj_, comp_pdftext_len); AppendString(buf); long objsize = strlen(buf); AppendData(reinterpret_cast(comp_pdftext), comp_pdftext_len); objsize += comp_pdftext_len; lept_free(comp_pdftext); delete[] pdftext; snprintf(buf, sizeof(buf), "endstream\n" "endobj\n"); AppendString(buf); objsize += strlen(buf); AppendPDFObjectDIY(objsize); char *pdf_object; if (!fileToPDFObj(filename, obj_, &pdf_object, &objsize)) { if (!pixToPDFObj(pix, obj_, &pdf_object, &objsize)) { return false; } } AppendData(pdf_object, objsize); AppendPDFObjectDIY(objsize); delete[] pdf_object; return true; } bool TessPDFRenderer::EndDocumentHandler() { char buf[kBasicBufSize]; // We reserved the /Pages object number early, so that the /Page // objects could refer to their parent. We finally have enough // information to go fill it in. Using lower level calls to manipulate // the offset record in two spots, because we are placing objects // out of order in the file. // PAGES const long int kPagesObjectNumber = 2; offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1 snprintf(buf, sizeof(buf), "%ld 0 obj\n" "<<\n" " /Type /Pages\n" " /Kids [ ", kPagesObjectNumber); AppendString(buf); size_t pages_objsize = strlen(buf); for (size_t i = 0; i < pages_.size(); i++) { snprintf(buf, sizeof(buf), "%ld 0 R ", pages_[i]); AppendString(buf); pages_objsize += strlen(buf); } snprintf(buf, sizeof(buf), "]\n" " /Count %d\n" ">>\n" "endobj\n", pages_.size()); AppendString(buf); pages_objsize += strlen(buf); offsets_.back() += pages_objsize; // manipulation #2 // INFO char* datestr = l_getFormattedDate(); snprintf(buf, sizeof(buf), "%ld 0 obj\n" "<<\n" " /Producer (Tesseract %s)\n" " /CreationDate (D:%s)\n" " /Title (%s)" ">>\n" "endobj\n", obj_, VERSION, datestr, title()); lept_free(datestr); AppendPDFObject(buf); snprintf(buf, sizeof(buf), "xref\n" "0 %ld\n" "0000000000 65535 f \n", obj_); AppendString(buf); for (int i = 1; i < obj_; i++) { snprintf(buf, sizeof(buf), "%010ld 00000 n \n", offsets_[i]); AppendString(buf); } snprintf(buf, sizeof(buf), "trailer\n" "<<\n" " /Size %ld\n" " /Root %ld 0 R\n" " /Info %ld 0 R\n" ">>\n" "startxref\n" "%ld\n" "%%%%EOF\n", obj_, 1L, // catalog obj_ - 1, // info offsets_.back()); AppendString(buf); return true; } } // namespace tesseract