tesseract  4.00.00dev
tesseract::TessPDFRenderer Class Reference

#include <renderer.h>

Inheritance diagram for tesseract::TessPDFRenderer:
tesseract::TessResultRenderer

Public Member Functions

 TessPDFRenderer (const char *outputbase, const char *datadir, bool textonly)
 
- Public Member Functions inherited from tesseract::TessResultRenderer
virtual ~TessResultRenderer ()
 
void insert (TessResultRenderer *next)
 
TessResultRenderer * next ()
 
bool BeginDocument (const char *title)
 
bool AddImage (TessBaseAPI *api)
 
bool EndDocument ()
 
const char * file_extension () const
 
const char * title () const
 
int imagenum () const
 

Protected Member Functions

virtual bool BeginDocumentHandler ()
 
virtual bool AddImageHandler (TessBaseAPI *api)
 
virtual bool EndDocumentHandler ()
 
- Protected Member Functions inherited from tesseract::TessResultRenderer
 TessResultRenderer (const char *outputbase, const char *extension)
 
void AppendString (const char *s)
 
void AppendData (const char *s, int len)
 

Detailed Description

Renders tesseract output into a searchable PDF.

Definition at line 186 of file renderer.h.

Constructor & Destructor Documentation

◆ TessPDFRenderer()

tesseract::TessPDFRenderer::TessPDFRenderer ( const char *  outputbase,
const char *  datadir,
bool  textonly 
)

Definition at line 187 of file pdfrenderer.cpp.

189  : TessResultRenderer(outputbase, "pdf") {
190  obj_ = 0;
191  datadir_ = datadir;
192  textonly_ = textonly;
193  offsets_.push_back(0);
194 }
int push_back(T object)
TessResultRenderer(const char *outputbase, const char *extension)
Definition: renderer.cpp:33

Member Function Documentation

◆ AddImageHandler()

bool tesseract::TessPDFRenderer::AddImageHandler ( TessBaseAPI *  api)
protected virtual

Implements tesseract::TessResultRenderer.

Definition at line 844 of file pdfrenderer.cpp.

844  {
845  size_t n;
846  char buf[kBasicBufSize];
847  char buf2[kBasicBufSize];
848  Pix *pix = api->GetInputImage();
849  char *filename = (char *)api->GetInputName();
850  int ppi = api->GetSourceYResolution();
851  if (!pix || ppi <= 0)
852  return false;
853  double width = pixGetWidth(pix) * 72.0 / ppi;
854  double height = pixGetHeight(pix) * 72.0 / ppi;
855 
856  snprintf(buf2, sizeof(buf2), "/XObject << /Im1 %ld 0 R >>\n", obj_ + 2);
857  const char *xobject = (textonly_) ? "" : buf2;
858 
859  // PAGE
860  n = snprintf(buf, sizeof(buf),
861  "%ld 0 obj\n"
862  "<<\n"
863  " /Type /Page\n"
864  " /Parent %ld 0 R\n"
865  " /MediaBox [0 0 %.2f %.2f]\n"
866  " /Contents %ld 0 R\n"
867  " /Resources\n"
868  " <<\n"
869  " %s"
870  " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
871  " /Font << /f-0-0 %ld 0 R >>\n"
872  " >>\n"
873  ">>\n"
874  "endobj\n",
875  obj_,
876  2L, // Pages object
877  width, height,
878  obj_ + 1, // Contents object
879  xobject, // Image object
880  3L); // Type0 Font
881  if (n >= sizeof(buf)) return false;
882  pages_.push_back(obj_);
883  AppendPDFObject(buf);
884 
885  // CONTENTS
886  const std::unique_ptr</*non-const*/ char[]> pdftext(GetPDFTextObjects(api, width, height));
887  const long pdftext_len = strlen(pdftext.get());
888  size_t len;
889  unsigned char *comp_pdftext =
890  zlibCompress(reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
891  long comp_pdftext_len = len;
892  n = snprintf(buf, sizeof(buf),
893  "%ld 0 obj\n"
894  "<<\n"
895  " /Length %ld /Filter /FlateDecode\n"
896  ">>\n"
897  "stream\n", obj_, comp_pdftext_len);
898  if (n >= sizeof(buf)) {
899  lept_free(comp_pdftext);
900  return false;
901  }
902  AppendString(buf);
903  long objsize = strlen(buf);
904  AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
905  objsize += comp_pdftext_len;
906  lept_free(comp_pdftext);
907  const char *b2 =
908  "endstream\n"
909  "endobj\n";
910  AppendString(b2);
911  objsize += strlen(b2);
912  AppendPDFObjectDIY(objsize);
913 
914  if (!textonly_) {
915  char *pdf_object = nullptr;
916  if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize)) {
917  return false;
918  }
919  AppendData(pdf_object, objsize);
920  AppendPDFObjectDIY(objsize);
921  delete[] pdf_object;
922  }
923  return true;
924 }
int push_back(T object)
void AppendData(const char *s, int len)
Definition: renderer.cpp:106
const char * filename
Definition: ioapi.h:38
voidpf void * buf
Definition: ioapi.h:39
void AppendString(const char *s)
Definition: renderer.cpp:102

◆ BeginDocumentHandler()

bool tesseract::TessPDFRenderer::BeginDocumentHandler ( )
protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 501 of file pdfrenderer.cpp.

501  {
502  char buf[kBasicBufSize];
503  size_t n;
504 
505  n = snprintf(buf, sizeof(buf),
506  "%%PDF-1.5\n"
507  "%%%c%c%c%c\n",
508  0xDE, 0xAD, 0xBE, 0xEB);
509  if (n >= sizeof(buf)) return false;
510  AppendPDFObject(buf);
511 
512  // CATALOG
513  n = snprintf(buf, sizeof(buf),
514  "1 0 obj\n"
515  "<<\n"
516  " /Type /Catalog\n"
517  " /Pages %ld 0 R\n"
518  ">>\n"
519  "endobj\n",
520  2L);
521  if (n >= sizeof(buf)) return false;
522  AppendPDFObject(buf);
523 
524  // We are reserving object #2 for the /Pages
525  // object, which I am going to create and write
526  // at the end of the PDF file.
527  AppendPDFObject("");
528 
529  // TYPE0 FONT
530  n = snprintf(buf, sizeof(buf),
531  "3 0 obj\n"
532  "<<\n"
533  " /BaseFont /GlyphLessFont\n"
534  " /DescendantFonts [ %ld 0 R ]\n"
535  " /Encoding /Identity-H\n"
536  " /Subtype /Type0\n"
537  " /ToUnicode %ld 0 R\n"
538  " /Type /Font\n"
539  ">>\n"
540  "endobj\n",
541  4L, // CIDFontType2 font
542  6L // ToUnicode
543  );
544  if (n >= sizeof(buf)) return false;
545  AppendPDFObject(buf);
546 
547  // CIDFONTTYPE2
548  n = snprintf(buf, sizeof(buf),
549  "4 0 obj\n"
550  "<<\n"
551  " /BaseFont /GlyphLessFont\n"
552  " /CIDToGIDMap %ld 0 R\n"
553  " /CIDSystemInfo\n"
554  " <<\n"
555  " /Ordering (Identity)\n"
556  " /Registry (Adobe)\n"
557  " /Supplement 0\n"
558  " >>\n"
559  " /FontDescriptor %ld 0 R\n"
560  " /Subtype /CIDFontType2\n"
561  " /Type /Font\n"
562  " /DW %d\n"
563  ">>\n"
564  "endobj\n",
565  5L, // CIDToGIDMap
566  7L, // Font descriptor
567  1000 / kCharWidth);
568  if (n >= sizeof(buf)) return false;
569  AppendPDFObject(buf);
570 
571  // CIDTOGIDMAP
572  const int kCIDToGIDMapSize = 2 * (1 << 16);
573  const std::unique_ptr</*non-const*/ unsigned char[]> cidtogidmap(new unsigned char[kCIDToGIDMapSize]);
574  for (int i = 0; i < kCIDToGIDMapSize; i++) {
575  cidtogidmap[i] = (i % 2) ? 1 : 0;
576  }
577  size_t len;
578  unsigned char *comp =
579  zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
580  n = snprintf(buf, sizeof(buf),
581  "5 0 obj\n"
582  "<<\n"
583  " /Length %lu /Filter /FlateDecode\n"
584  ">>\n"
585  "stream\n",
586  (unsigned long)len);
587  if (n >= sizeof(buf)) {
588  lept_free(comp);
589  return false;
590  }
591  AppendString(buf);
592  long objsize = strlen(buf);
593  AppendData(reinterpret_cast<char *>(comp), len);
594  objsize += len;
595  lept_free(comp);
596  const char *endstream_endobj =
597  "endstream\n"
598  "endobj\n";
599  AppendString(endstream_endobj);
600  objsize += strlen(endstream_endobj);
601  AppendPDFObjectDIY(objsize);
602 
603  const char *stream =
604  "/CIDInit /ProcSet findresource begin\n"
605  "12 dict begin\n"
606  "begincmap\n"
607  "/CIDSystemInfo\n"
608  "<<\n"
609  " /Registry (Adobe)\n"
610  " /Ordering (UCS)\n"
611  " /Supplement 0\n"
612  ">> def\n"
613  "/CMapName /Adobe-Identify-UCS def\n"
614  "/CMapType 2 def\n"
615  "1 begincodespacerange\n"
616  "<0000> <FFFF>\n"
617  "endcodespacerange\n"
618  "1 beginbfrange\n"
619  "<0000> <FFFF> <0000>\n"
620  "endbfrange\n"
621  "endcmap\n"
622  "CMapName currentdict /CMap defineresource pop\n"
623  "end\n"
624  "end\n";
625 
626  // TOUNICODE
627  n = snprintf(buf, sizeof(buf),
628  "6 0 obj\n"
629  "<< /Length %lu >>\n"
630  "stream\n"
631  "%s"
632  "endstream\n"
633  "endobj\n", (unsigned long) strlen(stream), stream);
634  if (n >= sizeof(buf)) return false;
635  AppendPDFObject(buf);
636 
637  // FONT DESCRIPTOR
638  n = snprintf(buf, sizeof(buf),
639  "7 0 obj\n"
640  "<<\n"
641  " /Ascent %d\n"
642  " /CapHeight %d\n"
643  " /Descent -1\n" // Spec says must be negative
644  " /Flags 5\n" // FixedPitch + Symbolic
645  " /FontBBox [ 0 0 %d %d ]\n"
646  " /FontFile2 %ld 0 R\n"
647  " /FontName /GlyphLessFont\n"
648  " /ItalicAngle 0\n"
649  " /StemV 80\n"
650  " /Type /FontDescriptor\n"
651  ">>\n"
652  "endobj\n",
653  1000,
654  1000,
655  1000 / kCharWidth,
656  1000,
657  8L // Font data
658  );
659  if (n >= sizeof(buf)) return false;
660  AppendPDFObject(buf);
661 
662  n = snprintf(buf, sizeof(buf), "%s/pdf.ttf", datadir_);
663  if (n >= sizeof(buf)) return false;
664  FILE *fp = fopen(buf, "rb");
665  if (!fp) {
666  tprintf("Can not open file \"%s\"!\n", buf);
667  return false;
668  }
669  fseek(fp, 0, SEEK_END);
670  long int size = ftell(fp);
671  fseek(fp, 0, SEEK_SET);
672  const std::unique_ptr</*non-const*/ char[]> buffer(new char[size]);
673  if (fread(buffer.get(), 1, size, fp) != static_cast<unsigned long>(size)) {
674  fclose(fp);
675  return false;
676  }
677  fclose(fp);
678  // FONTFILE2
679  n = snprintf(buf, sizeof(buf),
680  "8 0 obj\n"
681  "<<\n"
682  " /Length %ld\n"
683  " /Length1 %ld\n"
684  ">>\n"
685  "stream\n", size, size);
686  if (n >= sizeof(buf)) {
687  return false;
688  }
689  AppendString(buf);
690  objsize = strlen(buf);
691  AppendData(buffer.get(), size);
692  objsize += size;
693  AppendString(endstream_endobj);
694  objsize += strlen(endstream_endobj);
695  AppendPDFObjectDIY(objsize);
696  return true;
697 }
voidpf stream
Definition: ioapi.h:39
voidpf void uLong size
Definition: ioapi.h:39
#define tprintf(...)
Definition: tprintf.h:31
#define SEEK_SET
Definition: ioapi.c:29
void AppendData(const char *s, int len)
Definition: renderer.cpp:106
voidpf void * buf
Definition: ioapi.h:39
#define SEEK_END
Definition: ioapi.c:25
void AppendString(const char *s)
Definition: renderer.cpp:102

◆ EndDocumentHandler()

bool tesseract::TessPDFRenderer::EndDocumentHandler ( )
protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 927 of file pdfrenderer.cpp.

927  {
928  size_t n;
929  char buf[kBasicBufSize];
930 
931  // We reserved the /Pages object number early, so that the /Page
932  // objects could refer to their parent. We finally have enough
933  // information to go fill it in. Using lower level calls to manipulate
934  // the offset record in two spots, because we are placing objects
935  // out of order in the file.
936 
937  // PAGES
938  const long int kPagesObjectNumber = 2;
939  offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1
940  n = snprintf(buf, sizeof(buf),
941  "%ld 0 obj\n"
942  "<<\n"
943  " /Type /Pages\n"
944  " /Kids [ ", kPagesObjectNumber);
945  if (n >= sizeof(buf)) return false;
946  AppendString(buf);
947  size_t pages_objsize = strlen(buf);
948  for (size_t i = 0; i < pages_.unsigned_size(); i++) {
949  n = snprintf(buf, sizeof(buf),
950  "%ld 0 R ", pages_[i]);
951  if (n >= sizeof(buf)) return false;
952  AppendString(buf);
953  pages_objsize += strlen(buf);
954  }
955  n = snprintf(buf, sizeof(buf),
956  "]\n"
957  " /Count %d\n"
958  ">>\n"
959  "endobj\n", pages_.size());
960  if (n >= sizeof(buf)) return false;
961  AppendString(buf);
962  pages_objsize += strlen(buf);
963  offsets_.back() += pages_objsize; // manipulation #2
964 
965  // INFO
966  STRING utf16_title = "FEFF"; // byte_order_marker
967  GenericVector<int> unicodes;
968  UNICHAR::UTF8ToUnicode(title(), &unicodes);
969  char utf16[kMaxBytesPerCodepoint];
970  for (int i = 0; i < unicodes.length(); i++) {
971  int code = unicodes[i];
972  if (CodepointToUtf16be(code, utf16)) {
973  utf16_title += utf16;
974  }
975  }
976 
977  char* datestr = l_getFormattedDate();
978  n = snprintf(buf, sizeof(buf),
979  "%ld 0 obj\n"
980  "<<\n"
981  " /Producer (Tesseract %s)\n"
982  " /CreationDate (D:%s)\n"
983  " /Title <%s>\n"
984  ">>\n"
985  "endobj\n",
986  obj_, TESSERACT_VERSION_STR, datestr, utf16_title.c_str());
987  lept_free(datestr);
988  if (n >= sizeof(buf)) return false;
989  AppendPDFObject(buf);
990  n = snprintf(buf, sizeof(buf),
991  "xref\n"
992  "0 %ld\n"
993  "0000000000 65535 f \n", obj_);
994  if (n >= sizeof(buf)) return false;
995  AppendString(buf);
996  for (int i = 1; i < obj_; i++) {
997  n = snprintf(buf, sizeof(buf), "%010ld 00000 n \n", offsets_[i]);
998  if (n >= sizeof(buf)) return false;
999  AppendString(buf);
1000  }
1001  n = snprintf(buf, sizeof(buf),
1002  "trailer\n"
1003  "<<\n"
1004  " /Size %ld\n"
1005  " /Root %ld 0 R\n"
1006  " /Info %ld 0 R\n"
1007  ">>\n"
1008  "startxref\n"
1009  "%ld\n"
1010  "%%%%EOF\n",
1011  obj_,
1012  1L, // catalog
1013  obj_ - 1, // info
1014  offsets_.back());
1015  if (n >= sizeof(buf)) return false;
1016  AppendString(buf);
1017  return true;
1018 }
unsigned int unsigned_size() const
Definition: genericvector.h:76
int size() const
Definition: genericvector.h:72
bool CodepointToUtf16be(int code, char utf16[kMaxBytesPerCodepoint])
Definition: strngs.h:45
int length() const
Definition: genericvector.h:85
T & back() const
const char * c_str() const
Definition: strngs.cpp:209
static bool UTF8ToUnicode(const char *utf8_str, GenericVector< int > *unicodes)
Definition: unichar.cpp:211
voidpf void * buf
Definition: ioapi.h:39
#define TESSERACT_VERSION_STR
Definition: baseapi.h:23
void AppendString(const char *s)
Definition: renderer.cpp:102
const char * title() const
Definition: renderer.h:81

The documentation for this class was generated from the following files: