#include <renderer.h>
Renders Tesseract output into a searchable PDF.
Definition at line 186 of file renderer.h.
◆ TessPDFRenderer()
tesseract::TessPDFRenderer::TessPDFRenderer(const char *outputbase, const char *datadir, bool textonly)
Definition at line 187 of file pdfrenderer.cpp.
192 textonly_ = textonly;
TessResultRenderer(const char *outputbase, const char *extension)
◆ AddImageHandler()
bool tesseract::TessPDFRenderer::AddImageHandler(TessBaseAPI *api)
protected virtual
Implements tesseract::TessResultRenderer.
Definition at line 844 of file pdfrenderer.cpp.
846 char buf[kBasicBufSize];
847 char buf2[kBasicBufSize];
848 Pix *pix =
api->GetInputImage();
850 int ppi =
api->GetSourceYResolution();
851 if (!pix || ppi <= 0)
853 double width = pixGetWidth(pix) * 72.0 / ppi;
854 double height = pixGetHeight(pix) * 72.0 / ppi;
856 snprintf(buf2,
sizeof(buf2),
"/XObject << /Im1 %ld 0 R >>\n", obj_ + 2);
857 const char *xobject = (textonly_) ?
"" : buf2;
860 n = snprintf(buf,
sizeof(buf),
865 " /MediaBox [0 0 %.2f %.2f]\n" 866 " /Contents %ld 0 R\n" 870 " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n" 871 " /Font << /f-0-0 %ld 0 R >>\n" 881 if (n >=
sizeof(buf))
return false;
883 AppendPDFObject(buf);
886 const std::unique_ptr<
char[]> pdftext(GetPDFTextObjects(
api, width, height));
887 const long pdftext_len = strlen(pdftext.get());
889 unsigned char *comp_pdftext =
890 zlibCompress(reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
891 long comp_pdftext_len = len;
892 n = snprintf(buf,
sizeof(buf),
895 " /Length %ld /Filter /FlateDecode\n" 897 "stream\n", obj_, comp_pdftext_len);
898 if (n >=
sizeof(buf)) {
899 lept_free(comp_pdftext);
903 long objsize = strlen(buf);
904 AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
905 objsize += comp_pdftext_len;
906 lept_free(comp_pdftext);
911 objsize += strlen(b2);
912 AppendPDFObjectDIY(objsize);
915 char *pdf_object =
nullptr;
916 if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize)) {
920 AppendPDFObjectDIY(objsize);
void AppendData(const char *s, int len)
void AppendString(const char *s)
◆ BeginDocumentHandler()
bool tesseract::TessPDFRenderer::BeginDocumentHandler()
protected virtual
Reimplemented from tesseract::TessResultRenderer.
Definition at line 501 of file pdfrenderer.cpp.
502 char buf[kBasicBufSize];
505 n = snprintf(buf,
sizeof(buf),
508 0xDE, 0xAD, 0xBE, 0xEB);
509 if (n >=
sizeof(buf))
return false;
510 AppendPDFObject(buf);
513 n = snprintf(buf,
sizeof(buf),
521 if (n >=
sizeof(buf))
return false;
522 AppendPDFObject(buf);
530 n = snprintf(buf,
sizeof(buf),
533 " /BaseFont /GlyphLessFont\n" 534 " /DescendantFonts [ %ld 0 R ]\n" 535 " /Encoding /Identity-H\n" 537 " /ToUnicode %ld 0 R\n" 544 if (n >=
sizeof(buf))
return false;
545 AppendPDFObject(buf);
548 n = snprintf(buf,
sizeof(buf),
551 " /BaseFont /GlyphLessFont\n" 552 " /CIDToGIDMap %ld 0 R\n" 555 " /Ordering (Identity)\n" 556 " /Registry (Adobe)\n" 559 " /FontDescriptor %ld 0 R\n" 560 " /Subtype /CIDFontType2\n" 568 if (n >=
sizeof(buf))
return false;
569 AppendPDFObject(buf);
572 const int kCIDToGIDMapSize = 2 * (1 << 16);
573 const std::unique_ptr<
unsigned char[]> cidtogidmap(
new unsigned char[kCIDToGIDMapSize]);
574 for (
int i = 0; i < kCIDToGIDMapSize; i++) {
575 cidtogidmap[i] = (i % 2) ? 1 : 0;
578 unsigned char *comp =
579 zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
580 n = snprintf(buf,
sizeof(buf),
583 " /Length %lu /Filter /FlateDecode\n" 587 if (n >=
sizeof(buf)) {
592 long objsize = strlen(buf);
593 AppendData(reinterpret_cast<char *>(comp), len);
596 const char *endstream_endobj =
600 objsize += strlen(endstream_endobj);
601 AppendPDFObjectDIY(objsize);
604 "/CIDInit /ProcSet findresource begin\n" 609 " /Registry (Adobe)\n" 613 "/CMapName /Adobe-Identify-UCS def\n" 615 "1 begincodespacerange\n" 617 "endcodespacerange\n" 619 "<0000> <FFFF> <0000>\n" 622 "CMapName currentdict /CMap defineresource pop\n" 627 n = snprintf(buf,
sizeof(buf),
629 "<< /Length %lu >>\n" 633 "endobj\n", (
unsigned long) strlen(stream), stream);
634 if (n >=
sizeof(buf))
return false;
635 AppendPDFObject(buf);
638 n = snprintf(buf,
sizeof(buf),
645 " /FontBBox [ 0 0 %d %d ]\n" 646 " /FontFile2 %ld 0 R\n" 647 " /FontName /GlyphLessFont\n" 650 " /Type /FontDescriptor\n" 659 if (n >=
sizeof(buf))
return false;
660 AppendPDFObject(buf);
662 n = snprintf(buf,
sizeof(buf),
"%s/pdf.ttf", datadir_);
663 if (n >=
sizeof(buf))
return false;
664 FILE *fp = fopen(buf,
"rb");
666 tprintf(
"Can not open file \"%s\"!\n", buf);
670 long int size = ftell(fp);
672 const std::unique_ptr<
char[]> buffer(
new char[size]);
673 if (fread(buffer.get(), 1,
size, fp) != static_cast<unsigned long>(size)) {
679 n = snprintf(buf,
sizeof(buf),
685 "stream\n", size, size);
686 if (n >=
sizeof(buf)) {
690 objsize = strlen(buf);
694 objsize += strlen(endstream_endobj);
695 AppendPDFObjectDIY(objsize);
void AppendData(const char *s, int len)
void AppendString(const char *s)
◆ EndDocumentHandler()
bool tesseract::TessPDFRenderer::EndDocumentHandler()
protected virtual
Reimplemented from tesseract::TessResultRenderer.
Definition at line 927 of file pdfrenderer.cpp.
929 char buf[kBasicBufSize];
938 const long int kPagesObjectNumber = 2;
939 offsets_[kPagesObjectNumber] = offsets_.
back();
940 n = snprintf(buf,
sizeof(buf),
944 " /Kids [ ", kPagesObjectNumber);
945 if (n >=
sizeof(buf))
return false;
947 size_t pages_objsize = strlen(buf);
949 n = snprintf(buf,
sizeof(buf),
950 "%ld 0 R ", pages_[i]);
951 if (n >=
sizeof(buf))
return false;
953 pages_objsize += strlen(buf);
955 n = snprintf(buf,
sizeof(buf),
959 "endobj\n", pages_.
size());
960 if (n >=
sizeof(buf))
return false;
962 pages_objsize += strlen(buf);
963 offsets_.
back() += pages_objsize;
966 STRING utf16_title =
"FEFF";
969 char utf16[kMaxBytesPerCodepoint];
970 for (
int i = 0; i < unicodes.
length(); i++) {
971 int code = unicodes[i];
973 utf16_title += utf16;
977 char* datestr = l_getFormattedDate();
978 n = snprintf(buf,
sizeof(buf),
981 " /Producer (Tesseract %s)\n" 982 " /CreationDate (D:%s)\n" 988 if (n >=
sizeof(buf))
return false;
989 AppendPDFObject(buf);
990 n = snprintf(buf,
sizeof(buf),
993 "0000000000 65535 f \n", obj_);
994 if (n >=
sizeof(buf))
return false;
996 for (
int i = 1; i < obj_; i++) {
997 n = snprintf(buf,
sizeof(buf),
"%010ld 00000 n \n", offsets_[i]);
998 if (n >=
sizeof(buf))
return false;
1001 n = snprintf(buf,
sizeof(buf),
1015 if (n >=
sizeof(buf))
return false;
unsigned int unsigned_size() const
bool CodepointToUtf16be(int code, char utf16[kMaxBytesPerCodepoint])
const char * c_str() const
static bool UTF8ToUnicode(const char *utf8_str, GenericVector< int > *unicodes)
#define TESSERACT_VERSION_STR
void AppendString(const char *s)
const char * title() const
The documentation for this class was generated from the following files:
- /home/stefan/src/github/tesseract-ocr/tesseract/api/renderer.h
- /home/stefan/src/github/tesseract-ocr/tesseract/api/pdfrenderer.cpp