diff --git a/pdftour.py b/pdftour.py index b265387545a820da661742ead9c82204a6a1cf31..cfc66b6a4667da841de71e6a87c5d2995cae1ca9 100755 --- a/pdftour.py +++ b/pdftour.py @@ -33,7 +33,7 @@ debug representation is a mix of Python and PostScript syntax: >>> f['F2'] << /Type /Font /BaseFont /Helvetica /Subtype /Type1 /Encoding /WinAnsiEncoding >> -Those look the same, but they're distinct objects: +Those fonts look the same, but they're distinct objects: >>> f.lstat('Xi0') ('ref', (('int', 1), ('int', 0))) @@ -45,6 +45,14 @@ We can see the underlying bytes in the file: >>> p.read(p.xrefs.offset_of(17), 128) b'17 0 obj\n<</Type/Font/BaseFont/Helvetica/Subtype/Type1/Encoding/WinAnsiEncoding>>\nendobj\n18 0 obj\n<</Border[0 0 0]/Rect[56 696 5' +We can attempt parsing at that point with a custom ad-hoc grammar: + + >>> p.parse(p.xrefs.offset_of(17), pdftour.integer + pdftour.wschar.some() + pdftour.integer + b' obj\n<<' + pdftour.pdf_obj.some()) + (247827, ((((('int', 17), [32]), ('int', 0)), b' obj\n<<'), + [('name', b'Type'), ('name', b'Font'), ('name', b'BaseFont'), + ('name', b'Helvetica'), ('name', b'Subtype'), ('name', b'Type1'), + ('name', b'Encoding'), ('name', b'WinAnsiEncoding')])) + We can pull CMaps out of the PDF and parse them and compile their code space ranges to a grammar for tokenizing text strings to paint on the page: @@ -109,7 +117,11 @@ Still unimplemented: - CMap bfranges that *use* arrays; - decoding of parenthesized strings; - nesting of parenthesized strings; -- actually decoding a tokenized character using the tables extracted from the CMap. +- actually decoding a tokenized character using the tables extracted from the CMap; +- compressed object streams and xref streams; +- multiple xref sections; +- multiple xref subsections; +- pulling out the character strings being drawn from the page contents. """ import sys, types, functools, operator, zlib @@ -706,23 +718,6 @@ class Pdf: Still very incomplete. - Example: - >>> p = parsecmaps.read_pdf('../Descargas/1.2754649.pdf') - >>> p.read(p.xrefs.offset_of(55)) - b'55 0 obj\n<</Type/Font/FirstChar 37/FontDescriptor 116 0 R/BaseFo' - >>> p.read(p.xrefs.offset_of(72)) - b'72 0 obj\n<</Border[0 0 0]/Type/Annot/Dest[74 0 R/XYZ 307 380 nul' - >>> p.read(p.xrefs.offset_of(72), size=256) - b'72 0 obj\n<</Border[0 0 0]/Type/Annot/Dest[74 0 R/XYZ 307 380 null]/Rect[394.024017 55.089005 409.314026 65.729996]/Subtype/Link>>\nendobj\n73 0 obj\n<</Border[0 0 0]/Type/Annot/Dest[74 0 R/XYZ 43 253 null]/Rect[464.366028 55.089005 475.665009 65.729996]/Subty' - >>> p.xrefs[72] - b'0000350098 00000 n \n' - >>> p.parse(p.xrefs.offset_of(72), parsecmaps.integer + parsecmaps.wschar.some() + parsecmaps.integer) - (350102, ((('int', 72), [32]), ('int', 0))) - >>> p.parse(p.xrefs.offset_of(72), parsecmaps.integer + parsecmaps.wschar.some() + parsecmaps.integer + b' obj') - (350106, (((('int', 72), [32]), ('int', 0)), b' obj')) - >>> p.trailer - (729445, (b'trailer', ('dict', {'ID': [('str', b"\xc7\x9c\x81't\x81X\xb5\xdf\x04Y\xf3\x9c\xf0-`"), ('str', b"\x1f\xd0\xd9d'\x7fR\x88\xed\xba`\xb5\x81\xb1\xf9\x10")], 'Root': ('ref', (('int', 126), ('int', 0))), 'Size': ('int', 127), 'Info': ('ref', (('int', 2), ('int', 0)))}))) - """ def __init__(self, blob): self.blob = blob @@ -903,7 +898,6 @@ def read_pdf(filename): def all_pages(pdf, whine=lambda *args: None): - # XXX these structures don't quite work yet return page_descendants(pdf.catalog['Pages'], whine) def page_descendants(pages, whine=lambda *args: None):