diff --git a/parsecmaps.py b/parsecmaps.py index 494a90e569918567038c479e5943931eff9a9fd0..4731077197c00f9077ab53de4d7876a56f2111c0 100755 --- a/parsecmaps.py +++ b/parsecmaps.py @@ -1,13 +1,45 @@ #!/usr/bin/python3 -"""Read a CMap file or page contents that may have been extracted from a PDF. +"""Explore PDF file structure, at least enough to parse a CMap file, hopefully. + +Example: + + >>> import parsecmaps + >>> p = parsecmaps.read_pdf('../Descargas/1.2754649.pdf') + >>> p.catalog + << /Type /Catalog /Pages 13 0 R >> + >>> p.catalog['Pages'] + << /Count 4 /Type /Pages /ITXT b'5.1.2' /Kids [11 0 R 38 0 R 74 0 R 86 0 R] >> + >>> len(p.catalog['Pages']['Kids']) + 4 + >>> p.catalog['Pages']['Kids'][0].isa('Page') + True + >>> p.catalog['Pages']['Kids'][0] + << /Type /Page /Contents [5 0 R 12 0 R 6 0 R] /Parent 13 0 R /Resources << /XObject << /img0 14 0 R /img1 15 0 R >> /ProcSet [/PDF /Text /ImageB /ImageC /ImageI] /Font << /F1 16 0 R /Xi0 1 0 R /F2 17 0 R >> >> /Annots [18 0 R 19 0 R 20 0 R 21 0 R 22 0 R 23 0 R 24 0 R 25 0 R 26 0 R 27 0 R 28 0 R 29 0 R 30 0 R 31 0 R 32 0 R 33 0 R 34 0 R 35 0 R 36 0 R 37 0 R] /MediaBox [0 0 612 792] >> + >>> _['Resources'] + << /XObject << /img0 14 0 R /img1 15 0 R >> /ProcSet [/PDF /Text /ImageB /ImageC /ImageI] /Font << /F1 16 0 R /Xi0 1 0 R /F2 17 0 R >> >> + >>> _['Font'] + << /F1 16 0 R /Xi0 1 0 R /F2 17 0 R >> + >>> f = _ + >>> f.keys() + dict_keys(['F1', 'Xi0', 'F2']) + >>> f['F1'] + << /Type /Font /DescendantFonts [98 0 R] /ToUnicode 99 0 R /BaseFont /MFRGCI+ArialMT /Subtype /Type0 /Encoding /Identity-H >> + +Almost, but not quite, to the point of being able to pull CMaps out of +the PDF: + + >>> f['F1']['ToUnicode'] + Traceback (most recent call last): + ... + TypeError: cannot unpack non-iterable NoneType object Too slow to use in practice (30 kilobytes/sec), but hopefully validates our understanding of the problem and communicates it more clearly than a lower-level language would. 
-A pitfall I've run into a lot is that if you try to parse a Unicode -string with a byte-string grammar or vice versa, you just get silent -failures to parse. +A pitfall I've run into a lot with the PEG parser in here is that if +you try to parse a Unicode string with a byte-string grammar or vice +versa, you just get silent failures to parse. Still unimplemented: @@ -456,30 +488,58 @@ class Charset(Prod): ws = set(b'\0\t\n\014\r ') wschar = Charset(ws) # Whitespace character wschar.name = 'wschar' +eol = Lit(b'\r\n') | b'\r' | b'\n' +eol.name = 'eol' digit = Charset(set(b'0123456789')) digit.name = 'digit' integer = [b'+', b'-', b''] + digit.some() integer.xform = lambda d: ('int', int(d[0] + bytes(d[1]))) -def ps_grammar(): - "Construct a grammar that tokenizes a subset of PostScript/PDF." - delim = set(b'()<>[]{}/%') - delimchar = Charset(delim) # Delimiter character +delim = set(b'()<>[]{}/%') +delimchar = Charset(delim) # Delimiter character + +any_byte = set(range(256)) +reg = any_byte - delim - ws # "Regular" character +regchar = Charset(reg) +regchar.name = 'regchar' + +name = b'/' + Any(regchar) # say, /Type or /Page +name.name = 'name' +name.xform = lambda d: ('name', bytes(d[1])) - any_byte = set(range(256)) - reg = any_byte - delim - ws # "Regular" character - regchar = Charset(reg) - regchar.name = 'regchar' +def drop_ws(prod): + prod = prod + Any(wschar) + prod.xform = lambda d: d[0] + return prod +def decode_hex(d): + s = bytes(d[0][1]) + if len(s) % 2 != 0: + s += b'0' + return ('str', bytes(int(s[i:i+2], 16) for i in range(0, len(s), 2))) + +hexstring = b'<' + Any(drop_ws(Charset(b'0123456789abcdefABCDEF'))) + b'>' +hexstring.xform = decode_hex + +string_element = ( Charset(any_byte - set(br'\()')) + | (lambda: parenstring) + | b'\\' + Charset(any_byte) + ) +parenstring = b'(' + Any(string_element) + b')' +parenstring.xform = lambda d: ('str', bytes(d[0][1])) # XXX croaks on anything with \() +ok(Parse(b'()').do(0, parenstring), (2, ('str', b''))) 
+ok(Parse(b'(hi)').do(0, parenstring), (4, ('str', b'hi'))) +# XXX this one won't work until I write code to un-nest the paren strings +#ok(Parse(b'(hi(x))').do(0, parenstring), (7, ('str', b'hi(x)'))) + + +def ps_grammar(): + "Construct a grammar that tokenizes a subset of PostScript/PDF." op = regchar.some() + b'' # operator. XXX this +b'' is to allow us to compose xforms op.name = 'op' op.xform = lambda d: ('op', bytes(d[0])) - name = b'/' + Any(regchar) # /Type or /Page - name.name = 'name' - name.xform = lambda d: ('name', bytes(d[1])) - # XXX no real-number support yet dictdelim = Lit(b'<<') | b'>>' @@ -487,31 +547,6 @@ def ps_grammar(): arraydelim = Charset(set(b'[]')) arraydelim.xform = lambda d: ('ad', d) - string_element = ( Charset(any_byte - set(br'\()')) - | (lambda: parenstring) - | b'\\' + Charset(any_byte) - ) - parenstring = b'(' + Any(string_element) + b')' - parenstring.xform = lambda d: ('str', bytes(d[0][1])) # XXX croaks on anything with \() - ok(Parse(b'()').do(0, parenstring), (2, ('str', b''))) - ok(Parse(b'(hi)').do(0, parenstring), (4, ('str', b'hi'))) - # XXX this one won't work until I write code to un-nest the paren strings - #ok(Parse(b'(hi(x))').do(0, parenstring), (7, ('str', b'hi(x)'))) - - def drop_ws(prod): - prod = prod + Any(wschar) - prod.xform = lambda d: d[0] - return prod - - def decode_hex(d): - s = bytes(d[0][1]) - if len(s) % 2 != 0: - s += b'0' - return ('str', bytes(int(s[i:i+2], 16) for i in range(0, len(s), 2))) - - hexstring = b'<' + Any(drop_ws(Charset(b'0123456789abcdefABCDEF'))) + b'>' - hexstring.xform = decode_hex - tokens = Any(drop_ws(integer | op | name| dictdelim | arraydelim | parenstring | hexstring)) root = Any(wschar) + tokens root.xform = lambda d: d[1] @@ -567,10 +602,22 @@ def please_be(a, b): if a != b: raise ValueError(a, '!=', b) -xref_header = Lit(b'xref\n') + integer + wschar.some() + integer + b'\n' # XXX too strict +xref_header = Lit(b'xref\n') + integer + b' ' + integer + eol # clause 7.5.4 # 
XXX This xform clearly indicates that I need to rethink how # concatenation works for parsing expressions. xref_header.xform = lambda d: (d[0][0][0][1][1], d[0][1][1]) +dictionary = Thunk(lambda: drop_ws(b'<<') + Any(drop_ws(name) + pdf_obj) + drop_ws(b'>>')) +dictionary.xform = lambda d: ('dict', {k[1].decode('utf-8'): v for k, v in d[0][1]}) +array = Thunk(lambda: drop_ws(b'[') + Any(pdf_obj) + drop_ws(b']')) +array.xform = lambda d: ('array', d[0][1]) +ref = drop_ws(integer) + drop_ws(integer) + drop_ws(b'R') +ref.xform = lambda d: ('ref', d[0]) +# XXX integer and name should drop their own damn ws +pdf_obj = drop_ws(dictionary | name | ref | hexstring | array | integer | parenstring) +# XXX whitespace after 'obj' is not required according to clause +# 7.3.10; probably also endobj should require to be followed by a delimiter char +indirect_obj = drop_ws(integer) + drop_ws(integer) + drop_ws(b'obj') + pdf_obj + drop_ws(b'endobj') +indirect_obj.xform = lambda d: d[0][1] class Pdf: """Comprehends enough of the PDF format to facilitate exploration. 
@@ -591,6 +638,8 @@ class Pdf: (350102, ((('int', 72), [32]), ('int', 0))) >>> p.parse(p.xrefs.offset_of(72), parsecmaps.integer + parsecmaps.wschar.some() + parsecmaps.integer + b' obj') (350106, (((('int', 72), [32]), ('int', 0)), b' obj')) + >>> p.trailer + (729445, (b'trailer', ('dict', {'ID': [('str', b"\xc7\x9c\x81't\x81X\xb5\xdf\x04Y\xf3\x9c\xf0-`"), ('str', b"\x1f\xd0\xd9d'\x7fR\x88\xed\xba`\xb5\x81\xb1\xf9\x10")], 'Root': ('ref', (('int', 126), ('int', 0))), 'Size': ('int', 127), 'Info': ('ref', (('int', 2), ('int', 0)))}))) """ def __init__(self, blob): @@ -600,6 +649,9 @@ class Pdf: self.xref_start = int(blob[sx:].split()[1]) # XXX there could be many sections self.xrefs = XrefSection(self, self.xref_start) + self.trailer_plumbing = self.parse(self.xrefs.end, drop_ws(b'trailer') + dictionary) + self.trailer = porcelainize(self, self.trailer_plumbing[1][1]) + self.catalog = self.trailer['Root'] def read(self, offset, size=64): return self.blob[offset:offset+size] @@ -607,11 +659,35 @@ class Pdf: def parse(self, offset, ex): return self.parser.do(offset, ex) + def get_indirect_obj(self, oid, generation=0): + "Returns a plumbing object." + result = self.parse(self.xrefs.offset_of(oid), indirect_obj) + offset, plumb = result + return plumb + + def dereference(self, plumb): + """Given a plumbing object, follow indirect object refs if necessary. + + Returns another plumbing object. Returns ('null', None) for + dangling (XXX this is broken) or circular links. 
+ + """ + seen = set() + while plumb[0] == 'ref': + if plumb in seen: + return 'null', None + seen.add(plumb) + _, ((_, oid), (_, gen)) = plumb + plumb = self.get_indirect_obj(oid, gen) + + return plumb + class XrefSection: def __init__(self, pdf, offset): self.pdf = pdf # XXX this is a subsection, bozo, there could be more than one self.offset, (self.first, self.size) = pdf.parse(offset, xref_header) + self.end = self.offset + 20 * self.size def __getitem__(self, oid): if not self.first <= oid < self.first + self.size: @@ -619,15 +695,100 @@ class XrefSection: return self.pdf.read(self.offset + 20 * (oid - self.first), size=20) def offset_of(self, oid): + # XXX check that it's n, not f (clause 7.5.4) return int(self[oid][:10]) + +class PorcelainDictionary: + def __init__(self, pdf, d): + self.pdf = pdf + self.contents = d + + def __repr__(self): + return '<< %s >>' % ' '.join('/%s %r' % (k, porcelainize(self.pdf, v)) for k, v in self.contents.items()) + + def keys(self): + return self.contents.keys() + + def __getitem__(self, key): + return porcelainize(self.pdf, self.pdf.dereference(self.contents[key])) + + def lstat(self, key): + return self.contents.get(key) + + def isa(self, typename): + return self['Type'] == PorcelainName(self.pdf, typename.encode('utf-8')) + + +class PorcelainArray: + def __init__(self, pdf, d): + self.pdf = pdf + self.contents = d + + def __repr__(self): + return '[%s]' % ' '.join(repr(porcelainize(self.pdf, x)) for x in self.contents) + + def __getitem__(self, i): + return porcelainize(self.pdf, self.pdf.dereference(self.contents[i])) + + def lstat(self, key): + # XXX porcelain ref? 
+        return self.contents[key]
+
+    def __len__(self):
+        return len(self.contents)
+
+
+class PorcelainName:
+    def __init__(self, pdf, name):
+        self.pdf = pdf
+        # XXX need to decode hex bytes
+        self.name = name.decode('utf-8')
+
+    def __repr__(self):
+        return '/%s' % self.name
+
+    def __hash__(self):
+        return hash(self.name) * hash(self.pdf)
+
+    def __eq__(self, other):
+        return (isinstance(other, PorcelainName)
+                and (self.pdf, self.name) == (other.pdf, other.name))
+
+class PorcelainRef:
+    def __init__(self, pdf, data):
+        self.pdf = pdf
+        self.data = data
+
+    def __repr__(self):
+        return '%r %r R' % (self.data[0][1], self.data[1][1])
+
+    def __call__(self):
+        # XXX this is unimpressive code and signals the need for a
+        # rethinking
+        return self.pdf.dereference(('ref', self.data))
+
+porcelain_classes = {
+    'dict': PorcelainDictionary,
+    'array': PorcelainArray,
+    'name': PorcelainName,
+    'ref': PorcelainRef,
+    'str': lambda pdf, s: s,
+    'int': lambda pdf, s: s,
+}
+
+def porcelainize(pdf, plumb):
+    "Add a porcelain façade to a plumbing object, which is just a tuple."
+    return porcelain_classes[plumb[0]](pdf, plumb[1])
+
+
 def read_pdf(filename):
     with open(filename, 'rb') as fo:
         return Pdf(fo.read())
 
 def all_pages(pdf, whine=lambda *args: None):
-    # XXX these structures aren't implemented yet
+    # XXX these structures don't quite work yet
    return page_descendants(pdf.catalog['Pages'], whine)
 
 def page_descendants(pages, whine=lambda *args: None):